From ae2e953fdca791270e80c08d6a830d9aa472a111 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Thu, 18 Jul 2019 11:22:14 -0500 Subject: powerpc/rtas: Unexport rtas_online_cpus_mask, rtas_offline_cpus_mask These aren't used by modular code, nor should they be. Fixes: 120496ac2d2d ("powerpc: Bring all threads online prior to migration/hibernation") Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190718162214.5694-1-nathanl@linux.ibm.com --- arch/powerpc/kernel/rtas.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 5faf0a64c92b..49159bb38949 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -922,13 +922,11 @@ int rtas_online_cpus_mask(cpumask_var_t cpus) return ret; } -EXPORT_SYMBOL(rtas_online_cpus_mask); int rtas_offline_cpus_mask(cpumask_var_t cpus) { return rtas_cpu_state_change_mask(DOWN, cpus); } -EXPORT_SYMBOL(rtas_offline_cpus_mask); int rtas_ibm_suspend_me(u64 handle) { -- cgit v1.2.3 From 0df3e42167caaf9f8c7b64de3da40a459979afe8 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 3 Jun 2019 15:11:58 -0700 Subject: PCI: rpaphp: Avoid a sometimes-uninitialized warning When building with -Wsometimes-uninitialized, clang warns: drivers/pci/hotplug/rpaphp_core.c:243:14: warning: variable 'fndit' is used uninitialized whenever 'for' loop exits because its condition is false [-Wsometimes-uninitialized] for (j = 0; j < entries; j++) { ^~~~~~~~~~~ drivers/pci/hotplug/rpaphp_core.c:256:6: note: uninitialized use occurs here if (fndit) ^~~~~ drivers/pci/hotplug/rpaphp_core.c:243:14: note: remove the condition if it is always true for (j = 0; j < entries; j++) { ^~~~~~~~~~~ drivers/pci/hotplug/rpaphp_core.c:233:14: note: initialize the variable 'fndit' to silence this warning int j, fndit; ^ = 0 fndit is only used to gate a sprintf call, which can be moved into the loop to simplify the code and eliminate the local variable, which will fix this warning. Fixes: 2fcf3ae508c2 ("hotplug/drc-info: Add code to search ibm,drc-info property") Suggested-by: Nick Desaulniers Signed-off-by: Nathan Chancellor Acked-by: Tyrel Datwyler Acked-by: Joel Savitz Signed-off-by: Michael Ellerman Link: https://github.com/ClangBuiltLinux/linux/issues/504 Link: https://lore.kernel.org/r/20190603221157.58502-1-natechancellor@gmail.com --- drivers/pci/hotplug/rpaphp_core.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/drivers/pci/hotplug/rpaphp_core.c b/drivers/pci/hotplug/rpaphp_core.c index bcd5d357ca23..c3899ee1db99 100644 --- a/drivers/pci/hotplug/rpaphp_core.c +++ b/drivers/pci/hotplug/rpaphp_core.c @@ -230,7 +230,7 @@ static int rpaphp_check_drc_props_v2(struct device_node *dn, char *drc_name, struct of_drc_info drc; const __be32 *value; char cell_drc_name[MAX_DRC_NAME_LEN]; - int j, fndit; + int j; info = of_find_property(dn->parent, "ibm,drc-info", NULL); if (info == NULL) @@ -245,17 +245,13 @@ static int rpaphp_check_drc_props_v2(struct device_node *dn, char *drc_name, /* Should now know end of current entry */ - if (my_index > drc.last_drc_index) - continue; - - fndit = 1; - break; + /* Found it */ + if (my_index <= drc.last_drc_index) { + sprintf(cell_drc_name, "%s%d", drc.drc_name_prefix, + my_index); + break; + } } - /* Found it */ - - if (fndit) - sprintf(cell_drc_name, "%s%d", drc.drc_name_prefix, - my_index); if (((drc_name == NULL) || (drc_name && !strcmp(drc_name, cell_drc_name))) && -- cgit v1.2.3 From 08a456aa643776757e07adfdebe7f7681117d144 Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Thu, 9 May 2019 15:11:15 +1000 Subject: powerpc/powernv: Move SCOM access code into powernv platform The powernv platform is the only one that directly accesses SCOMs. Move the support code to platforms/powernv, and get rid of the PPC_SCOM Kconfig option, as SCOM support is always selected when compiling for powernv. This also means that the Kconfig item for CONFIG_SCOM_DEBUGFS will show up in menuconfig in the platform menu, rather than at the root, which is a much better location. Signed-off-by: Andrew Donnellan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190509051119.7694-1-ajd@linux.ibm.com --- arch/powerpc/include/asm/scom.h | 154 ------------------- arch/powerpc/platforms/powernv/Kconfig | 5 +- arch/powerpc/platforms/powernv/Makefile | 2 +- arch/powerpc/platforms/powernv/opal-xscom.c | 3 +- arch/powerpc/platforms/powernv/scom.c | 224 ++++++++++++++++++++++++++++ arch/powerpc/platforms/powernv/scom.h | 147 ++++++++++++++++++ arch/powerpc/sysdev/Kconfig | 7 - arch/powerpc/sysdev/Makefile | 2 - arch/powerpc/sysdev/scom.c | 223 --------------------------- 9 files changed, 378 insertions(+), 389 deletions(-) delete mode 100644 arch/powerpc/include/asm/scom.h create mode 100644 arch/powerpc/platforms/powernv/scom.c create mode 100644 arch/powerpc/platforms/powernv/scom.h delete mode 100644 arch/powerpc/sysdev/scom.c diff --git a/arch/powerpc/include/asm/scom.h b/arch/powerpc/include/asm/scom.h deleted file mode 100644 index 08c44396e54a..000000000000 --- a/arch/powerpc/include/asm/scom.h +++ /dev/null @@ -1,154 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright 2010 Benjamin Herrenschmidt, IBM Corp - * - * and David Gibson, IBM Corporation. - */ - -#ifndef _ASM_POWERPC_SCOM_H -#define _ASM_POWERPC_SCOM_H - -#ifdef __KERNEL__ -#ifndef __ASSEMBLY__ -#ifdef CONFIG_PPC_SCOM - -/* - * The SCOM bus is a sideband bus used for accessing various internal - * registers of the processor or the chipset. The implementation details - * differ between processors and platforms, and the access method as - * well. - * - * This API allows to "map" ranges of SCOM register numbers associated - * with a given SCOM controller. The later must be represented by a - * device node, though some implementations might support NULL if there - * is no possible ambiguity - * - * Then, scom_read/scom_write can be used to accesses registers inside - * that range. The argument passed is a register number relative to - * the beginning of the range mapped. - */ - -typedef void *scom_map_t; - -/* Value for an invalid SCOM map */ -#define SCOM_MAP_INVALID (NULL) - -/* The scom_controller data structure is what the platform passes - * to the core code in scom_init, it provides the actual implementation - * of all the SCOM functions - */ -struct scom_controller { - scom_map_t (*map)(struct device_node *ctrl_dev, u64 reg, u64 count); - void (*unmap)(scom_map_t map); - - int (*read)(scom_map_t map, u64 reg, u64 *value); - int (*write)(scom_map_t map, u64 reg, u64 value); -}; - -extern const struct scom_controller *scom_controller; - -/** - * scom_init - Initialize the SCOM backend, called by the platform - * @controller: The platform SCOM controller - */ -static inline void scom_init(const struct scom_controller *controller) -{ - scom_controller = controller; -} - -/** - * scom_map_ok - Test is a SCOM mapping is successful - * @map: The result of scom_map to test - */ -static inline int scom_map_ok(scom_map_t map) -{ - return map != SCOM_MAP_INVALID; -} - -/** - * scom_map - Map a block of SCOM registers - * @ctrl_dev: Device node of the SCOM controller - * some implementations allow NULL here - * @reg: first SCOM register to map - * @count: Number of SCOM registers to map - */ - -static inline scom_map_t scom_map(struct device_node *ctrl_dev, - u64 reg, u64 count) -{ - return scom_controller->map(ctrl_dev, reg, count); -} - -/** - * scom_find_parent - Find the SCOM controller for a device - * @dev: OF node of the device - * - * This is not meant for general usage, but in combination with - * scom_map() allows to map registers not represented by the - * device own scom-reg property. Useful for applying HW workarounds - * on things not properly represented in the device-tree for example. - */ -struct device_node *scom_find_parent(struct device_node *dev); - - -/** - * scom_map_device - Map a device's block of SCOM registers - * @dev: OF node of the device - * @index: Register bank index (index in "scom-reg" property) - * - * This function will use the device-tree binding for SCOM which - * is to follow "scom-parent" properties until it finds a node with - * a "scom-controller" property to find the controller. It will then - * use the "scom-reg" property which is made of reg/count pairs, - * each of them having a size defined by the controller's #scom-cells - * property - */ -extern scom_map_t scom_map_device(struct device_node *dev, int index); - - -/** - * scom_unmap - Unmap a block of SCOM registers - * @map: Result of scom_map is to be unmapped - */ -static inline void scom_unmap(scom_map_t map) -{ - if (scom_map_ok(map)) - scom_controller->unmap(map); -} - -/** - * scom_read - Read a SCOM register - * @map: Result of scom_map - * @reg: Register index within that map - * @value: Updated with the value read - * - * Returns 0 (success) or a negative error code - */ -static inline int scom_read(scom_map_t map, u64 reg, u64 *value) -{ - int rc; - - rc = scom_controller->read(map, reg, value); - if (rc) - *value = 0xfffffffffffffffful; - return rc; -} - -/** - * scom_write - Write to a SCOM register - * @map: Result of scom_map - * @reg: Register index within that map - * @value: Value to write - * - * Returns 0 (success) or a negative error code - */ -static inline int scom_write(scom_map_t map, u64 reg, u64 value) -{ - return scom_controller->write(map, reg, value); -} - - -#endif /* CONFIG_PPC_SCOM */ -#endif /* __ASSEMBLY__ */ -#endif /* __KERNEL__ */ -#endif /* _ASM_POWERPC_SCOM_H */ diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 850eee860cf2..938803eab0ad 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -12,7 +12,6 @@ config PPC_POWERNV select EPAPR_BOOT select PPC_INDIRECT_PIO select PPC_UDBG_16550 - select PPC_SCOM select ARCH_RANDOM select CPU_FREQ select PPC_DOORBELL @@ -47,3 +46,7 @@ config PPC_VAS VAS adapters are found in POWER9 based systems. If unsure, say N. + +config SCOM_DEBUGFS + bool "Expose SCOM controllers via debugfs" + depends on DEBUG_FS diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index da2e99efbd04..4b1644150135 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -4,12 +4,12 @@ obj-y += idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o +obj-y += opal-xscom.o scom.o obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o obj-$(CONFIG_CXL_BASE) += pci-cxl.o obj-$(CONFIG_EEH) += eeh-powernv.o -obj-$(CONFIG_PPC_SCOM) += opal-xscom.o obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o obj-$(CONFIG_OPAL_PRD) += opal-prd.o obj-$(CONFIG_PERF_EVENTS) += opal-imc.o diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c index 66430eebe869..3f48ee69928c 100644 --- a/arch/powerpc/platforms/powernv/opal-xscom.c +++ b/arch/powerpc/platforms/powernv/opal-xscom.c @@ -14,7 +14,8 @@ #include #include #include -#include + +#include "scom.h" /* * We could probably fit that inside the scom_map_t diff --git a/arch/powerpc/platforms/powernv/scom.c b/arch/powerpc/platforms/powernv/scom.c new file mode 100644 index 000000000000..74664e9496b8 --- /dev/null +++ b/arch/powerpc/platforms/powernv/scom.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2010 Benjamin Herrenschmidt, IBM Corp + * + * and David Gibson, IBM Corporation. + */ + +#include +#include +#include +#include +#include +#include + +#include "scom.h" + +const struct scom_controller *scom_controller; +EXPORT_SYMBOL_GPL(scom_controller); + +struct device_node *scom_find_parent(struct device_node *node) +{ + struct device_node *par, *tmp; + const u32 *p; + + for (par = of_node_get(node); par;) { + if (of_get_property(par, "scom-controller", NULL)) + break; + p = of_get_property(par, "scom-parent", NULL); + tmp = par; + if (p == NULL) + par = of_get_parent(par); + else + par = of_find_node_by_phandle(*p); + of_node_put(tmp); + } + return par; +} +EXPORT_SYMBOL_GPL(scom_find_parent); + +scom_map_t scom_map_device(struct device_node *dev, int index) +{ + struct device_node *parent; + unsigned int cells, size; + const __be32 *prop, *sprop; + u64 reg, cnt; + scom_map_t ret; + + parent = scom_find_parent(dev); + + if (parent == NULL) + return NULL; + + /* + * We support "scom-reg" properties for adding scom registers + * to a random device-tree node with an explicit scom-parent + * + * We also support the simple "reg" property if the device is + * a direct child of a scom controller. + * + * In case both exist, "scom-reg" takes precedence. + */ + prop = of_get_property(dev, "scom-reg", &size); + sprop = of_get_property(parent, "#scom-cells", NULL); + if (!prop && parent == dev->parent) { + prop = of_get_property(dev, "reg", &size); + sprop = of_get_property(parent, "#address-cells", NULL); + } + if (!prop) + return NULL; + cells = sprop ? be32_to_cpup(sprop) : 1; + size >>= 2; + + if (index >= (size / (2*cells))) + return NULL; + + reg = of_read_number(&prop[index * cells * 2], cells); + cnt = of_read_number(&prop[index * cells * 2 + cells], cells); + + ret = scom_map(parent, reg, cnt); + of_node_put(parent); + + return ret; +} +EXPORT_SYMBOL_GPL(scom_map_device); + +#ifdef CONFIG_SCOM_DEBUGFS +struct scom_debug_entry { + struct device_node *dn; + struct debugfs_blob_wrapper path; + char name[16]; +}; + +static ssize_t scom_debug_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct scom_debug_entry *ent = filp->private_data; + u64 __user *ubuf64 = (u64 __user *)ubuf; + loff_t off = *ppos; + ssize_t done = 0; + u64 reg, reg_cnt, val; + scom_map_t map; + int rc; + + if (off < 0 || (off & 7) || (count & 7)) + return -EINVAL; + reg = off >> 3; + reg_cnt = count >> 3; + + map = scom_map(ent->dn, reg, reg_cnt); + if (!scom_map_ok(map)) + return -ENXIO; + + for (reg = 0; reg < reg_cnt; reg++) { + rc = scom_read(map, reg, &val); + if (!rc) + rc = put_user(val, ubuf64); + if (rc) { + if (!done) + done = rc; + break; + } + ubuf64++; + *ppos += 8; + done += 8; + } + scom_unmap(map); + return done; +} + +static ssize_t scom_debug_write(struct file* filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct scom_debug_entry *ent = filp->private_data; + u64 __user *ubuf64 = (u64 __user *)ubuf; + loff_t off = *ppos; + ssize_t done = 0; + u64 reg, reg_cnt, val; + scom_map_t map; + int rc; + + if (off < 0 || (off & 7) || (count & 7)) + return -EINVAL; + reg = off >> 3; + reg_cnt = count >> 3; + + map = scom_map(ent->dn, reg, reg_cnt); + if (!scom_map_ok(map)) + return -ENXIO; + + for (reg = 0; reg < reg_cnt; reg++) { + rc = get_user(val, ubuf64); + if (!rc) + rc = scom_write(map, reg, val); + if (rc) { + if (!done) + done = rc; + break; + } + ubuf64++; + done += 8; + } + scom_unmap(map); + return done; +} + +static const struct file_operations scom_debug_fops = { + .read = scom_debug_read, + .write = scom_debug_write, + .open = simple_open, + .llseek = default_llseek, +}; + +static int scom_debug_init_one(struct dentry *root, struct device_node *dn, + int i) +{ + struct scom_debug_entry *ent; + struct dentry *dir; + + ent = kzalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + + ent->dn = of_node_get(dn); + snprintf(ent->name, 16, "%08x", i); + ent->path.data = (void*)kasprintf(GFP_KERNEL, "%pOF", dn); + ent->path.size = strlen((char *)ent->path.data); + + dir = debugfs_create_dir(ent->name, root); + if (!dir) { + of_node_put(dn); + kfree(ent->path.data); + kfree(ent); + return -1; + } + + debugfs_create_blob("devspec", 0400, dir, &ent->path); + debugfs_create_file("access", 0600, dir, ent, &scom_debug_fops); + + return 0; +} + +static int scom_debug_init(void) +{ + struct device_node *dn; + struct dentry *root; + int i, rc; + + root = debugfs_create_dir("scom", powerpc_debugfs_root); + if (!root) + return -1; + + i = rc = 0; + for_each_node_with_property(dn, "scom-controller") { + int id = of_get_ibm_chip_id(dn); + if (id == -1) + id = i; + rc |= scom_debug_init_one(root, dn, id); + i++; + } + + return rc; +} +device_initcall(scom_debug_init); +#endif /* CONFIG_SCOM_DEBUGFS */ diff --git a/arch/powerpc/platforms/powernv/scom.h b/arch/powerpc/platforms/powernv/scom.h new file mode 100644 index 000000000000..b5c33e30d39c --- /dev/null +++ b/arch/powerpc/platforms/powernv/scom.h @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2010 Benjamin Herrenschmidt, IBM Corp + * + * and David Gibson, IBM Corporation. + */ + +#ifndef _SCOM_H +#define _SCOM_H + +/* + * The SCOM bus is a sideband bus used for accessing various internal + * registers of the processor or the chipset. The implementation details + * differ between processors and platforms, and the access method as + * well. + * + * This API allows to "map" ranges of SCOM register numbers associated + * with a given SCOM controller. The later must be represented by a + * device node, though some implementations might support NULL if there + * is no possible ambiguity + * + * Then, scom_read/scom_write can be used to accesses registers inside + * that range. The argument passed is a register number relative to + * the beginning of the range mapped. + */ + +typedef void *scom_map_t; + +/* Value for an invalid SCOM map */ +#define SCOM_MAP_INVALID (NULL) + +/* The scom_controller data structure is what the platform passes + * to the core code in scom_init, it provides the actual implementation + * of all the SCOM functions + */ +struct scom_controller { + scom_map_t (*map)(struct device_node *ctrl_dev, u64 reg, u64 count); + void (*unmap)(scom_map_t map); + + int (*read)(scom_map_t map, u64 reg, u64 *value); + int (*write)(scom_map_t map, u64 reg, u64 value); +}; + +extern const struct scom_controller *scom_controller; + +/** + * scom_init - Initialize the SCOM backend, called by the platform + * @controller: The platform SCOM controller + */ +static inline void scom_init(const struct scom_controller *controller) +{ + scom_controller = controller; +} + +/** + * scom_map_ok - Test is a SCOM mapping is successful + * @map: The result of scom_map to test + */ +static inline int scom_map_ok(scom_map_t map) +{ + return map != SCOM_MAP_INVALID; +} + +/** + * scom_map - Map a block of SCOM registers + * @ctrl_dev: Device node of the SCOM controller + * some implementations allow NULL here + * @reg: first SCOM register to map + * @count: Number of SCOM registers to map + */ + +static inline scom_map_t scom_map(struct device_node *ctrl_dev, + u64 reg, u64 count) +{ + return scom_controller->map(ctrl_dev, reg, count); +} + +/** + * scom_find_parent - Find the SCOM controller for a device + * @dev: OF node of the device + * + * This is not meant for general usage, but in combination with + * scom_map() allows to map registers not represented by the + * device own scom-reg property. Useful for applying HW workarounds + * on things not properly represented in the device-tree for example. + */ +struct device_node *scom_find_parent(struct device_node *dev); + + +/** + * scom_map_device - Map a device's block of SCOM registers + * @dev: OF node of the device + * @index: Register bank index (index in "scom-reg" property) + * + * This function will use the device-tree binding for SCOM which + * is to follow "scom-parent" properties until it finds a node with + * a "scom-controller" property to find the controller. It will then + * use the "scom-reg" property which is made of reg/count pairs, + * each of them having a size defined by the controller's #scom-cells + * property + */ +extern scom_map_t scom_map_device(struct device_node *dev, int index); + + +/** + * scom_unmap - Unmap a block of SCOM registers + * @map: Result of scom_map is to be unmapped + */ +static inline void scom_unmap(scom_map_t map) +{ + if (scom_map_ok(map)) + scom_controller->unmap(map); +} + +/** + * scom_read - Read a SCOM register + * @map: Result of scom_map + * @reg: Register index within that map + * @value: Updated with the value read + * + * Returns 0 (success) or a negative error code + */ +static inline int scom_read(scom_map_t map, u64 reg, u64 *value) +{ + int rc; + + rc = scom_controller->read(map, reg, value); + if (rc) + *value = 0xfffffffffffffffful; + return rc; +} + +/** + * scom_write - Write to a SCOM register + * @map: Result of scom_map + * @reg: Register index within that map + * @value: Value to write + * + * Returns 0 (success) or a negative error code + */ +static inline int scom_write(scom_map_t map, u64 reg, u64 value) +{ + return scom_controller->write(map, reg, value); +} + + +#endif /* _SCOM_H */ diff --git a/arch/powerpc/sysdev/Kconfig b/arch/powerpc/sysdev/Kconfig index d23288c4abf6..9ebcc1337560 100644 --- a/arch/powerpc/sysdev/Kconfig +++ b/arch/powerpc/sysdev/Kconfig @@ -28,13 +28,6 @@ config PPC_MSI_BITMAP source "arch/powerpc/sysdev/xics/Kconfig" source "arch/powerpc/sysdev/xive/Kconfig" -config PPC_SCOM - bool - -config SCOM_DEBUGFS - bool "Expose SCOM controllers via debugfs" - depends on PPC_SCOM && DEBUG_FS - config GE_FPGA bool diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile index 9d73dfddf060..603b3c656d19 100644 --- a/arch/powerpc/sysdev/Makefile +++ b/arch/powerpc/sysdev/Makefile @@ -49,8 +49,6 @@ ifdef CONFIG_SUSPEND obj-$(CONFIG_PPC_BOOK3S_32) += 6xx-suspend.o endif -obj-$(CONFIG_PPC_SCOM) += scom.o - obj-$(CONFIG_PPC_EARLY_DEBUG_MEMCONS) += udbg_memcons.o obj-$(CONFIG_PPC_XICS) += xics/ diff --git a/arch/powerpc/sysdev/scom.c b/arch/powerpc/sysdev/scom.c deleted file mode 100644 index 94e885bf3aee..000000000000 --- a/arch/powerpc/sysdev/scom.c +++ /dev/null @@ -1,223 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright 2010 Benjamin Herrenschmidt, IBM Corp - * - * and David Gibson, IBM Corporation. - */ - -#include -#include -#include -#include -#include -#include -#include - -const struct scom_controller *scom_controller; -EXPORT_SYMBOL_GPL(scom_controller); - -struct device_node *scom_find_parent(struct device_node *node) -{ - struct device_node *par, *tmp; - const u32 *p; - - for (par = of_node_get(node); par;) { - if (of_get_property(par, "scom-controller", NULL)) - break; - p = of_get_property(par, "scom-parent", NULL); - tmp = par; - if (p == NULL) - par = of_get_parent(par); - else - par = of_find_node_by_phandle(*p); - of_node_put(tmp); - } - return par; -} -EXPORT_SYMBOL_GPL(scom_find_parent); - -scom_map_t scom_map_device(struct device_node *dev, int index) -{ - struct device_node *parent; - unsigned int cells, size; - const __be32 *prop, *sprop; - u64 reg, cnt; - scom_map_t ret; - - parent = scom_find_parent(dev); - - if (parent == NULL) - return NULL; - - /* - * We support "scom-reg" properties for adding scom registers - * to a random device-tree node with an explicit scom-parent - * - * We also support the simple "reg" property if the device is - * a direct child of a scom controller. - * - * In case both exist, "scom-reg" takes precedence. - */ - prop = of_get_property(dev, "scom-reg", &size); - sprop = of_get_property(parent, "#scom-cells", NULL); - if (!prop && parent == dev->parent) { - prop = of_get_property(dev, "reg", &size); - sprop = of_get_property(parent, "#address-cells", NULL); - } - if (!prop) - return NULL; - cells = sprop ? be32_to_cpup(sprop) : 1; - size >>= 2; - - if (index >= (size / (2*cells))) - return NULL; - - reg = of_read_number(&prop[index * cells * 2], cells); - cnt = of_read_number(&prop[index * cells * 2 + cells], cells); - - ret = scom_map(parent, reg, cnt); - of_node_put(parent); - - return ret; -} -EXPORT_SYMBOL_GPL(scom_map_device); - -#ifdef CONFIG_SCOM_DEBUGFS -struct scom_debug_entry { - struct device_node *dn; - struct debugfs_blob_wrapper path; - char name[16]; -}; - -static ssize_t scom_debug_read(struct file *filp, char __user *ubuf, - size_t count, loff_t *ppos) -{ - struct scom_debug_entry *ent = filp->private_data; - u64 __user *ubuf64 = (u64 __user *)ubuf; - loff_t off = *ppos; - ssize_t done = 0; - u64 reg, reg_cnt, val; - scom_map_t map; - int rc; - - if (off < 0 || (off & 7) || (count & 7)) - return -EINVAL; - reg = off >> 3; - reg_cnt = count >> 3; - - map = scom_map(ent->dn, reg, reg_cnt); - if (!scom_map_ok(map)) - return -ENXIO; - - for (reg = 0; reg < reg_cnt; reg++) { - rc = scom_read(map, reg, &val); - if (!rc) - rc = put_user(val, ubuf64); - if (rc) { - if (!done) - done = rc; - break; - } - ubuf64++; - *ppos += 8; - done += 8; - } - scom_unmap(map); - return done; -} - -static ssize_t scom_debug_write(struct file* filp, const char __user *ubuf, - size_t count, loff_t *ppos) -{ - struct scom_debug_entry *ent = filp->private_data; - u64 __user *ubuf64 = (u64 __user *)ubuf; - loff_t off = *ppos; - ssize_t done = 0; - u64 reg, reg_cnt, val; - scom_map_t map; - int rc; - - if (off < 0 || (off & 7) || (count & 7)) - return -EINVAL; - reg = off >> 3; - reg_cnt = count >> 3; - - map = scom_map(ent->dn, reg, reg_cnt); - if (!scom_map_ok(map)) - return -ENXIO; - - for (reg = 0; reg < reg_cnt; reg++) { - rc = get_user(val, ubuf64); - if (!rc) - rc = scom_write(map, reg, val); - if (rc) { - if (!done) - done = rc; - break; - } - ubuf64++; - done += 8; - } - scom_unmap(map); - return done; -} - -static const struct file_operations scom_debug_fops = { - .read = scom_debug_read, - .write = scom_debug_write, - .open = simple_open, - .llseek = default_llseek, -}; - -static int scom_debug_init_one(struct dentry *root, struct device_node *dn, - int i) -{ - struct scom_debug_entry *ent; - struct dentry *dir; - - ent = kzalloc(sizeof(*ent), GFP_KERNEL); - if (!ent) - return -ENOMEM; - - ent->dn = of_node_get(dn); - snprintf(ent->name, 16, "%08x", i); - ent->path.data = (void*)kasprintf(GFP_KERNEL, "%pOF", dn); - ent->path.size = strlen((char *)ent->path.data); - - dir = debugfs_create_dir(ent->name, root); - if (!dir) { - of_node_put(dn); - kfree(ent->path.data); - kfree(ent); - return -1; - } - - debugfs_create_blob("devspec", 0400, dir, &ent->path); - debugfs_create_file("access", 0600, dir, ent, &scom_debug_fops); - - return 0; -} - -static int scom_debug_init(void) -{ - struct device_node *dn; - struct dentry *root; - int i, rc; - - root = debugfs_create_dir("scom", powerpc_debugfs_root); - if (!root) - return -1; - - i = rc = 0; - for_each_node_with_property(dn, "scom-controller") { - int id = of_get_ibm_chip_id(dn); - if (id == -1) - id = i; - rc |= scom_debug_init_one(root, dn, id); - i++; - } - - return rc; -} -device_initcall(scom_debug_init); -#endif /* CONFIG_SCOM_DEBUGFS */ -- cgit v1.2.3 From 9edc6b919d9fbec694dc37ea14f9587be1b354ec Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Thu, 9 May 2019 15:11:16 +1000 Subject: powerpc/powernv: Remove dead SCOM access code Nothing is using scom_map_device() or scom_find_parent(). Remove them. Also don't export scom_controller, there are no other users of it. Signed-off-by: Andrew Donnellan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190509051119.7694-2-ajd@linux.ibm.com --- arch/powerpc/platforms/powernv/scom.c | 67 ----------------------------------- arch/powerpc/platforms/powernv/scom.h | 27 -------------- 2 files changed, 94 deletions(-) diff --git a/arch/powerpc/platforms/powernv/scom.c b/arch/powerpc/platforms/powernv/scom.c index 74664e9496b8..acd6968d9277 100644 --- a/arch/powerpc/platforms/powernv/scom.c +++ b/arch/powerpc/platforms/powernv/scom.c @@ -15,73 +15,6 @@ #include "scom.h" const struct scom_controller *scom_controller; -EXPORT_SYMBOL_GPL(scom_controller); - -struct device_node *scom_find_parent(struct device_node *node) -{ - struct device_node *par, *tmp; - const u32 *p; - - for (par = of_node_get(node); par;) { - if (of_get_property(par, "scom-controller", NULL)) - break; - p = of_get_property(par, "scom-parent", NULL); - tmp = par; - if (p == NULL) - par = of_get_parent(par); - else - par = of_find_node_by_phandle(*p); - of_node_put(tmp); - } - return par; -} -EXPORT_SYMBOL_GPL(scom_find_parent); - -scom_map_t scom_map_device(struct device_node *dev, int index) -{ - struct device_node *parent; - unsigned int cells, size; - const __be32 *prop, *sprop; - u64 reg, cnt; - scom_map_t ret; - - parent = scom_find_parent(dev); - - if (parent == NULL) - return NULL; - - /* - * We support "scom-reg" properties for adding scom registers - * to a random device-tree node with an explicit scom-parent - * - * We also support the simple "reg" property if the device is - * a direct child of a scom controller. - * - * In case both exist, "scom-reg" takes precedence. - */ - prop = of_get_property(dev, "scom-reg", &size); - sprop = of_get_property(parent, "#scom-cells", NULL); - if (!prop && parent == dev->parent) { - prop = of_get_property(dev, "reg", &size); - sprop = of_get_property(parent, "#address-cells", NULL); - } - if (!prop) - return NULL; - cells = sprop ? be32_to_cpup(sprop) : 1; - size >>= 2; - - if (index >= (size / (2*cells))) - return NULL; - - reg = of_read_number(&prop[index * cells * 2], cells); - cnt = of_read_number(&prop[index * cells * 2 + cells], cells); - - ret = scom_map(parent, reg, cnt); - of_node_put(parent); - - return ret; -} -EXPORT_SYMBOL_GPL(scom_map_device); #ifdef CONFIG_SCOM_DEBUGFS struct scom_debug_entry { diff --git a/arch/powerpc/platforms/powernv/scom.h b/arch/powerpc/platforms/powernv/scom.h index b5c33e30d39c..9827a66ef147 100644 --- a/arch/powerpc/platforms/powernv/scom.h +++ b/arch/powerpc/platforms/powernv/scom.h @@ -75,33 +75,6 @@ static inline scom_map_t scom_map(struct device_node *ctrl_dev, return scom_controller->map(ctrl_dev, reg, count); } -/** - * scom_find_parent - Find the SCOM controller for a device - * @dev: OF node of the device - * - * This is not meant for general usage, but in combination with - * scom_map() allows to map registers not represented by the - * device own scom-reg property. Useful for applying HW workarounds - * on things not properly represented in the device-tree for example. - */ -struct device_node *scom_find_parent(struct device_node *dev); - - -/** - * scom_map_device - Map a device's block of SCOM registers - * @dev: OF node of the device - * @index: Register bank index (index in "scom-reg" property) - * - * This function will use the device-tree binding for SCOM which - * is to follow "scom-parent" properties until it finds a node with - * a "scom-controller" property to find the controller. It will then - * use the "scom-reg" property which is made of reg/count pairs, - * each of them having a size defined by the controller's #scom-cells - * property - */ -extern scom_map_t scom_map_device(struct device_node *dev, int index); - - /** * scom_unmap - Unmap a block of SCOM registers * @map: Result of scom_map is to be unmapped -- cgit v1.2.3 From bfd2f0d49aef8abfe6bf58f12719f39912993cc6 Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Thu, 9 May 2019 15:11:17 +1000 Subject: powerpc/powernv: Get rid of old scom_controller abstraction Once upon a time, the SCOM access code was used by the WSP platform as well as powernv. Thus it made sense to have a generic SCOM access interface to abstract between different platforms. Now that it's just powernv, with no other platforms currently on the horizon, let's rip out scom_controller and make everything much simpler and more direct. While we're here, fix up the comment block at the top. Signed-off-by: Andrew Donnellan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190509051119.7694-3-ajd@linux.ibm.com --- arch/powerpc/platforms/powernv/Makefile | 2 +- arch/powerpc/platforms/powernv/opal-xscom.c | 214 ++++++++++++++++++---------- arch/powerpc/platforms/powernv/scom.c | 157 -------------------- arch/powerpc/platforms/powernv/scom.h | 120 ---------------- 4 files changed, 142 insertions(+), 351 deletions(-) delete mode 100644 arch/powerpc/platforms/powernv/scom.c delete mode 100644 arch/powerpc/platforms/powernv/scom.h diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 4b1644150135..69a3aefa905b 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -4,7 +4,6 @@ obj-y += idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o -obj-y += opal-xscom.o scom.o obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o @@ -16,3 +15,4 @@ obj-$(CONFIG_PERF_EVENTS) += opal-imc.o obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o obj-$(CONFIG_OCXL_BASE) += ocxl.o +obj-$(CONFIG_SCOM_DEBUGFS) += opal-xscom.o diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c index 3f48ee69928c..cb172af2f53a 100644 --- a/arch/powerpc/platforms/powernv/opal-xscom.c +++ b/arch/powerpc/platforms/powernv/opal-xscom.c @@ -1,7 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * PowerNV LPC bus handling. + * PowerNV SCOM bus debugfs interface * + * Copyright 2010 Benjamin Herrenschmidt, IBM Corp + * + * and David Gibson, IBM Corporation. * Copyright 2013 IBM Corp. */ @@ -10,63 +13,13 @@ #include #include #include +#include #include #include #include - -#include "scom.h" - -/* - * We could probably fit that inside the scom_map_t - * which is a void* after all but it's really too ugly - * so let's kmalloc it for now - */ -struct opal_scom_map { - uint32_t chip; - uint64_t addr; -}; - -static scom_map_t opal_scom_map(struct device_node *dev, u64 reg, u64 count) -{ - struct opal_scom_map *m; - const __be32 *gcid; - - if (!of_get_property(dev, "scom-controller", NULL)) { - pr_err("%s: device %pOF is not a SCOM controller\n", - __func__, dev); - return SCOM_MAP_INVALID; - } - gcid = of_get_property(dev, "ibm,chip-id", NULL); - if (!gcid) { - pr_err("%s: device %pOF has no ibm,chip-id\n", - __func__, dev); - return SCOM_MAP_INVALID; - } - m = kmalloc(sizeof(*m), GFP_KERNEL); - if (!m) - return NULL; - m->chip = be32_to_cpup(gcid); - m->addr = reg; - - return (scom_map_t)m; -} - -static void opal_scom_unmap(scom_map_t map) -{ - kfree(map); -} - -static int opal_xscom_err_xlate(int64_t rc) -{ - switch(rc) { - case 0: - return 0; - /* Add more translations if necessary */ - default: - return -EIO; - } -} +#include +#include static u64 opal_scom_unmangle(u64 addr) { @@ -99,39 +52,154 @@ static u64 opal_scom_unmangle(u64 addr) return addr; } -static int opal_scom_read(scom_map_t map, u64 reg, u64 *value) +static int opal_scom_read(uint32_t chip, uint64_t addr, u64 reg, u64 *value) { - struct opal_scom_map *m = map; int64_t rc; __be64 v; - reg = opal_scom_unmangle(m->addr + reg); - rc = opal_xscom_read(m->chip, reg, (__be64 *)__pa(&v)); + reg = opal_scom_unmangle(addr + reg); + rc = opal_xscom_read(chip, reg, (__be64 *)__pa(&v)); + if (rc) { + *value = 0xfffffffffffffffful; + return -EIO; + } *value = be64_to_cpu(v); - return opal_xscom_err_xlate(rc); + return 0; } -static int opal_scom_write(scom_map_t map, u64 reg, u64 value) +static int opal_scom_write(uint32_t chip, uint64_t addr, u64 reg, u64 value) { - struct opal_scom_map *m = map; int64_t rc; - reg = opal_scom_unmangle(m->addr + reg); - rc = opal_xscom_write(m->chip, reg, value); - return opal_xscom_err_xlate(rc); + reg = opal_scom_unmangle(addr + reg); + rc = opal_xscom_write(chip, reg, value); + if (rc) + return -EIO; + return 0; +} + +struct scom_debug_entry { + u32 chip; + struct debugfs_blob_wrapper path; + char name[16]; +}; + +static ssize_t scom_debug_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct scom_debug_entry *ent = filp->private_data; + u64 __user *ubuf64 = (u64 __user *)ubuf; + loff_t off = *ppos; + ssize_t done = 0; + u64 reg, reg_base, reg_cnt, val; + int rc; + + if (off < 0 || (off & 7) || (count & 7)) + return -EINVAL; + reg_base = off >> 3; + reg_cnt = count >> 3; + + for (reg = 0; reg < reg_cnt; reg++) { + rc = opal_scom_read(ent->chip, reg_base, reg, &val); + if (!rc) + rc = put_user(val, ubuf64); + if (rc) { + if (!done) + done = rc; + break; + } + ubuf64++; + *ppos += 8; + done += 8; + } + return done; +} + +static ssize_t scom_debug_write(struct file* filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct scom_debug_entry *ent = filp->private_data; + u64 __user *ubuf64 = (u64 __user *)ubuf; + loff_t off = *ppos; + ssize_t done = 0; + u64 reg, reg_base, reg_cnt, val; + int rc; + + if (off < 0 || (off & 7) || (count & 7)) + return -EINVAL; + reg_base = off >> 3; + reg_cnt = count >> 3; + + for (reg = 0; reg < reg_cnt; reg++) { + rc = get_user(val, ubuf64); + if (!rc) + rc = opal_scom_write(ent->chip, reg_base, reg, val); + if (rc) { + if (!done) + done = rc; + break; + } + ubuf64++; + done += 8; + } + return done; } -static const struct scom_controller opal_scom_controller = { - .map = opal_scom_map, - .unmap = opal_scom_unmap, - .read = opal_scom_read, - .write = opal_scom_write +static const struct file_operations scom_debug_fops = { + .read = scom_debug_read, + .write = scom_debug_write, + .open = simple_open, + .llseek = default_llseek, }; -static int opal_xscom_init(void) +static int scom_debug_init_one(struct dentry *root, struct device_node *dn, + int chip) { - if (firmware_has_feature(FW_FEATURE_OPAL)) - scom_init(&opal_scom_controller); + struct scom_debug_entry *ent; + struct dentry *dir; + + ent = kzalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + + ent->chip = chip; + snprintf(ent->name, 16, "%08x", chip); + ent->path.data = (void*)kasprintf(GFP_KERNEL, "%pOF", dn); + ent->path.size = strlen((char *)ent->path.data); + + dir = debugfs_create_dir(ent->name, root); + if (!dir) { + kfree(ent->path.data); + kfree(ent); + return -1; + } + + debugfs_create_blob("devspec", 0400, dir, &ent->path); + debugfs_create_file("access", 0600, dir, ent, &scom_debug_fops); + return 0; } -machine_arch_initcall(powernv, opal_xscom_init); + +static int scom_debug_init(void) +{ + struct device_node *dn; + struct dentry *root; + int chip, rc; + + if (!firmware_has_feature(FW_FEATURE_OPAL)) + return 0; + + root = debugfs_create_dir("scom", powerpc_debugfs_root); + if (!root) + return -1; + + rc = 0; + for_each_node_with_property(dn, "scom-controller") { + chip = of_get_ibm_chip_id(dn); + WARN_ON(chip == -1); + rc |= scom_debug_init_one(root, dn, chip); + } + + return rc; +} +device_initcall(scom_debug_init); diff --git a/arch/powerpc/platforms/powernv/scom.c b/arch/powerpc/platforms/powernv/scom.c deleted file mode 100644 index acd6968d9277..000000000000 --- a/arch/powerpc/platforms/powernv/scom.c +++ /dev/null @@ -1,157 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright 2010 Benjamin Herrenschmidt, IBM Corp - * - * and David Gibson, IBM Corporation. - */ - -#include -#include -#include -#include -#include -#include - -#include "scom.h" - -const struct scom_controller *scom_controller; - -#ifdef CONFIG_SCOM_DEBUGFS -struct scom_debug_entry { - struct device_node *dn; - struct debugfs_blob_wrapper path; - char name[16]; -}; - -static ssize_t scom_debug_read(struct file *filp, char __user *ubuf, - size_t count, loff_t *ppos) -{ - struct scom_debug_entry *ent = filp->private_data; - u64 __user *ubuf64 = (u64 __user *)ubuf; - loff_t off = *ppos; - ssize_t done = 0; - u64 reg, reg_cnt, val; - scom_map_t map; - int rc; - - if (off < 0 || (off & 7) || (count & 7)) - return -EINVAL; - reg = off >> 3; - reg_cnt = count >> 3; - - map = scom_map(ent->dn, reg, reg_cnt); - if (!scom_map_ok(map)) - return -ENXIO; - - for (reg = 0; reg < reg_cnt; reg++) { - rc = scom_read(map, reg, &val); - if (!rc) - rc = put_user(val, ubuf64); - if (rc) { - if (!done) - done = rc; - break; - } - ubuf64++; - *ppos += 8; - done += 8; - } - scom_unmap(map); - return done; -} - -static ssize_t scom_debug_write(struct file* filp, const char __user *ubuf, - size_t count, loff_t *ppos) -{ - struct scom_debug_entry *ent = filp->private_data; - u64 __user *ubuf64 = (u64 __user *)ubuf; - loff_t off = *ppos; - ssize_t done = 0; - u64 reg, reg_cnt, val; - scom_map_t map; - int rc; - - if (off < 0 || (off & 7) || (count & 7)) - return -EINVAL; - reg = off >> 3; - reg_cnt = count >> 3; - - map = scom_map(ent->dn, reg, reg_cnt); - if (!scom_map_ok(map)) - return -ENXIO; - - for (reg = 0; reg < reg_cnt; reg++) { - rc = get_user(val, ubuf64); - if (!rc) - rc = scom_write(map, reg, val); - if (rc) { - if (!done) - done = rc; - break; - } - ubuf64++; - done += 8; - } - scom_unmap(map); - return done; -} - -static const struct file_operations scom_debug_fops = { - .read = scom_debug_read, - .write = scom_debug_write, - .open = simple_open, - .llseek = default_llseek, -}; - -static int scom_debug_init_one(struct dentry *root, struct device_node *dn, - int i) -{ - struct scom_debug_entry *ent; - struct dentry *dir; - - ent = kzalloc(sizeof(*ent), GFP_KERNEL); - if (!ent) - return -ENOMEM; - - ent->dn = of_node_get(dn); - snprintf(ent->name, 16, "%08x", i); - ent->path.data = (void*)kasprintf(GFP_KERNEL, "%pOF", dn); - ent->path.size = strlen((char *)ent->path.data); - - dir = debugfs_create_dir(ent->name, root); - if (!dir) { - of_node_put(dn); - kfree(ent->path.data); - kfree(ent); - return -1; - } - - debugfs_create_blob("devspec", 0400, dir, &ent->path); - debugfs_create_file("access", 0600, dir, ent, &scom_debug_fops); - - return 0; -} - -static int scom_debug_init(void) -{ - struct device_node *dn; - struct dentry *root; - int i, rc; - - root = debugfs_create_dir("scom", powerpc_debugfs_root); - if (!root) - return -1; - - i = rc = 0; - for_each_node_with_property(dn, "scom-controller") { - int id = of_get_ibm_chip_id(dn); - if (id == -1) - id = i; - rc |= scom_debug_init_one(root, dn, id); - i++; - } - - return rc; -} -device_initcall(scom_debug_init); -#endif /* CONFIG_SCOM_DEBUGFS */ diff --git a/arch/powerpc/platforms/powernv/scom.h b/arch/powerpc/platforms/powernv/scom.h deleted file mode 100644 index 9827a66ef147..000000000000 --- a/arch/powerpc/platforms/powernv/scom.h +++ /dev/null @@ -1,120 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright 2010 Benjamin Herrenschmidt, IBM Corp - * - * and David Gibson, IBM Corporation. - */ - -#ifndef _SCOM_H -#define _SCOM_H - -/* - * The SCOM bus is a sideband bus used for accessing various internal - * registers of the processor or the chipset. The implementation details - * differ between processors and platforms, and the access method as - * well. - * - * This API allows to "map" ranges of SCOM register numbers associated - * with a given SCOM controller. The later must be represented by a - * device node, though some implementations might support NULL if there - * is no possible ambiguity - * - * Then, scom_read/scom_write can be used to accesses registers inside - * that range. The argument passed is a register number relative to - * the beginning of the range mapped. - */ - -typedef void *scom_map_t; - -/* Value for an invalid SCOM map */ -#define SCOM_MAP_INVALID (NULL) - -/* The scom_controller data structure is what the platform passes - * to the core code in scom_init, it provides the actual implementation - * of all the SCOM functions - */ -struct scom_controller { - scom_map_t (*map)(struct device_node *ctrl_dev, u64 reg, u64 count); - void (*unmap)(scom_map_t map); - - int (*read)(scom_map_t map, u64 reg, u64 *value); - int (*write)(scom_map_t map, u64 reg, u64 value); -}; - -extern const struct scom_controller *scom_controller; - -/** - * scom_init - Initialize the SCOM backend, called by the platform - * @controller: The platform SCOM controller - */ -static inline void scom_init(const struct scom_controller *controller) -{ - scom_controller = controller; -} - -/** - * scom_map_ok - Test is a SCOM mapping is successful - * @map: The result of scom_map to test - */ -static inline int scom_map_ok(scom_map_t map) -{ - return map != SCOM_MAP_INVALID; -} - -/** - * scom_map - Map a block of SCOM registers - * @ctrl_dev: Device node of the SCOM controller - * some implementations allow NULL here - * @reg: first SCOM register to map - * @count: Number of SCOM registers to map - */ - -static inline scom_map_t scom_map(struct device_node *ctrl_dev, - u64 reg, u64 count) -{ - return scom_controller->map(ctrl_dev, reg, count); -} - -/** - * scom_unmap - Unmap a block of SCOM registers - * @map: Result of scom_map is to be unmapped - */ -static inline void scom_unmap(scom_map_t map) -{ - if (scom_map_ok(map)) - scom_controller->unmap(map); -} - -/** - * scom_read - Read a SCOM register - * @map: Result of scom_map - * @reg: Register index within that map - * @value: Updated with the value read - * - * Returns 0 (success) or a negative error code - */ -static inline int scom_read(scom_map_t map, u64 reg, u64 *value) -{ - int rc; - - rc = scom_controller->read(map, reg, value); - if (rc) - *value = 0xfffffffffffffffful; - return rc; -} - -/** - * scom_write - Write to a SCOM register - * @map: Result of scom_map - * @reg: Register index within that map - * @value: Value to write - * - * Returns 0 (success) or a negative error code - */ -static inline int scom_write(scom_map_t map, u64 reg, u64 value) -{ - return scom_controller->write(map, reg, value); -} - - -#endif /* _SCOM_H */ -- cgit v1.2.3 From 8c98db8d5851db45c50024a29c3dd1779dc4da0b Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Thu, 9 May 2019 15:11:18 +1000 Subject: powerpc/powernv: Fix checkpatch warnings in opal-xscom.c Signed-off-by: Andrew Donnellan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190509051119.7694-4-ajd@linux.ibm.com --- arch/powerpc/platforms/powernv/opal-xscom.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c index cb172af2f53a..fd510d961b8c 100644 --- a/arch/powerpc/platforms/powernv/opal-xscom.c +++ b/arch/powerpc/platforms/powernv/opal-xscom.c @@ -90,7 +90,7 @@ static ssize_t scom_debug_read(struct file *filp, char __user *ubuf, struct scom_debug_entry *ent = filp->private_data; u64 __user *ubuf64 = (u64 __user *)ubuf; loff_t off = *ppos; - ssize_t done = 0; + ssize_t done = 0; u64 reg, reg_base, reg_cnt, val; int rc; @@ -115,13 +115,13 @@ static ssize_t scom_debug_read(struct file *filp, char __user *ubuf, return done; } -static ssize_t scom_debug_write(struct file* filp, const char __user *ubuf, +static ssize_t scom_debug_write(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) { struct scom_debug_entry *ent = filp->private_data; u64 __user *ubuf64 = (u64 __user *)ubuf; loff_t off = *ppos; - ssize_t done = 0; + ssize_t done = 0; u64 reg, reg_base, reg_cnt, val; int rc; @@ -164,7 +164,7 @@ static int scom_debug_init_one(struct dentry *root, struct device_node *dn, ent->chip = chip; snprintf(ent->name, 16, "%08x", chip); - ent->path.data = (void*)kasprintf(GFP_KERNEL, "%pOF", dn); + ent->path.data = (void *)kasprintf(GFP_KERNEL, "%pOF", dn); ent->path.size = strlen((char *)ent->path.data); dir = debugfs_create_dir(ent->name, root); -- cgit v1.2.3 From 8b856a0942a1b4d832966985fcdf1a455eb6ab8c Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Thu, 9 May 2019 15:11:19 +1000 Subject: powerpc/configs: Disable SCOM_DEBUGFS in powernv_defconfig SCOM_DEBUGFS is really not needed for anything other than low-level hardware debugging. mpe: It also introduces a large and poorly documented/understood attack surface. Although the interface is only available to root, the kernel still aspires to restrict root to accessing hardware through well defined interfaces, which this is not. opal-prd uses its own interface (/dev/prd) for SCOM access, so it doesn't need SCOM_DEBUGFS. At some point in the future we'll introduce a debug config fragment where this can go instead. Signed-off-by: Andrew Donnellan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190509051119.7694-5-ajd@linux.ibm.com --- arch/powerpc/configs/powernv_defconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig index 34219d555e8a..6658cceb928c 100644 --- a/arch/powerpc/configs/powernv_defconfig +++ b/arch/powerpc/configs/powernv_defconfig @@ -38,7 +38,7 @@ CONFIG_MODULE_UNLOAD=y CONFIG_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y CONFIG_PARTITION_ADVANCED=y -CONFIG_SCOM_DEBUGFS=y +# CONFIG_SCOM_DEBUGFS is not set CONFIG_OPAL_PRD=y CONFIG_PPC_MEMTRACE=y # CONFIG_PPC_PSERIES is not set -- cgit v1.2.3 From e7de4f7b64c23e503a8c42af98d56f2a7462bd6d Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Fri, 3 May 2019 17:52:53 +1000 Subject: powerpc/powernv: Restrict OPAL symbol map to only be readable by root Currently the OPAL symbol map is globally readable, which seems bad as it contains physical addresses. Restrict it to root. Fixes: c8742f85125d ("powerpc/powernv: Expose OPAL firmware symbol map") Cc: stable@vger.kernel.org # v3.19+ Suggested-by: Michael Ellerman Signed-off-by: Andrew Donnellan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190503075253.22798-1-ajd@linux.ibm.com --- arch/powerpc/platforms/powernv/opal.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index aba443be7daa..d271accf224b 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -705,7 +705,10 @@ static ssize_t symbol_map_read(struct file *fp, struct kobject *kobj, bin_attr->size); } -static BIN_ATTR_RO(symbol_map, 0); +static struct bin_attribute symbol_map_attr = { + .attr = {.name = "symbol_map", .mode = 0400}, + .read = symbol_map_read +}; static void opal_export_symmap(void) { @@ -722,10 +725,10 @@ static void opal_export_symmap(void) return; /* Setup attributes */ - bin_attr_symbol_map.private = __va(be64_to_cpu(syms[0])); - bin_attr_symbol_map.size = be64_to_cpu(syms[1]); + symbol_map_attr.private = __va(be64_to_cpu(syms[0])); + symbol_map_attr.size = be64_to_cpu(syms[1]); - rc = sysfs_create_bin_file(opal_kobj, &bin_attr_symbol_map); + rc = sysfs_create_bin_file(opal_kobj, &symbol_map_attr); if (rc) pr_warn("Error %d creating OPAL symbols file\n", rc); } -- cgit v1.2.3 From 461cef2a676e7c578d0ef62969dbcb8237c8631d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 31 Jul 2019 06:31:41 +0000 Subject: powerpc/32: activate ARCH_HAS_PMEM_API and ARCH_HAS_UACCESS_FLUSHCACHE PPC32 also have flush_dcache_range() so it can also support ARCH_HAS_PMEM_API and ARCH_HAS_UACCESS_FLUSHCACHE without changes. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/a682a2f9db308c5cfe77e45aa3352e41bc9f4e33.1564554634.git.christophe.leroy@c-s.fr --- arch/powerpc/Kconfig | 4 ++-- arch/powerpc/lib/Makefile | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 77f6ebf97113..9bc8ac5b8c96 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -129,14 +129,14 @@ config PPC select ARCH_HAS_HUGEPD if HUGETLB_PAGE select ARCH_HAS_MMIOWB if PPC64 select ARCH_HAS_PHYS_TO_DMA - select ARCH_HAS_PMEM_API if PPC64 + select ARCH_HAS_PMEM_API select ARCH_HAS_PTE_DEVMAP if PPC_BOOK3S_64 select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_MEMBARRIER_CALLBACKS select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE && PPC64 select ARCH_HAS_STRICT_KERNEL_RWX if ((PPC_BOOK3S_64 || PPC32) && !RELOCATABLE && !HIBERNATION) select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST - select ARCH_HAS_UACCESS_FLUSHCACHE if PPC64 + select ARCH_HAS_UACCESS_FLUSHCACHE select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_KEEP_MEMBLOCK diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index eebc782d89a5..f030eea2171d 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -16,7 +16,7 @@ CFLAGS_code-patching.o += -DDISABLE_BRANCH_PROFILING CFLAGS_feature-fixups.o += -DDISABLE_BRANCH_PROFILING endif -obj-y += alloc.o code-patching.o feature-fixups.o +obj-y += alloc.o code-patching.o feature-fixups.o pmem.o ifndef CONFIG_KASAN obj-y += string.o memcmp_$(BITS).o @@ -39,7 +39,7 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \ memcpy_power7.o obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \ - memcpy_64.o pmem.o + memcpy_64.o obj64-$(CONFIG_SMP) += locks.o obj64-$(CONFIG_ALTIVEC) += vmx-helper.o -- cgit v1.2.3 From 1ebe0dcce1750109181d666394b7dfd9af9ff645 Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Fri, 2 Aug 2019 10:08:35 +1000 Subject: powerpc/xive: Update comment referencing magic loads from an ESB The comment above xive_esb_read() references magic loads from an ESB as described xive.h. This has been inaccurate since commit 12c1f339cd49 ("powerpc/xive: Move definition of ESB bits") which moved the description. Update the comment to reference the new location of the description in xive-regs.h Signed-off-by: Jordan Niethe Acked-by: Stewart Smith Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802000835.26191-1-jniethe5@gmail.com --- arch/powerpc/sysdev/xive/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 1cdb39575eae..083f657091d7 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -185,7 +185,7 @@ static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek) /* * This is used to perform the magic loads from an ESB - * described in xive.h + * described in xive-regs.h */ static notrace u8 xive_esb_read(struct xive_irq_data *xd, u32 offset) { -- cgit v1.2.3 From 9616dda1aaddb2080122aaeab96ad7fc064e36b4 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Thu, 1 Aug 2019 19:52:51 -0300 Subject: powerpc/pseries/hotplug-memory.c: Replace nested ifs by switch-case I noticed these nested ifs can be easily replaced by switch-cases, which can improve readability. Signed-off-by: Leonardo Bras Reviewed-by: David Hildenbrand Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190801225251.17864-1-leonardo@linux.ibm.com --- arch/powerpc/platforms/pseries/hotplug-memory.c | 26 +++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 46d0d35b9ca4..8e700390f3d6 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -880,34 +880,44 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog) switch (hp_elog->action) { case PSERIES_HP_ELOG_ACTION_ADD: - if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) { + switch (hp_elog->id_type) { + case PSERIES_HP_ELOG_ID_DRC_COUNT: count = hp_elog->_drc_u.drc_count; rc = dlpar_memory_add_by_count(count); - } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) { + break; + case PSERIES_HP_ELOG_ID_DRC_INDEX: drc_index = hp_elog->_drc_u.drc_index; rc = dlpar_memory_add_by_index(drc_index); - } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_IC) { + break; + case PSERIES_HP_ELOG_ID_DRC_IC: count = hp_elog->_drc_u.ic.count; drc_index = hp_elog->_drc_u.ic.index; rc = dlpar_memory_add_by_ic(count, drc_index); - } else { + break; + default: rc = -EINVAL; + break; } break; case PSERIES_HP_ELOG_ACTION_REMOVE: - if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) { + switch (hp_elog->id_type) { + case PSERIES_HP_ELOG_ID_DRC_COUNT: count = hp_elog->_drc_u.drc_count; rc = dlpar_memory_remove_by_count(count); - } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) { + break; + case PSERIES_HP_ELOG_ID_DRC_INDEX: drc_index = hp_elog->_drc_u.drc_index; rc = dlpar_memory_remove_by_index(drc_index); - } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_IC) { + break; + case PSERIES_HP_ELOG_ID_DRC_IC: count = hp_elog->_drc_u.ic.count; drc_index = hp_elog->_drc_u.ic.index; rc = dlpar_memory_remove_by_ic(count, drc_index); - } else { + break; + default: rc = -EINVAL; + break; } break; -- cgit v1.2.3 From 0c9c1d56397518eb823d458b00b06bcccd956794 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Tue, 6 Aug 2019 01:49:14 -0300 Subject: x86, s390: Move ARCH_HAS_MEM_ENCRYPT definition to arch/Kconfig powerpc is also going to use this feature, so put it in a generic location. Signed-off-by: Thiago Jung Bauermann Reviewed-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190806044919.10622-2-bauerman@linux.ibm.com --- arch/Kconfig | 3 +++ arch/s390/Kconfig | 4 +--- arch/x86/Kconfig | 4 +--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index a7b57dd42c26..89e2e3f64f79 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -925,6 +925,9 @@ config LOCK_EVENT_COUNTS the chance of application behavior change because of timing differences. The counts are reported via debugfs. +config ARCH_HAS_MEM_ENCRYPT + bool + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index a4ad2733eedf..f43319c44454 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -1,7 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -config ARCH_HAS_MEM_ENCRYPT - def_bool y - config MMU def_bool y @@ -68,6 +65,7 @@ config S390 select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_KCOV + select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_MEMORY select ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 222855cc0158..06027809c599 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -68,6 +68,7 @@ config X86 select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV if X86_64 + select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_PTE_DEVMAP if X86_64 @@ -1518,9 +1519,6 @@ config X86_CPA_STATISTICS helps to determine the effectiveness of preserving large and huge page mappings when mapping protections are changed. -config ARCH_HAS_MEM_ENCRYPT - def_bool y - config AMD_MEM_ENCRYPT bool "AMD Secure Memory Encryption (SME) support" depends on X86_64 && CPU_SUP_AMD -- cgit v1.2.3 From 47e5d8f9ed34a5310f45f48bbc15f1e61d83b6db Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Tue, 6 Aug 2019 01:49:15 -0300 Subject: swiotlb: Remove call to sme_active() sme_active() is an x86-specific function so it's better not to call it from generic code. There's no need to mention which memory encryption feature is active, so just use a more generic message. Besides, other architectures will have different names for similar technology. Signed-off-by: Thiago Jung Bauermann Reviewed-by: Christoph Hellwig Reviewed-by: Tom Lendacky Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190806044919.10622-3-bauerman@linux.ibm.com --- kernel/dma/swiotlb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 9de232229063..f29caad71e13 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -461,8 +461,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); if (mem_encrypt_active()) - pr_warn_once("%s is active and system is using DMA bounce buffers\n", - sme_active() ? "SME" : "SEV"); + pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n"); mask = dma_get_seg_boundary(hwdev); -- cgit v1.2.3 From e740815a97e2b6d6446792f4328378e66de166d1 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Tue, 6 Aug 2019 01:49:16 -0300 Subject: dma-mapping: Remove dma_check_mask() sme_active() is an x86-specific function so it's better not to call it from generic code. Christoph Hellwig mentioned that "There is no reason why we should have a special debug printk just for one specific reason why there is a requirement for a large DMA mask.", so just remove dma_check_mask(). Signed-off-by: Thiago Jung Bauermann Reviewed-by: Christoph Hellwig Reviewed-by: Tom Lendacky Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190806044919.10622-4-bauerman@linux.ibm.com --- kernel/dma/mapping.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 1f628e7ac709..61eeefbfcb36 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -291,12 +291,6 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, } EXPORT_SYMBOL(dma_free_attrs); -static inline void dma_check_mask(struct device *dev, u64 mask) -{ - if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1))) - dev_warn(dev, "SME is active, device will require DMA bounce buffers\n"); -} - int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -327,7 +321,6 @@ int dma_set_mask(struct device *dev, u64 mask) return -EIO; arch_dma_set_mask(dev, mask); - dma_check_mask(dev, mask); *dev->dma_mask = mask; return 0; } @@ -345,7 +338,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask) if (!dma_supported(dev, mask)) return -EIO; - dma_check_mask(dev, mask); dev->coherent_dma_mask = mask; return 0; } -- cgit v1.2.3 From 284e21fab2cfcf90dacce565e0b12f29e5df00c1 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Tue, 6 Aug 2019 01:49:17 -0300 Subject: x86, s390/mm: Move sme_active() and sme_me_mask to x86-specific header Now that generic code doesn't reference them, move sme_active() and sme_me_mask to x86's . Also remove the export for sme_active() since it's only used in files that won't be built as modules. sme_me_mask on the other hand is used in arch/x86/kvm/svm.c (via __sme_set() and __psp_pa()) which can be built as a module so its export needs to stay. Signed-off-by: Thiago Jung Bauermann Reviewed-by: Christoph Hellwig Reviewed-by: Tom Lendacky Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190806044919.10622-5-bauerman@linux.ibm.com --- arch/s390/include/asm/mem_encrypt.h | 4 +--- arch/x86/include/asm/mem_encrypt.h | 10 ++++++++++ arch/x86/mm/mem_encrypt.c | 1 - include/linux/mem_encrypt.h | 14 +------------- 4 files changed, 12 insertions(+), 17 deletions(-) diff --git a/arch/s390/include/asm/mem_encrypt.h b/arch/s390/include/asm/mem_encrypt.h index 3eb018508190..ff813a56bc30 100644 --- a/arch/s390/include/asm/mem_encrypt.h +++ b/arch/s390/include/asm/mem_encrypt.h @@ -4,9 +4,7 @@ #ifndef __ASSEMBLY__ -#define sme_me_mask 0ULL - -static inline bool sme_active(void) { return false; } +static inline bool mem_encrypt_active(void) { return false; } extern bool sev_active(void); int set_memory_encrypted(unsigned long addr, int numpages); diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 0c196c47d621..848ce43b9040 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -92,6 +92,16 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[]; +static inline bool mem_encrypt_active(void) +{ + return sme_me_mask; +} + +static inline u64 sme_get_me_mask(void) +{ + return sme_me_mask; +} + #endif /* __ASSEMBLY__ */ #endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index fece30ca8b0c..94da5a88abe6 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -344,7 +344,6 @@ bool sme_active(void) { return sme_me_mask && !sev_enabled; } -EXPORT_SYMBOL(sme_active); bool sev_active(void) { diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h index 470bd53a89df..0c5b0ff9eb29 100644 --- a/include/linux/mem_encrypt.h +++ b/include/linux/mem_encrypt.h @@ -18,23 +18,11 @@ #else /* !CONFIG_ARCH_HAS_MEM_ENCRYPT */ -#define sme_me_mask 0ULL - -static inline bool sme_active(void) { return false; } +static inline bool mem_encrypt_active(void) { return false; } static inline bool sev_active(void) { return false; } #endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */ -static inline bool mem_encrypt_active(void) -{ - return sme_me_mask; -} - -static inline u64 sme_get_me_mask(void) -{ - return sme_me_mask; -} - #ifdef CONFIG_AMD_MEM_ENCRYPT /* * The __sme_set() and __sme_clr() macros are useful for adding or removing -- cgit v1.2.3 From ae7eb82a92fae4d255a0caa9f7c0f99e3babfec1 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Tue, 6 Aug 2019 01:49:18 -0300 Subject: fs/core/vmcore: Move sev_active() reference to x86 arch code Secure Encrypted Virtualization is an x86-specific feature, so it shouldn't appear in generic kernel code because it forces non-x86 architectures to define the sev_active() function, which doesn't make a lot of sense. To solve this problem, add an x86 elfcorehdr_read() function to override the generic weak implementation. To do that, it's necessary to make read_from_oldmem() public so that it can be used outside of vmcore.c. Also, remove the export for sev_active() since it's only used in files that won't be built as modules. Signed-off-by: Thiago Jung Bauermann Reviewed-by: Christoph Hellwig Reviewed-by: Lianbo Jiang Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190806044919.10622-6-bauerman@linux.ibm.com --- arch/x86/kernel/crash_dump_64.c | 5 +++++ arch/x86/mm/mem_encrypt.c | 1 - fs/proc/vmcore.c | 8 ++++---- include/linux/crash_dump.h | 14 ++++++++++++++ include/linux/mem_encrypt.h | 1 - 5 files changed, 23 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 22369dd5de3b..045e82e8945b 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -70,3 +70,8 @@ ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, { return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true); } + +ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) +{ + return read_from_oldmem(buf, count, ppos, 0, sev_active()); +} diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 94da5a88abe6..9268c12458c8 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -349,7 +349,6 @@ bool sev_active(void) { return sme_me_mask && sev_enabled; } -EXPORT_SYMBOL(sev_active); /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */ bool force_dma_unencrypted(struct device *dev) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 7bcc92add72c..7b13988796e1 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -104,9 +104,9 @@ static int pfn_is_ram(unsigned long pfn) } /* Reads a page from the oldmem device from given offset. */ -static ssize_t read_from_oldmem(char *buf, size_t count, - u64 *ppos, int userbuf, - bool encrypted) +ssize_t read_from_oldmem(char *buf, size_t count, + u64 *ppos, int userbuf, + bool encrypted) { unsigned long pfn, offset; size_t nr_bytes; @@ -170,7 +170,7 @@ void __weak elfcorehdr_free(unsigned long long addr) */ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0, sev_active()); + return read_from_oldmem(buf, count, ppos, 0, false); } /* diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index f774c5eb9e3c..4664fc1871de 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -115,4 +115,18 @@ static inline int vmcore_add_device_dump(struct vmcoredd_data *data) return -EOPNOTSUPP; } #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + +#ifdef CONFIG_PROC_VMCORE +ssize_t read_from_oldmem(char *buf, size_t count, + u64 *ppos, int userbuf, + bool encrypted); +#else +static inline ssize_t read_from_oldmem(char *buf, size_t count, + u64 *ppos, int userbuf, + bool encrypted) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_PROC_VMCORE */ + #endif /* LINUX_CRASHDUMP_H */ diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h index 0c5b0ff9eb29..5c4a18a91f89 100644 --- a/include/linux/mem_encrypt.h +++ b/include/linux/mem_encrypt.h @@ -19,7 +19,6 @@ #else /* !CONFIG_ARCH_HAS_MEM_ENCRYPT */ static inline bool mem_encrypt_active(void) { return false; } -static inline bool sev_active(void) { return false; } #endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */ -- cgit v1.2.3 From 5cbdaeefb655072d304744812708b3f3a31c6b51 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Tue, 6 Aug 2019 01:49:19 -0300 Subject: s390/mm: Remove sev_active() function All references to sev_active() were moved to arch/x86 so we don't need to define it for s390 anymore. Signed-off-by: Thiago Jung Bauermann Reviewed-by: Christoph Hellwig Reviewed-by: Halil Pasic Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190806044919.10622-7-bauerman@linux.ibm.com --- arch/s390/include/asm/mem_encrypt.h | 1 - arch/s390/mm/init.c | 7 +------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/s390/include/asm/mem_encrypt.h b/arch/s390/include/asm/mem_encrypt.h index ff813a56bc30..2542cbf7e2d1 100644 --- a/arch/s390/include/asm/mem_encrypt.h +++ b/arch/s390/include/asm/mem_encrypt.h @@ -5,7 +5,6 @@ #ifndef __ASSEMBLY__ static inline bool mem_encrypt_active(void) { return false; } -extern bool sev_active(void); int set_memory_encrypted(unsigned long addr, int numpages); int set_memory_decrypted(unsigned long addr, int numpages); diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 20340a03ad90..a124f19f7b3c 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -156,14 +156,9 @@ int set_memory_decrypted(unsigned long addr, int numpages) } /* are we a protected virtualization guest? */ -bool sev_active(void) -{ - return is_prot_virt_guest(); -} - bool force_dma_unencrypted(struct device *dev) { - return sev_active(); + return is_prot_virt_guest(); } /* protected virtualization */ -- cgit v1.2.3 From b214a8f2ea964b5ff2cc4a1e8bd8c74f64741f20 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 1 Aug 2019 10:32:31 +0200 Subject: powerpc/xive: Use GFP_KERNEL instead of GFP_ATOMIC in 'xive_irq_bitmap_add()' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no need to use GFP_ATOMIC here. GFP_KERNEL should be enough. GFP_KERNEL is also already used for another allocation just a few lines below. Signed-off-by: Christophe JAILLET Reviewed-by: Cédric Le Goater Reviewed-by: Greg Kurz Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/85d5d247ce753befd6aa63c473f7823de6520ccd.1564647619.git.christophe.jaillet@wanadoo.fr --- arch/powerpc/sysdev/xive/spapr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c index 8ef9cf4ebb1c..b4f5eb9e0f82 100644 --- a/arch/powerpc/sysdev/xive/spapr.c +++ b/arch/powerpc/sysdev/xive/spapr.c @@ -45,7 +45,7 @@ static int xive_irq_bitmap_add(int base, int count) { struct xive_irq_bitmap *xibm; - xibm = kzalloc(sizeof(*xibm), GFP_ATOMIC); + xibm = kzalloc(sizeof(*xibm), GFP_KERNEL); if (!xibm) return -ENOMEM; -- cgit v1.2.3 From fd3806562f450a6189c31e0d2cb9cd4b208dcf2d Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 1 Aug 2019 10:32:42 +0200 Subject: powerpc/xive: Add a check for memory allocation failure The result of this kzalloc is not checked. Add a check and corresponding error handling code. Signed-off-by: Christophe JAILLET Reviewed-by: Greg Kurz Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/cc53462734dfeaf15b6bad0e626b483de18656b4.1564647619.git.christophe.jaillet@wanadoo.fr --- arch/powerpc/sysdev/xive/spapr.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c index b4f5eb9e0f82..52198131c75e 100644 --- a/arch/powerpc/sysdev/xive/spapr.c +++ b/arch/powerpc/sysdev/xive/spapr.c @@ -53,6 +53,10 @@ static int xive_irq_bitmap_add(int base, int count) xibm->base = base; xibm->count = count; xibm->bitmap = kzalloc(xibm->count, GFP_KERNEL); + if (!xibm->bitmap) { + kfree(xibm); + return -ENOMEM; + } list_add(&xibm->list, &xive_irq_bitmaps); pr_info("Using IRQ range [%x-%x]", xibm->base, -- cgit v1.2.3 From 2b87a2553aa04105724d6c844e223415985246ed Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 16 May 2019 12:04:37 +1000 Subject: powerpc/64s: Make boot look nice(r) Radix boot looks like this: ----------------------------------------------------- phys_mem_size = 0x200000000 dcache_bsize = 0x80 icache_bsize = 0x80 cpu_features = 0x0000c06f8f5fb1a7 possible = 0x0000fbffcf5fb1a7 always = 0x00000003800081a1 cpu_user_features = 0xdc0065c2 0xaee00000 mmu_features = 0xbc006041 firmware_features = 0x0000000010000000 hash-mmu: ppc64_pft_size = 0x0 hash-mmu: kernel vmalloc start = 0xc008000000000000 hash-mmu: kernel IO start = 0xc00a000000000000 hash-mmu: kernel vmemmap start = 0xc00c000000000000 ----------------------------------------------------- Fix: ----------------------------------------------------- phys_mem_size = 0x200000000 dcache_bsize = 0x80 icache_bsize = 0x80 cpu_features = 0x0000c06f8f5fb1a7 possible = 0x0000fbffcf5fb1a7 always = 0x00000003800081a1 cpu_user_features = 0xdc0065c2 0xaee00000 mmu_features = 0xbc006041 firmware_features = 0x0000000010000000 vmalloc start = 0xc008000000000000 IO start = 0xc00a000000000000 vmemmap start = 0xc00c000000000000 ----------------------------------------------------- Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190516020437.11783-1-npiggin@gmail.com --- arch/powerpc/kernel/setup-common.c | 8 +++++++- arch/powerpc/mm/book3s64/hash_utils.c | 3 --- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 1f8db666468d..7b4c921ec73f 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -806,9 +806,15 @@ static __init void print_system_info(void) pr_info("mmu_features = 0x%08x\n", cur_cpu_spec->mmu_features); #ifdef CONFIG_PPC64 pr_info("firmware_features = 0x%016lx\n", powerpc_firmware_features); +#ifdef CONFIG_PPC_BOOK3S + pr_info("vmalloc start = 0x%lx\n", KERN_VIRT_START); + pr_info("IO start = 0x%lx\n", KERN_IO_START); + pr_info("vmemmap start = 0x%lx\n", (unsigned long)vmemmap); +#endif #endif - print_system_hash_info(); + if (!early_radix_enabled()) + print_system_hash_info(); if (PHYSICAL_START > 0) pr_info("physical_start = 0x%llx\n", diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index b8ad14bb1170..e6d471058597 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1957,7 +1957,4 @@ void __init print_system_hash_info(void) if (htab_hash_mask) pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask); - pr_info("kernel vmalloc start = 0x%lx\n", KERN_VIRT_START); - pr_info("kernel IO start = 0x%lx\n", KERN_IO_START); - pr_info("kernel vmemmap start = 0x%lx\n", (unsigned long)vmemmap); } -- cgit v1.2.3 From c784be435d5dae28d3b03db31753dd7a18733f0c Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Wed, 15 May 2019 13:15:52 +0530 Subject: powerpc/pseries: Fix cpu_hotplug_lock acquisition in resize_hpt() The calls to arch_add_memory()/arch_remove_memory() are always made with the read-side cpu_hotplug_lock acquired via memory_hotplug_begin(). On pSeries, arch_add_memory()/arch_remove_memory() eventually call resize_hpt() which in turn calls stop_machine() which acquires the read-side cpu_hotplug_lock again, thereby resulting in the recursive acquisition of this lock. In the absence of CONFIG_PROVE_LOCKING, we hadn't observed a system lockup during a memory hotplug operation because cpus_read_lock() is a per-cpu rwsem read, which, in the fast-path (in the absence of the writer, which in our case is a CPU-hotplug operation) simply increments the read_count on the semaphore. Thus a recursive read in the fast-path doesn't cause any problems. However, we can hit this problem in practice if there is a concurrent CPU-Hotplug operation in progress which is waiting to acquire the write-side of the lock. This will cause the second recursive read to block until the writer finishes. While the writer is blocked since the first read holds the lock. Thus both the reader as well as the writers fail to make any progress thereby blocking both CPU-Hotplug as well as Memory Hotplug operations. Memory-Hotplug CPU-Hotplug CPU 0 CPU 1 ------ ------ 1. down_read(cpu_hotplug_lock.rw_sem) [memory_hotplug_begin] 2. down_write(cpu_hotplug_lock.rw_sem) [cpu_up/cpu_down] 3. down_read(cpu_hotplug_lock.rw_sem) [stop_machine()] Lockdep complains as follows in these code-paths. swapper/0/1 is trying to acquire lock: (____ptrval____) (cpu_hotplug_lock.rw_sem){++++}, at: stop_machine+0x2c/0x60 but task is already holding lock: (____ptrval____) (cpu_hotplug_lock.rw_sem){++++}, at: mem_hotplug_begin+0x20/0x50 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(cpu_hotplug_lock.rw_sem); lock(cpu_hotplug_lock.rw_sem); *** DEADLOCK *** May be due to missing lock nesting notation 3 locks held by swapper/0/1: #0: (____ptrval____) (&dev->mutex){....}, at: __driver_attach+0x12c/0x1b0 #1: (____ptrval____) (cpu_hotplug_lock.rw_sem){++++}, at: mem_hotplug_begin+0x20/0x50 #2: (____ptrval____) (mem_hotplug_lock.rw_sem){++++}, at: percpu_down_write+0x54/0x1a0 stack backtrace: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.0.0-rc5-58373-gbc99402235f3-dirty #166 Call Trace: dump_stack+0xe8/0x164 (unreliable) __lock_acquire+0x1110/0x1c70 lock_acquire+0x240/0x290 cpus_read_lock+0x64/0xf0 stop_machine+0x2c/0x60 pseries_lpar_resize_hpt+0x19c/0x2c0 resize_hpt_for_hotplug+0x70/0xd0 arch_add_memory+0x58/0xfc devm_memremap_pages+0x5e8/0x8f0 pmem_attach_disk+0x764/0x830 nvdimm_bus_probe+0x118/0x240 really_probe+0x230/0x4b0 driver_probe_device+0x16c/0x1e0 __driver_attach+0x148/0x1b0 bus_for_each_dev+0x90/0x130 driver_attach+0x34/0x50 bus_add_driver+0x1a8/0x360 driver_register+0x108/0x170 __nd_driver_register+0xd0/0xf0 nd_pmem_driver_init+0x34/0x48 do_one_initcall+0x1e0/0x45c kernel_init_freeable+0x540/0x64c kernel_init+0x2c/0x160 ret_from_kernel_thread+0x5c/0x68 Fix this issue by 1) Requiring all the calls to pseries_lpar_resize_hpt() be made with cpu_hotplug_lock held. 2) In pseries_lpar_resize_hpt() invoke stop_machine_cpuslocked() as a consequence of 1) 3) To satisfy 1), in hpt_order_set(), call mmu_hash_ops.resize_hpt() with cpu_hotplug_lock held. Fixes: dbcf929c0062 ("powerpc/pseries: Add support for hash table resizing") Cc: stable@vger.kernel.org # v4.11+ Reported-by: Aneesh Kumar K.V Signed-off-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1557906352-29048-1-git-send-email-ego@linux.vnet.ibm.com --- arch/powerpc/mm/book3s64/hash_utils.c | 9 ++++++++- arch/powerpc/platforms/pseries/lpar.c | 8 ++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index e6d471058597..c363e850550e 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -1931,10 +1932,16 @@ static int hpt_order_get(void *data, u64 *val) static int hpt_order_set(void *data, u64 val) { + int ret; + if (!mmu_hash_ops.resize_hpt) return -ENODEV; - return mmu_hash_ops.resize_hpt(val); + cpus_read_lock(); + ret = mmu_hash_ops.resize_hpt(val); + cpus_read_unlock(); + + return ret; } DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n"); diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 09bb878c21e0..4f76e5f30c97 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -1413,7 +1413,10 @@ static int pseries_lpar_resize_hpt_commit(void *data) return 0; } -/* Must be called in user context */ +/* + * Must be called in process context. The caller must hold the + * cpus_lock. + */ static int pseries_lpar_resize_hpt(unsigned long shift) { struct hpt_resize_state state = { @@ -1467,7 +1470,8 @@ static int pseries_lpar_resize_hpt(unsigned long shift) t1 = ktime_get(); - rc = stop_machine(pseries_lpar_resize_hpt_commit, &state, NULL); + rc = stop_machine_cpuslocked(pseries_lpar_resize_hpt_commit, + &state, NULL); t2 = ktime_get(); -- cgit v1.2.3 From 56090a3902c80c296e822d11acdb6a101b322c52 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 18 Jul 2019 15:11:36 +1000 Subject: powerpc/powernv/ioda: Fix race in TCE level allocation pnv_tce() returns a pointer to a TCE entry and originally a TCE table would be pre-allocated. For the default case of 2GB window the table needs only a single level and that is fine. However if more levels are requested, it is possible to get a race when 2 threads want a pointer to a TCE entry from the same page of TCEs. This adds cmpxchg to handle the race. Note that once TCE is non-zero, it cannot become zero again. Fixes: a68bd1267b72 ("powerpc/powernv/ioda: Allocate indirect TCE levels on demand") CC: stable@vger.kernel.org # v4.19+ Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190718051139.74787-2-aik@ozlabs.ru --- arch/powerpc/platforms/powernv/pci-ioda-tce.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index e28f03e1eb5e..8d6569590161 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -48,6 +48,9 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift) return addr; } +static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr, + unsigned long size, unsigned int levels); + static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc) { __be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base; @@ -57,9 +60,9 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc) while (level) { int n = (idx & mask) >> (level * shift); - unsigned long tce; + unsigned long oldtce, tce = be64_to_cpu(READ_ONCE(tmp[n])); - if (tmp[n] == 0) { + if (!tce) { __be64 *tmp2; if (!alloc) @@ -70,10 +73,15 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc) if (!tmp2) return NULL; - tmp[n] = cpu_to_be64(__pa(tmp2) | - TCE_PCI_READ | TCE_PCI_WRITE); + tce = __pa(tmp2) | TCE_PCI_READ | TCE_PCI_WRITE; + oldtce = be64_to_cpu(cmpxchg(&tmp[n], 0, + cpu_to_be64(tce))); + if (oldtce) { + pnv_pci_ioda2_table_do_free_pages(tmp2, + ilog2(tbl->it_level_size) + 3, 1); + tce = oldtce; + } } - tce = be64_to_cpu(tmp[n]); tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE)); idx &= ~mask; -- cgit v1.2.3 From 4f7e0babbc7c46fc0db4f5c14fe96bf6f4d69502 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 18 Jul 2019 15:11:37 +1000 Subject: powerpc/iommu: Allow bypass-only for DMA POWER8 and newer support a bypass mode which maps all host memory to PCI buses so an IOMMU table is not always required. However if we fail to create such a table, the DMA setup fails and the kernel does not boot. This skips the 32bit DMA setup check if the bypass is selected. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190718051139.74787-3-aik@ozlabs.ru --- arch/powerpc/kernel/dma-iommu.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index a0879674a9c8..c963d704fa31 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -122,18 +122,17 @@ int dma_iommu_dma_supported(struct device *dev, u64 mask) { struct iommu_table *tbl = get_iommu_table_base(dev); - if (!tbl) { - dev_info(dev, "Warning: IOMMU dma not supported: mask 0x%08llx" - ", table unavailable\n", mask); - return 0; - } - if (dev_is_pci(dev) && dma_iommu_bypass_supported(dev, mask)) { dev->archdata.iommu_bypass = true; dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n"); return 1; } + if (!tbl) { + dev_err(dev, "Warning: IOMMU dma not supported: mask 0x%08llx, table unavailable\n", mask); + return 0; + } + if (tbl->it_offset > (mask >> tbl->it_page_shift)) { dev_info(dev, "Warning: IOMMU offset too big for device mask\n"); dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n", -- cgit v1.2.3 From c37c792dec0929dbb6360a609fb00fa20bb16fc2 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 18 Jul 2019 15:11:38 +1000 Subject: powerpc/powernv/ioda2: Allocate TCE table levels on demand for default DMA window We allocate only the first level of multilevel TCE tables for KVM already (alloc_userspace_copy==true), and the rest is allocated on demand. This is not enabled though for bare metal. This removes the KVM limitation (implicit, via the alloc_userspace_copy parameter) and always allocates just the first level. The on-demand allocation of missing levels is already implemented. As from now on DMA map might happen with disabled interrupts, this allocates TCEs with GFP_ATOMIC; otherwise lockdep reports errors 1]. In practice just a single page is allocated there so chances for failure are quite low. To save time when creating a new clean table, this skips non-allocated indirect TCE entries in pnv_tce_free just like we already do in the VFIO IOMMU TCE driver. This changes the default level number from 1 to 2 to reduce the amount of memory required for the default 32bit DMA window at the boot time. The default window size is up to 2GB which requires 4MB of TCEs which is unlikely to be used entirely or at all as most devices these days are 64bit capable so by switching to 2 levels by default we save 4032KB of RAM per a device. While at this, add __GFP_NOWARN to alloc_pages_node() as the userspace can trigger this path via VFIO, see the failure and try creating a table again with different parameters which might succeed. [1]: === BUG: sleeping function called from invalid context at mm/page_alloc.c:4596 in_atomic(): 1, irqs_disabled(): 1, pid: 1038, name: scsi_eh_1 2 locks held by scsi_eh_1/1038: #0: 000000005efd659a (&host->eh_mutex){+.+.}, at: ata_eh_acquire+0x34/0x80 #1: 0000000006cf56a6 (&(&host->lock)->rlock){....}, at: ata_exec_internal_sg+0xb0/0x5c0 irq event stamp: 500 hardirqs last enabled at (499): [] _raw_spin_unlock_irqrestore+0x94/0xd0 hardirqs last disabled at (500): [] _raw_spin_lock_irqsave+0x44/0x120 softirqs last enabled at (0): [] copy_process.isra.4.part.5+0x640/0x1a80 softirqs last disabled at (0): [<0000000000000000>] 0x0 CPU: 73 PID: 1038 Comm: scsi_eh_1 Not tainted 5.2.0-rc6-le_nv2_aikATfstn1-p1 #634 Call Trace: [c000003d064cef50] [c000000000c8e6c4] dump_stack+0xe8/0x164 (unreliable) [c000003d064cefa0] [c00000000014ed78] ___might_sleep+0x2f8/0x310 [c000003d064cf020] [c0000000003ca084] __alloc_pages_nodemask+0x2a4/0x1560 [c000003d064cf220] [c0000000000c2530] pnv_alloc_tce_level.isra.0+0x90/0x130 [c000003d064cf290] [c0000000000c2888] pnv_tce+0x128/0x3b0 [c000003d064cf360] [c0000000000c2c00] pnv_tce_build+0xb0/0xf0 [c000003d064cf3c0] [c0000000000bbd9c] pnv_ioda2_tce_build+0x3c/0xb0 [c000003d064cf400] [c00000000004cfe0] ppc_iommu_map_sg+0x210/0x550 [c000003d064cf510] [c00000000004b7a4] dma_iommu_map_sg+0x74/0xb0 [c000003d064cf530] [c000000000863944] ata_qc_issue+0x134/0x470 [c000003d064cf5b0] [c000000000863ec4] ata_exec_internal_sg+0x244/0x5c0 [c000003d064cf700] [c0000000008642d0] ata_exec_internal+0x90/0xe0 [c000003d064cf780] [c0000000008650ac] ata_dev_read_id+0x2ec/0x640 [c000003d064cf8d0] [c000000000878e28] ata_eh_recover+0x948/0x16d0 [c000003d064cfa10] [c00000000087d760] sata_pmp_error_handler+0x480/0xbf0 [c000003d064cfbc0] [c000000000884624] ahci_error_handler+0x74/0xe0 [c000003d064cfbf0] [c000000000879fa8] ata_scsi_port_error_handler+0x2d8/0x7c0 [c000003d064cfca0] [c00000000087a544] ata_scsi_error+0xb4/0x100 [c000003d064cfd00] [c000000000802450] scsi_error_handler+0x120/0x510 [c000003d064cfdb0] [c000000000140c48] kthread+0x1b8/0x1c0 [c000003d064cfe20] [c00000000000bd8c] ret_from_kernel_thread+0x5c/0x70 ata1: SATA link up 6.0 Gbps (SStatus 133 SControl 300) irq event stamp: 2305 ======================================================== hardirqs last enabled at (2305): [] fast_exc_return_irq+0x28/0x34 hardirqs last disabled at (2303): [] __do_softirq+0x4a0/0x654 WARNING: possible irq lock inversion dependency detected 5.2.0-rc6-le_nv2_aikATfstn1-p1 #634 Tainted: G W softirqs last enabled at (2304): [] __do_softirq+0x524/0x654 softirqs last disabled at (2297): [] irq_exit+0x128/0x180 -------------------------------------------------------- swapper/0/0 just changed the state of lock: 0000000006cf56a6 (&(&host->lock)->rlock){-...}, at: ahci_single_level_irq_intr+0xac/0x120 but this lock took another, HARDIRQ-unsafe lock in the past: (fs_reclaim){+.+.} and interrupts could create inverse lock ordering between them. other info that might help us debug this: Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); local_irq_disable(); lock(&(&host->lock)->rlock); lock(fs_reclaim); lock(&(&host->lock)->rlock); *** DEADLOCK *** no locks held by swapper/0/0. the shortest dependencies between 2nd lock and 1st lock: -> (fs_reclaim){+.+.} ops: 167579 { HARDIRQ-ON-W at: lock_acquire+0xf8/0x2a0 fs_reclaim_acquire.part.23+0x44/0x60 kmem_cache_alloc_node_trace+0x80/0x590 alloc_desc+0x64/0x270 __irq_alloc_descs+0x2e4/0x3a0 irq_domain_alloc_descs+0xb0/0x150 irq_create_mapping+0x168/0x2c0 xics_smp_probe+0x2c/0x98 pnv_smp_probe+0x40/0x9c smp_prepare_cpus+0x524/0x6c4 kernel_init_freeable+0x1b4/0x650 kernel_init+0x2c/0x148 ret_from_kernel_thread+0x5c/0x70 SOFTIRQ-ON-W at: lock_acquire+0xf8/0x2a0 fs_reclaim_acquire.part.23+0x44/0x60 kmem_cache_alloc_node_trace+0x80/0x590 alloc_desc+0x64/0x270 __irq_alloc_descs+0x2e4/0x3a0 irq_domain_alloc_descs+0xb0/0x150 irq_create_mapping+0x168/0x2c0 xics_smp_probe+0x2c/0x98 pnv_smp_probe+0x40/0x9c smp_prepare_cpus+0x524/0x6c4 kernel_init_freeable+0x1b4/0x650 kernel_init+0x2c/0x148 ret_from_kernel_thread+0x5c/0x70 INITIAL USE at: lock_acquire+0xf8/0x2a0 fs_reclaim_acquire.part.23+0x44/0x60 kmem_cache_alloc_node_trace+0x80/0x590 alloc_desc+0x64/0x270 __irq_alloc_descs+0x2e4/0x3a0 irq_domain_alloc_descs+0xb0/0x150 irq_create_mapping+0x168/0x2c0 xics_smp_probe+0x2c/0x98 pnv_smp_probe+0x40/0x9c smp_prepare_cpus+0x524/0x6c4 kernel_init_freeable+0x1b4/0x650 kernel_init+0x2c/0x148 ret_from_kernel_thread+0x5c/0x70 } === Signed-off-by: Alexey Kardashevskiy Reviewed-by: Alistair Popple Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190718051139.74787-4-aik@ozlabs.ru --- arch/powerpc/platforms/powernv/pci-ioda-tce.c | 20 ++++++++++---------- arch/powerpc/platforms/powernv/pci.h | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index 8d6569590161..a0b9c0c23ed2 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -36,7 +36,8 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift) struct page *tce_mem = NULL; __be64 *addr; - tce_mem = alloc_pages_node(nid, GFP_KERNEL, shift - PAGE_SHIFT); + tce_mem = alloc_pages_node(nid, GFP_ATOMIC | __GFP_NOWARN, + shift - PAGE_SHIFT); if (!tce_mem) { pr_err("Failed to allocate a TCE memory, level shift=%d\n", shift); @@ -169,6 +170,9 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages) if (ptce) *ptce = cpu_to_be64(0); + else + /* Skip the rest of the level */ + i |= tbl->it_level_size - 1; } } @@ -268,7 +272,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, unsigned int table_shift = max_t(unsigned int, entries_shift + 3, PAGE_SHIFT); const unsigned long tce_table_size = 1UL << table_shift; - unsigned int tmplevels = levels; if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS)) return -EINVAL; @@ -276,9 +279,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, if (!is_power_of_2(window_size)) return -EINVAL; - if (alloc_userspace_copy && (window_size > (1ULL << 32))) - tmplevels = 1; - /* Adjust direct table size from window_size and levels */ entries_shift = (entries_shift + levels - 1) / levels; level_shift = entries_shift + 3; @@ -289,7 +289,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, /* Allocate TCE table */ addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, - tmplevels, tce_table_size, &offset, &total_allocated); + 1, tce_table_size, &offset, &total_allocated); /* addr==NULL means that the first level allocation failed */ if (!addr) @@ -300,18 +300,18 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, * we did not allocate as much as we wanted, * release partially allocated table. */ - if (tmplevels == levels && offset < tce_table_size) + if (levels == 1 && offset < tce_table_size) goto free_tces_exit; /* Allocate userspace view of the TCE table */ if (alloc_userspace_copy) { offset = 0; uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, - tmplevels, tce_table_size, &offset, + 1, tce_table_size, &offset, &total_allocated_uas); if (!uas) goto free_tces_exit; - if (tmplevels == levels && (offset < tce_table_size || + if (levels == 1 && (offset < tce_table_size || total_allocated_uas != total_allocated)) goto free_uas_exit; } @@ -326,7 +326,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d/%d\n", window_size, tce_table_size, bus_offset, tbl->it_base, - tbl->it_userspace, tmplevels, levels); + tbl->it_userspace, 1, levels); return 0; diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 469c24463247..f914f0b14e4e 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -219,7 +219,7 @@ extern struct iommu_table_group *pnv_npu_compound_attach( struct pnv_ioda_pe *pe); /* pci-ioda-tce.c */ -#define POWERNV_IOMMU_DEFAULT_LEVELS 1 +#define POWERNV_IOMMU_DEFAULT_LEVELS 2 #define POWERNV_IOMMU_MAX_LEVELS 5 extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, -- cgit v1.2.3 From 201ed7f327a17577debec52c33786d4b3259d0dc Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 18 Jul 2019 15:11:39 +1000 Subject: powerpc/powernv/ioda2: Create bigger default window with 64k IOMMU pages At the moment we create a small window only for 32bit devices, the window maps 0..2GB of the PCI space only. For other devices we either use a sketchy bypass or hardware bypass but the former can only work if the amount of RAM is no bigger than the device's DMA mask and the latter requires devices to support at least 59bit DMA. This extends the default DMA window to the maximum size possible to allow a wider DMA mask than just 32bit. The default window size is now limited by the the iommu_table::it_map allocation bitmap which is a contiguous array, 1 bit per an IOMMU page. This increases the default IOMMU page size from hard coded 4K to the system page size to allow wider DMA masks. This increases the level number to not exceed the max order allocation limit per TCE level. By the same time, this keeps minimal levels number as 2 in order to save memory. As the extended window now overlaps the 32bit MMIO region, this adds an area reservation to iommu_init_table(). After this change the default window size is 0x80000000000==1<<43 so devices limited to DMA mask smaller than the amount of system RAM can still use more than just 2GB of memory for DMA. This is an optimization and not a bug fix for DMA API usage. With the on-demand allocation of indirect TCE table levels enabled and 2 levels, the first TCE level size is just 1< Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190718051139.74787-5-aik@ozlabs.ru --- arch/powerpc/include/asm/iommu.h | 7 ++- arch/powerpc/kernel/iommu.c | 74 ++++++++++++++++++++++--------- arch/powerpc/platforms/cell/iommu.c | 2 +- arch/powerpc/platforms/pasemi/iommu.c | 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 42 +++++++++++++++--- arch/powerpc/platforms/pseries/iommu.c | 8 ++-- arch/powerpc/platforms/pseries/vio.c | 2 +- arch/powerpc/sysdev/dart_iommu.c | 2 +- 8 files changed, 100 insertions(+), 39 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 18d342b815e4..d7bf1f104c15 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -111,6 +111,8 @@ struct iommu_table { struct iommu_table_ops *it_ops; struct kref it_kref; int it_nid; + unsigned long it_reserved_start; /* Start of not-DMA-able (MMIO) area */ + unsigned long it_reserved_end; }; #define IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry) \ @@ -149,8 +151,9 @@ extern int iommu_tce_table_put(struct iommu_table *tbl); /* Initializes an iommu_table based in values set in the passed-in * structure */ -extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, - int nid); +extern struct iommu_table *iommu_init_table(struct iommu_table *tbl, + int nid, unsigned long res_start, unsigned long res_end); + #define IOMMU_TABLE_GROUP_MAX_TABLES 2 struct iommu_table_group; diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 0a67ce9f827e..e7a2b160d4c6 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -633,11 +633,54 @@ static void iommu_table_clear(struct iommu_table *tbl) #endif } +static void iommu_table_reserve_pages(struct iommu_table *tbl, + unsigned long res_start, unsigned long res_end) +{ + int i; + + WARN_ON_ONCE(res_end < res_start); + /* + * Reserve page 0 so it will not be used for any mappings. + * This avoids buggy drivers that consider page 0 to be invalid + * to crash the machine or even lose data. + */ + if (tbl->it_offset == 0) + set_bit(0, tbl->it_map); + + tbl->it_reserved_start = res_start; + tbl->it_reserved_end = res_end; + + /* Check if res_start..res_end isn't empty and overlaps the table */ + if (res_start && res_end && + (tbl->it_offset + tbl->it_size < res_start || + res_end < tbl->it_offset)) + return; + + for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i) + set_bit(i - tbl->it_offset, tbl->it_map); +} + +static void iommu_table_release_pages(struct iommu_table *tbl) +{ + int i; + + /* + * In case we have reserved the first bit, we should not emit + * the warning below. + */ + if (tbl->it_offset == 0) + clear_bit(0, tbl->it_map); + + for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i) + clear_bit(i - tbl->it_offset, tbl->it_map); +} + /* * Build a iommu_table structure. This contains a bit map which * is used to manage allocation of the tce space. */ -struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid) +struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid, + unsigned long res_start, unsigned long res_end) { unsigned long sz; static int welcomed = 0; @@ -656,13 +699,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid) tbl->it_map = page_address(page); memset(tbl->it_map, 0, sz); - /* - * Reserve page 0 so it will not be used for any mappings. - * This avoids buggy drivers that consider page 0 to be invalid - * to crash the machine or even lose data. - */ - if (tbl->it_offset == 0) - set_bit(0, tbl->it_map); + iommu_table_reserve_pages(tbl, res_start, res_end); /* We only split the IOMMU table if we have 1GB or more of space */ if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024)) @@ -714,12 +751,7 @@ static void iommu_table_free(struct kref *kref) return; } - /* - * In case we have reserved the first bit, we should not emit - * the warning below. - */ - if (tbl->it_offset == 0) - clear_bit(0, tbl->it_map); + iommu_table_release_pages(tbl); /* verify that table contains no entries */ if (!bitmap_empty(tbl->it_map, tbl->it_size)) @@ -1024,15 +1056,14 @@ int iommu_take_ownership(struct iommu_table *tbl) for (i = 0; i < tbl->nr_pools; i++) spin_lock(&tbl->pools[i].lock); - if (tbl->it_offset == 0) - clear_bit(0, tbl->it_map); + iommu_table_release_pages(tbl); if (!bitmap_empty(tbl->it_map, tbl->it_size)) { pr_err("iommu_tce: it_map is not empty"); ret = -EBUSY; - /* Restore bit#0 set by iommu_init_table() */ - if (tbl->it_offset == 0) - set_bit(0, tbl->it_map); + /* Undo iommu_table_release_pages, i.e. restore bit#0, etc */ + iommu_table_reserve_pages(tbl, tbl->it_reserved_start, + tbl->it_reserved_end); } else { memset(tbl->it_map, 0xff, sz); } @@ -1055,9 +1086,8 @@ void iommu_release_ownership(struct iommu_table *tbl) memset(tbl->it_map, 0, sz); - /* Restore bit#0 set by iommu_init_table() */ - if (tbl->it_offset == 0) - set_bit(0, tbl->it_map); + iommu_table_reserve_pages(tbl, tbl->it_reserved_start, + tbl->it_reserved_end); for (i = 0; i < tbl->nr_pools; i++) spin_unlock(&tbl->pools[i].lock); diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 16dfee29aa41..ca9ffc1c8685 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -486,7 +486,7 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np, window->table.it_size = size >> window->table.it_page_shift; window->table.it_ops = &cell_iommu_ops; - iommu_init_table(&window->table, iommu->nid); + iommu_init_table(&window->table, iommu->nid, 0, 0); pr_debug("\tioid %d\n", window->ioid); pr_debug("\tblocksize %ld\n", window->table.it_blocksize); diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c index 77fee09104f8..b500a6e47e6b 100644 --- a/arch/powerpc/platforms/pasemi/iommu.c +++ b/arch/powerpc/platforms/pasemi/iommu.c @@ -146,7 +146,7 @@ static void iommu_table_iobmap_setup(void) */ iommu_table_iobmap.it_blocksize = 4; iommu_table_iobmap.it_ops = &iommu_table_iobmap_ops; - iommu_init_table(&iommu_table_iobmap, 0); + iommu_init_table(&iommu_table_iobmap, 0, 0, 0); pr_debug(" <- %s\n", __func__); } diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index d8080558d020..ec48ea25a674 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2303,7 +2303,7 @@ found: tbl->it_ops = &pnv_ioda1_iommu_ops; pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift; pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift; - iommu_init_table(tbl, phb->hose->node); + iommu_init_table(tbl, phb->hose->node, 0, 0); if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) pnv_ioda_setup_bus_dma(pe, pe->pbus); @@ -2420,6 +2420,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) { struct iommu_table *tbl = NULL; long rc; + unsigned long res_start, res_end; /* * crashkernel= specifies the kdump kernel's maximum memory at @@ -2433,19 +2434,46 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) * DMA window can be larger than available memory, which will * cause errors later. */ - const u64 window_size = min((u64)pe->table_group.tce32_size, max_memory); + const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER - 1); - rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, - IOMMU_PAGE_SHIFT_4K, - window_size, - POWERNV_IOMMU_DEFAULT_LEVELS, false, &tbl); + /* + * We create the default window as big as we can. The constraint is + * the max order of allocation possible. The TCE table is likely to + * end up being multilevel and with on-demand allocation in place, + * the initial use is not going to be huge as the default window aims + * to support crippled devices (i.e. not fully 64bit DMAble) only. + */ + /* iommu_table::it_map uses 1 bit per IOMMU page, hence 8 */ + const u64 window_size = min((maxblock * 8) << PAGE_SHIFT, max_memory); + /* Each TCE level cannot exceed maxblock so go multilevel if needed */ + unsigned long tces_order = ilog2(window_size >> PAGE_SHIFT); + unsigned long tcelevel_order = ilog2(maxblock >> 3); + unsigned int levels = tces_order / tcelevel_order; + + if (tces_order % tcelevel_order) + levels += 1; + /* + * We try to stick to default levels (which is >1 at the moment) in + * order to save memory by relying on on-demain TCE level allocation. + */ + levels = max_t(unsigned int, levels, POWERNV_IOMMU_DEFAULT_LEVELS); + + rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, PAGE_SHIFT, + window_size, levels, false, &tbl); if (rc) { pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc); return rc; } - iommu_init_table(tbl, pe->phb->hose->node); + /* We use top part of 32bit space for MMIO so exclude it from DMA */ + res_start = 0; + res_end = 0; + if (window_size > pe->phb->ioda.m32_pci_base) { + res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift; + res_end = min(window_size, SZ_4G) >> tbl->it_page_shift; + } + iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end); rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl); if (rc) { diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 889dc2e44b89..42fb03253334 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -609,7 +609,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) iommu_table_setparms(pci->phb, dn, tbl); tbl->it_ops = &iommu_table_pseries_ops; - iommu_init_table(tbl, pci->phb->node); + iommu_init_table(tbl, pci->phb->node, 0, 0); /* Divide the rest (1.75GB) among the children */ pci->phb->dma_window_size = 0x80000000ul; @@ -690,7 +690,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) iommu_table_setparms_lpar(ppci->phb, pdn, tbl, ppci->table_group, dma_window); tbl->it_ops = &iommu_table_lpar_multi_ops; - iommu_init_table(tbl, ppci->phb->node); + iommu_init_table(tbl, ppci->phb->node, 0, 0); iommu_register_group(ppci->table_group, pci_domain_nr(bus), 0); pr_debug(" created table: %p\n", ppci->table_group); @@ -719,7 +719,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) tbl = PCI_DN(dn)->table_group->tables[0]; iommu_table_setparms(phb, dn, tbl); tbl->it_ops = &iommu_table_pseries_ops; - iommu_init_table(tbl, phb->node); + iommu_init_table(tbl, phb->node, 0, 0); set_iommu_table_base(&dev->dev, tbl); return; } @@ -1169,7 +1169,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) iommu_table_setparms_lpar(pci->phb, pdn, tbl, pci->table_group, dma_window); tbl->it_ops = &iommu_table_lpar_multi_ops; - iommu_init_table(tbl, pci->phb->node); + iommu_init_table(tbl, pci->phb->node, 0, 0); iommu_register_group(pci->table_group, pci_domain_nr(pci->phb->bus), 0); pr_debug(" created table: %p\n", pci->table_group); diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 6601b9d404dc..115934f83935 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -1191,7 +1191,7 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev) else tbl->it_ops = &iommu_table_pseries_ops; - return iommu_init_table(tbl, -1); + return iommu_init_table(tbl, -1, 0, 0); } /** diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c index 21a1fae0714e..6b4a34b36d98 100644 --- a/arch/powerpc/sysdev/dart_iommu.c +++ b/arch/powerpc/sysdev/dart_iommu.c @@ -344,7 +344,7 @@ static void iommu_table_dart_setup(void) iommu_table_dart.it_index = 0; iommu_table_dart.it_blocksize = 1; iommu_table_dart.it_ops = &iommu_dart_ops; - iommu_init_table(&iommu_table_dart, -1); + iommu_init_table(&iommu_table_dart, -1, 0, 0); /* Reserve the last page of the DART to avoid possible prefetch * past the DART mapped area -- cgit v1.2.3 From c3e0dbd7f780a58c4695f1cd8fc8afde80376737 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Wed, 14 Aug 2019 17:47:52 +0200 Subject: powerpc/xmon: Check for HV mode when dumping XIVE info from OPAL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the xmon 'dx' command calls OPAL to dump the XIVE state in the OPAL logs and also outputs some of the fields of the internal XIVE structures in Linux. The OPAL calls can only be done on baremetal (PowerNV) and they crash a pseries machine. Fix by checking the hypervisor feature of the CPU. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190814154754.23682-2-clg@kaod.org --- arch/powerpc/xmon/xmon.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 14e56c25879f..25d4adccf750 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2534,13 +2534,16 @@ static void dump_pacas(void) static void dump_one_xive(int cpu) { unsigned int hwid = get_hard_smp_processor_id(cpu); - - opal_xive_dump(XIVE_DUMP_TM_HYP, hwid); - opal_xive_dump(XIVE_DUMP_TM_POOL, hwid); - opal_xive_dump(XIVE_DUMP_TM_OS, hwid); - opal_xive_dump(XIVE_DUMP_TM_USER, hwid); - opal_xive_dump(XIVE_DUMP_VP, hwid); - opal_xive_dump(XIVE_DUMP_EMU_STATE, hwid); + bool hv = cpu_has_feature(CPU_FTR_HVMODE); + + if (hv) { + opal_xive_dump(XIVE_DUMP_TM_HYP, hwid); + opal_xive_dump(XIVE_DUMP_TM_POOL, hwid); + opal_xive_dump(XIVE_DUMP_TM_OS, hwid); + opal_xive_dump(XIVE_DUMP_TM_USER, hwid); + opal_xive_dump(XIVE_DUMP_VP, hwid); + opal_xive_dump(XIVE_DUMP_EMU_STATE, hwid); + } if (setjmp(bus_error_jmp) != 0) { catch_memory_errors = 0; -- cgit v1.2.3 From b4868ff55d082bc66b0c287a41e4888f6d3e5f87 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Wed, 14 Aug 2019 17:47:53 +0200 Subject: powerpc/xive: Fix dump of XIVE interrupt under pseries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xmon 'dxi' command calls OPAL to query the XIVE configuration of a interrupt. This can only be done on baremetal (PowerNV) and it will crash a pseries machine. Introduce a new XIVE get_irq_config() operation which implements a different query depending on the platform, PowerNV or pseries, and modify xmon to use a top level wrapper. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190814154754.23682-3-clg@kaod.org --- arch/powerpc/include/asm/xive.h | 2 ++ arch/powerpc/sysdev/xive/common.c | 7 +++++ arch/powerpc/sysdev/xive/native.c | 15 ++++++++++ arch/powerpc/sysdev/xive/spapr.c | 51 ++++++++++++++++++++++++++++++++ arch/powerpc/sysdev/xive/xive-internal.h | 2 ++ arch/powerpc/xmon/xmon.c | 12 ++++---- 6 files changed, 83 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index efb0e597b272..967d6ab3c977 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -99,6 +99,8 @@ extern void xive_flush_interrupt(void); /* xmon hook */ extern void xmon_xive_do_dump(int cpu); +extern int xmon_xive_get_irq_config(u32 irq, u32 *target, u8 *prio, + u32 *sw_irq); /* APIs used by KVM */ extern u32 xive_native_default_eq_shift(void); diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 6b973b7cdd8a..f75a660365e5 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -257,6 +257,13 @@ notrace void xmon_xive_do_dump(int cpu) } #endif } + +int xmon_xive_get_irq_config(u32 irq, u32 *target, u8 *prio, + u32 *sw_irq) +{ + return xive_ops->get_irq_config(irq, target, prio, sw_irq); +} + #endif /* CONFIG_XMON */ static unsigned int xive_get_irq(void) diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 2f26b74f6cfa..4b61e44f0171 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -111,6 +111,20 @@ int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq) } EXPORT_SYMBOL_GPL(xive_native_configure_irq); +static int xive_native_get_irq_config(u32 hw_irq, u32 *target, u8 *prio, + u32 *sw_irq) +{ + s64 rc; + __be64 vp; + __be32 lirq; + + rc = opal_xive_get_irq_config(hw_irq, &vp, prio, &lirq); + + *target = be64_to_cpu(vp); + *sw_irq = be32_to_cpu(lirq); + + return rc == 0 ? 0 : -ENXIO; +} /* This can be called multiple time to change a queue configuration */ int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, @@ -442,6 +456,7 @@ EXPORT_SYMBOL_GPL(xive_native_sync_queue); static const struct xive_ops xive_native_ops = { .populate_irq_data = xive_native_populate_irq_data, .configure_irq = xive_native_configure_irq, + .get_irq_config = xive_native_get_irq_config, .setup_queue = xive_native_setup_queue, .cleanup_queue = xive_native_cleanup_queue, .match = xive_native_match, diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c index 52198131c75e..33c10749edec 100644 --- a/arch/powerpc/sysdev/xive/spapr.c +++ b/arch/powerpc/sysdev/xive/spapr.c @@ -215,6 +215,38 @@ static long plpar_int_set_source_config(unsigned long flags, return 0; } +static long plpar_int_get_source_config(unsigned long flags, + unsigned long lisn, + unsigned long *target, + unsigned long *prio, + unsigned long *sw_irq) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + long rc; + + pr_devel("H_INT_GET_SOURCE_CONFIG flags=%lx lisn=%lx\n", flags, lisn); + + do { + rc = plpar_hcall(H_INT_GET_SOURCE_CONFIG, retbuf, flags, lisn, + target, prio, sw_irq); + } while (plpar_busy_delay(rc)); + + if (rc) { + pr_err("H_INT_GET_SOURCE_CONFIG lisn=%ld failed %ld\n", + lisn, rc); + return rc; + } + + *target = retbuf[0]; + *prio = retbuf[1]; + *sw_irq = retbuf[2]; + + pr_devel("H_INT_GET_SOURCE_CONFIG target=%lx prio=%lx sw_irq=%lx\n", + retbuf[0], retbuf[1], retbuf[2]); + + return 0; +} + static long plpar_int_get_queue_info(unsigned long flags, unsigned long target, unsigned long priority, @@ -398,6 +430,24 @@ static int xive_spapr_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq) return rc == 0 ? 0 : -ENXIO; } +static int xive_spapr_get_irq_config(u32 hw_irq, u32 *target, u8 *prio, + u32 *sw_irq) +{ + long rc; + unsigned long h_target; + unsigned long h_prio; + unsigned long h_sw_irq; + + rc = plpar_int_get_source_config(0, hw_irq, &h_target, &h_prio, + &h_sw_irq); + + *target = h_target; + *prio = h_prio; + *sw_irq = h_sw_irq; + + return rc == 0 ? 0 : -ENXIO; +} + /* This can be called multiple time to change a queue configuration */ static int xive_spapr_configure_queue(u32 target, struct xive_q *q, u8 prio, __be32 *qpage, u32 order) @@ -590,6 +640,7 @@ static void xive_spapr_sync_source(u32 hw_irq) static const struct xive_ops xive_spapr_ops = { .populate_irq_data = xive_spapr_populate_irq_data, .configure_irq = xive_spapr_configure_irq, + .get_irq_config = xive_spapr_get_irq_config, .setup_queue = xive_spapr_setup_queue, .cleanup_queue = xive_spapr_cleanup_queue, .match = xive_spapr_match, diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h index 211725dbf364..59cd366e7933 100644 --- a/arch/powerpc/sysdev/xive/xive-internal.h +++ b/arch/powerpc/sysdev/xive/xive-internal.h @@ -33,6 +33,8 @@ struct xive_cpu { struct xive_ops { int (*populate_irq_data)(u32 hw_irq, struct xive_irq_data *data); int (*configure_irq)(u32 hw_irq, u32 target, u8 prio, u32 sw_irq); + int (*get_irq_config)(u32 hw_irq, u32 *target, u8 *prio, + u32 *sw_irq); int (*setup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio); void (*cleanup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio); void (*setup_cpu)(unsigned int cpu, struct xive_cpu *xc); diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 25d4adccf750..4ea53e05053f 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2574,14 +2574,14 @@ static void dump_all_xives(void) static void dump_one_xive_irq(u32 num) { - s64 rc; - __be64 vp; + int rc; + u32 target; u8 prio; - __be32 lirq; + u32 lirq; - rc = opal_xive_get_irq_config(num, &vp, &prio, &lirq); - xmon_printf("IRQ 0x%x config: vp=0x%llx prio=%d lirq=0x%x (rc=%lld)\n", - num, be64_to_cpu(vp), prio, be32_to_cpu(lirq), rc); + rc = xmon_xive_get_irq_config(num, &target, &prio, &lirq); + xmon_printf("IRQ 0x%08x : target=0x%x prio=%d lirq=0x%x (rc=%d)\n", + num, target, prio, lirq, rc); } static void dump_xives(void) -- cgit v1.2.3 From 39f14e79b15a40709ef177bc4c07f193b6d3bce3 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Wed, 14 Aug 2019 17:47:54 +0200 Subject: powerpc/xmon: Add a dump of all XIVE interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modify the xmon 'dxi' command to query all interrupts if no IRQ number is specified. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190814154754.23682-4-clg@kaod.org --- arch/powerpc/xmon/xmon.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 4ea53e05053f..dc9832e06256 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2584,6 +2584,25 @@ static void dump_one_xive_irq(u32 num) num, target, prio, lirq, rc); } +static void dump_all_xive_irq(void) +{ + unsigned int i; + struct irq_desc *desc; + + for_each_irq_desc(i, desc) { + struct irq_data *d = irq_desc_get_irq_data(desc); + unsigned int hwirq; + + if (!d) + continue; + + hwirq = (unsigned int)irqd_to_hwirq(d); + /* IPIs are special (HW number 0) */ + if (hwirq) + dump_one_xive_irq(hwirq); + } +} + static void dump_xives(void) { unsigned long num; @@ -2601,6 +2620,8 @@ static void dump_xives(void) } else if (c == 'i') { if (scanhex(&num)) dump_one_xive_irq(num); + else + dump_all_xive_irq(); return; } -- cgit v1.2.3 From 7c7a532ba3fc51bf9527d191fb410786c1fdc73c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 14 Aug 2019 12:36:09 +0000 Subject: powerpc/ptdump: Fix addresses display on PPC32 Commit 453d87f6a8ae ("powerpc/mm: Warn if W+X pages found on boot") wrongly changed KERN_VIRT_START from 0 to PAGE_OFFSET, leading to a shift in the displayed addresses. Lets revert that change to resync walk_pagetables()'s addr val and pgd_t pointer for PPC32. Fixes: 453d87f6a8ae ("powerpc/mm: Warn if W+X pages found on boot") Cc: stable@vger.kernel.org # v5.2+ Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/eb4d626514e22f85814830012642329018ef6af9.1565786091.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/ptdump/ptdump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 6a88a9f585d4..3ad64fc11419 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -27,7 +27,7 @@ #include "ptdump.h" #ifdef CONFIG_PPC32 -#define KERN_VIRT_START PAGE_OFFSET +#define KERN_VIRT_START 0 #endif /* -- cgit v1.2.3 From e033829d2aaad85cb7cf46986c3be0bcc72f791e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 14 Aug 2019 12:36:10 +0000 Subject: powerpc/ptdump: fix walk_pagetables() address mismatch walk_pagetables() always walk the entire pgdir from address 0 but considers PAGE_OFFSET or KERN_VIRT_START as the starting address of the walk, resulting in a possible mismatch in the displayed addresses. Ex: on PPC32, when KERN_VIRT_START was locally defined as PAGE_OFFSET, ptdump displayed 0x80000000 instead of 0xc0000000 for the first kernel page, because 0xc0000000 + 0xc0000000 = 0x80000000 Start the walk at st->start_address instead of starting at 0. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/5aa2ac513295f594cce8ddb1c649f61947bd063d.1565786091.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/ptdump/ptdump.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 3ad64fc11419..74ff2bff4ea0 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -299,17 +299,15 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) static void walk_pagetables(struct pg_state *st) { - pgd_t *pgd = pgd_offset_k(0UL); unsigned int i; - unsigned long addr; - - addr = st->start_address; + unsigned long addr = st->start_address & PGDIR_MASK; + pgd_t *pgd = pgd_offset_k(addr); /* * Traverse the linux pagetable structure and dump pages that are in * the hash pagetable. */ - for (i = 0; i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) { + for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) { if (!pgd_none(*pgd) && !pgd_is_leaf(*pgd)) /* pgd exists */ walk_pud(st, pgd, addr); -- cgit v1.2.3 From 82242352728125ecd86cea4631b89a5e6be0d0fd Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 14 Aug 2019 12:36:11 +0000 Subject: powerpc/ptdump: drop dummy KERN_VIRT_START on PPC32 PPC32 doesn't have KERN_VIRT_START. Make PAGE_OFFSET the default starting address for the dump, and drop the dummy definition of KERN_VIRT_START. Only use KERN_VIRT_START for non radix PPC64. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/334632b1df4775b0ccf3bdc8d6b201d14e3daedd.1565786091.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/ptdump/ptdump.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 74ff2bff4ea0..9a2186c133e6 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -26,10 +26,6 @@ #include "ptdump.h" -#ifdef CONFIG_PPC32 -#define KERN_VIRT_START 0 -#endif - /* * To visualise what is happening, * @@ -362,12 +358,13 @@ static int ptdump_show(struct seq_file *m, void *v) struct pg_state st = { .seq = m, .marker = address_markers, + .start_address = PAGE_OFFSET, }; - if (radix_enabled()) - st.start_address = PAGE_OFFSET; - else +#ifdef CONFIG_PPC64 + if (!radix_enabled()) st.start_address = KERN_VIRT_START; +#endif /* Traverse kernel page tables */ walk_pagetables(&st); @@ -405,12 +402,13 @@ void ptdump_check_wx(void) .seq = NULL, .marker = address_markers, .check_wx = true, + .start_address = PAGE_OFFSET, }; - if (radix_enabled()) - st.start_address = PAGE_OFFSET; - else +#ifdef CONFIG_PPC64 + if (!radix_enabled()) st.start_address = KERN_VIRT_START; +#endif walk_pagetables(&st); -- cgit v1.2.3 From f3a2ac05894b57e78cd6cc0adba1de642034d940 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 14 Aug 2019 12:36:12 +0000 Subject: powerpc/ptdump: get out of note_prot_wx() when CONFIG_PPC_DEBUG_WX is not selected. When CONFIG_PPC_DEBUG_WX, note_prot_wx() is useless. Get out of it early and inconditionnally in that case, so that GCC can kick all the code out. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/ff6c8f631bd4ce3a10e0cc241eb569816187bc20.1565786091.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/ptdump/ptdump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 9a2186c133e6..ab6a572202b4 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -177,7 +177,7 @@ static void dump_addr(struct pg_state *st, unsigned long addr) static void note_prot_wx(struct pg_state *st, unsigned long addr) { - if (!st->check_wx) + if (!IS_ENABLED(CONFIG_PPC_DEBUG_WX) || !st->check_wx) return; if (!((st->current_flags & pgprot_val(PAGE_KERNEL_X)) == pgprot_val(PAGE_KERNEL_X))) -- cgit v1.2.3 From 65e701b2d2a8593c44c8855aee2e087be7e11e75 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 14 Aug 2019 12:36:13 +0000 Subject: powerpc/ptdump: drop non vital #ifdefs hashpagetable.c is only compiled when CONFIG_PPC_BOOK3S_64 is defined, so drop the test and its 'else' branch. Use IS_ENABLED(CONFIG_PPC_PSERIES) instead of #ifdef, this allows the code to be checked at any build. It is still optimised out by GCC. Use IS_ENABLED(CONFIG_PPC_64K_PAGES) instead of #ifdef. Use IS_ENABLED(CONFIG_SPARSEMEN_VMEMMAP) instead of #ifdef. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c8998ed32e4e3954b56a8dacecfe43319a2a0483.1565786091.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/plpar_wrappers.h | 6 ++++++ arch/powerpc/mm/ptdump/hashpagetable.c | 24 +++++++++--------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h index cff5a411e595..4497c8afb573 100644 --- a/arch/powerpc/include/asm/plpar_wrappers.h +++ b/arch/powerpc/include/asm/plpar_wrappers.h @@ -340,6 +340,12 @@ static inline long plpar_set_ciabr(unsigned long ciabr) { return 0; } + +static inline long plpar_pte_read_4(unsigned long flags, unsigned long ptex, + unsigned long *ptes) +{ + return 0; +} #endif /* CONFIG_PPC_PSERIES */ #endif /* _ASM_POWERPC_PLPAR_WRAPPERS_H */ diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c index 72f0e4a3d839..a07278027c6f 100644 --- a/arch/powerpc/mm/ptdump/hashpagetable.c +++ b/arch/powerpc/mm/ptdump/hashpagetable.c @@ -237,7 +237,6 @@ static int native_find(unsigned long ea, int psize, bool primary, u64 *v, u64 return -1; } -#ifdef CONFIG_PPC_PSERIES static int pseries_find(unsigned long ea, int psize, bool primary, u64 *v, u64 *r) { struct hash_pte ptes[4]; @@ -274,7 +273,6 @@ static int pseries_find(unsigned long ea, int psize, bool primary, u64 *v, u64 * } return -1; } -#endif static void decode_r(int bps, unsigned long r, unsigned long *rpn, int *aps, unsigned long *lp_bits) @@ -316,10 +314,9 @@ static void decode_r(int bps, unsigned long r, unsigned long *rpn, int *aps, static int base_hpte_find(unsigned long ea, int psize, bool primary, u64 *v, u64 *r) { -#ifdef CONFIG_PPC_PSERIES - if (firmware_has_feature(FW_FEATURE_LPAR)) + if (IS_ENABLED(CONFIG_PPC_PSERIES) && firmware_has_feature(FW_FEATURE_LPAR)) return pseries_find(ea, psize, primary, v, r); -#endif + return native_find(ea, psize, primary, v, r); } @@ -386,12 +383,13 @@ static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start) psize = mmu_vmalloc_psize; else psize = mmu_io_psize; -#ifdef CONFIG_PPC_64K_PAGES + /* check for secret 4K mappings */ - if (((pteval & H_PAGE_COMBO) == H_PAGE_COMBO) || - ((pteval & H_PAGE_4K_PFN) == H_PAGE_4K_PFN)) + if (IS_ENABLED(CONFIG_PPC_64K_PAGES) && + ((pteval & H_PAGE_COMBO) == H_PAGE_COMBO || + (pteval & H_PAGE_4K_PFN) == H_PAGE_4K_PFN)) psize = mmu_io_psize; -#endif + /* check for hashpte */ status = hpte_find(st, addr, psize); @@ -469,9 +467,10 @@ static void walk_linearmapping(struct pg_state *st) static void walk_vmemmap(struct pg_state *st) { -#ifdef CONFIG_SPARSEMEM_VMEMMAP struct vmemmap_backing *ptr = vmemmap_list; + if (!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) + return; /* * Traverse the vmemmaped memory and dump pages that are in the hash * pagetable. @@ -481,7 +480,6 @@ static void walk_vmemmap(struct pg_state *st) ptr = ptr->list; } seq_puts(st->seq, "---[ vmemmap end ]---\n"); -#endif } static void populate_markers(void) @@ -495,11 +493,7 @@ static void populate_markers(void) address_markers[6].start_address = PHB_IO_END; address_markers[7].start_address = IOREMAP_BASE; address_markers[8].start_address = IOREMAP_END; -#ifdef CONFIG_PPC_BOOK3S_64 address_markers[9].start_address = H_VMEMMAP_START; -#else - address_markers[9].start_address = VMEMMAP_BASE; -#endif } static int ptdump_show(struct seq_file *m, void *v) -- cgit v1.2.3 From 658d029df0bce6472c94b724ca54d74bc6659c2e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 28 Jun 2019 15:55:52 +0000 Subject: powerpc/hw_breakpoint: move instruction stepping out of hw_breakpoint_handler() On 8xx, breakpoints stop after executing the instruction, so stepping/emulation is not needed. Move it into a sub-function and remove the #ifdefs. Signed-off-by: Christophe Leroy Reviewed-by: Ravi Bangoria Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/f8cdc3f1c66ad3c43ebc568abcc6c39ed4676284.1561737231.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/hw_breakpoint.c | 60 ++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index c8d1fa2e9d53..28ad3171bb82 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -198,15 +198,43 @@ void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs) /* * Handle debug exception notifications. */ +static bool stepping_handler(struct pt_regs *regs, struct perf_event *bp, + unsigned long addr) +{ + int stepped; + unsigned int instr; + + /* Do not emulate user-space instructions, instead single-step them */ + if (user_mode(regs)) { + current->thread.last_hit_ubp = bp; + regs->msr |= MSR_SE; + return false; + } + + stepped = 0; + instr = 0; + if (!__get_user_inatomic(instr, (unsigned int *)regs->nip)) + stepped = emulate_step(regs, instr); + + /* + * emulate_step() could not execute it. We've failed in reliably + * handling the hw-breakpoint. Unregister it and throw a warning + * message to let the user know about it. + */ + if (!stepped) { + WARN(1, "Unable to handle hardware breakpoint. Breakpoint at " + "0x%lx will be disabled.", addr); + perf_event_disable_inatomic(bp); + return false; + } + return true; +} + int hw_breakpoint_handler(struct die_args *args) { int rc = NOTIFY_STOP; struct perf_event *bp; struct pt_regs *regs = args->regs; -#ifndef CONFIG_PPC_8xx - int stepped = 1; - unsigned int instr; -#endif struct arch_hw_breakpoint *info; unsigned long dar = regs->dar; @@ -251,31 +279,9 @@ int hw_breakpoint_handler(struct die_args *args) (dar - bp->attr.bp_addr < bp->attr.bp_len))) info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; -#ifndef CONFIG_PPC_8xx - /* Do not emulate user-space instructions, instead single-step them */ - if (user_mode(regs)) { - current->thread.last_hit_ubp = bp; - regs->msr |= MSR_SE; + if (!IS_ENABLED(CONFIG_PPC_8xx) && !stepping_handler(regs, bp, info->address)) goto out; - } - - stepped = 0; - instr = 0; - if (!__get_user_inatomic(instr, (unsigned int *) regs->nip)) - stepped = emulate_step(regs, instr); - /* - * emulate_step() could not execute it. We've failed in reliably - * handling the hw-breakpoint. Unregister it and throw a warning - * message to let the user know about it. - */ - if (!stepped) { - WARN(1, "Unable to handle hardware breakpoint. Breakpoint at " - "0x%lx will be disabled.", info->address); - perf_event_disable_inatomic(bp); - goto out; - } -#endif /* * As a policy, the callback is invoked in a 'trigger-after-execute' * fashion -- cgit v1.2.3 From 45ff3c55958542c3b76075d59741297b8cb31cbb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 9 Aug 2019 14:58:09 +0000 Subject: powerpc/kasan: Fix parallel loading of modules. Parallel loading of modules may lead to bad setup of shadow page table entries. First, lets align modules so that two modules never share the same shadow page. Second, ensure that two modules cannot allocate two page tables for the same PMD entry at the same time. This is done by using init_mm.page_table_lock in the same way as __pte_alloc_kernel() Fixes: 2edb16efc899 ("powerpc/32: Add KASAN support") Cc: stable@vger.kernel.org # v5.2+ Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c97284f912128cbc3f2fe09d68e90e65fb3e6026.1565361876.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/kasan/kasan_init_32.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index 74f4555a62ba..3282f8a2aecc 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -46,7 +47,19 @@ static int __ref kasan_init_shadow_page_tables(unsigned long k_start, unsigned l kasan_populate_pte(new, PAGE_READONLY); else kasan_populate_pte(new, PAGE_KERNEL_RO); - pmd_populate_kernel(&init_mm, pmd, new); + + smp_wmb(); /* See comment in __pte_alloc */ + + spin_lock(&init_mm.page_table_lock); + /* Has another populated it ? */ + if (likely((void *)pmd_page_vaddr(*pmd) == kasan_early_shadow_pte)) { + pmd_populate_kernel(&init_mm, pmd, new); + new = NULL; + } + spin_unlock(&init_mm.page_table_lock); + + if (new && slab_is_available()) + pte_free_kernel(&init_mm, new); } return 0; } @@ -137,7 +150,11 @@ void __init kasan_init(void) #ifdef CONFIG_MODULES void *module_alloc(unsigned long size) { - void *base = vmalloc_exec(size); + void *base; + + base = __vmalloc_node_range(size, MODULE_ALIGN, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, + NUMA_NO_NODE, __builtin_return_address(0)); if (!base) return NULL; -- cgit v1.2.3 From 663c0c9496a69f80011205ba3194049bcafd681d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 9 Aug 2019 14:58:10 +0000 Subject: powerpc/kasan: Fix shadow area set up for modules. When loading modules, from time to time an Oops is encountered during the init of shadow area for globals. This is due to the last page not always being mapped depending on the exact distance between the start and the end of the shadow area and the alignment with the page addresses. Fix this by aligning the starting address with the page address. Fixes: 2edb16efc899 ("powerpc/32: Add KASAN support") Cc: stable@vger.kernel.org # v5.2+ Reported-by: Erhard F. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/4f887e9b77d0d725cbb52035c7ece485c1c5fc14.1565361881.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/kasan/kasan_init_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index 3282f8a2aecc..802387b231ad 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -87,7 +87,7 @@ static int __ref kasan_init_region(void *start, size_t size) if (!slab_is_available()) block = memblock_alloc(k_end - k_start, PAGE_SIZE); - for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE) { + for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) { pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur); void *va = block ? block + k_cur - k_start : kasan_get_one_page(); pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL); -- cgit v1.2.3 From 38a0d0cdb46d3f91534e5b9839ec2d67be14c59d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 14 Aug 2019 09:25:52 +0000 Subject: powerpc/futex: Fix warning: 'oldval' may be used uninitialized in this function We see warnings such as: kernel/futex.c: In function 'do_futex': kernel/futex.c:1676:17: warning: 'oldval' may be used uninitialized in this function [-Wmaybe-uninitialized] return oldval == cmparg; ^ kernel/futex.c:1651:6: note: 'oldval' was declared here int oldval, ret; ^ This is because arch_futex_atomic_op_inuser() only sets *oval if ret is 0 and GCC doesn't see that it will only use it when ret is 0. Anyway, the non-zero ret path is an error path that won't suffer from setting *oval, and as *oval is a local var in futex_atomic_op_inuser() it will have no impact. Signed-off-by: Christophe Leroy [mpe: reword change log slightly] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/86b72f0c134367b214910b27b9a6dd3321af93bb.1565774657.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/futex.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h index 3a6aa57b9d90..eea28ca679db 100644 --- a/arch/powerpc/include/asm/futex.h +++ b/arch/powerpc/include/asm/futex.h @@ -60,8 +60,7 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, pagefault_enable(); - if (!ret) - *oval = oldval; + *oval = oldval; prevent_write_to_user(uaddr, sizeof(*uaddr)); return ret; -- cgit v1.2.3 From 9d6d712fbf7766f21c838940eebcd7b4d476c5e6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 14 Aug 2019 10:02:20 +0000 Subject: powerpc/32s: Fix boot failure with DEBUG_PAGEALLOC without KASAN. When KASAN is selected, the definitive hash table has to be set up later, but there is already an early temporary one. When KASAN is not selected, there is no early hash table, so the setup of the definitive hash table cannot be delayed. Fixes: 72f208c6a8f7 ("powerpc/32s: move hash code patching out of MMU_init_hw()") Cc: stable@vger.kernel.org # v5.2+ Reported-by: Jonathan Neuschafer Tested-by: Jonathan Neuschafer Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b7860c5e1e784d6b96ba67edf47dd6cbc2e78ab6.1565776892.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/head_32.S | 2 ++ arch/powerpc/mm/book3s32/mmu.c | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index f255e22184b4..c8b4f7ed318c 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -897,9 +897,11 @@ start_here: bl machine_init bl __save_cpu_setup bl MMU_init +#ifdef CONFIG_KASAN BEGIN_MMU_FTR_SECTION bl MMU_init_hw_patch END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) +#endif /* * Go back to running unmapped so we can load up new values diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index e249fbf6b9c3..8d68f03bf5a4 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -358,6 +358,15 @@ void __init MMU_init_hw(void) hash_mb2 = hash_mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg; if (lg_n_hpteg > 16) hash_mb2 = 16 - LG_HPTEG_SIZE; + + /* + * When KASAN is selected, there is already an early temporary hash + * table and the switch to the final hash table is done later. + */ + if (IS_ENABLED(CONFIG_KASAN)) + return; + + MMU_init_hw_patch(); } void __init MMU_init_hw_patch(void) -- cgit v1.2.3 From ad628a34ec4e3558bf838195f60bbaa4c2b68f2a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 14 Aug 2019 14:36:10 +0000 Subject: powerpc/mm: don't display empty early ioremap area On the 8xx, the layout displayed at boot is: [ 0.000000] Memory: 121856K/131072K available (5728K kernel code, 592K rwdata, 1248K rodata, 560K init, 448K bss, 9216K reserved, 0K cma-reserved) [ 0.000000] Kernel virtual memory layout: [ 0.000000] * 0xffefc000..0xffffc000 : fixmap [ 0.000000] * 0xffefc000..0xffefc000 : early ioremap [ 0.000000] * 0xc9000000..0xffefc000 : vmalloc & ioremap [ 0.000000] SLUB: HWalign=16, Order=0-3, MinObjects=0, CPUs=1, Nodes=1 Remove display of an empty early ioremap. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/f6267226038cb25a839b567319e240576e3f8565.1565793287.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/mem.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 9191a66b3bc5..828c535e8fd2 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -306,8 +306,9 @@ void __init mem_init(void) pr_info(" * 0x%08lx..0x%08lx : consistent mem\n", IOREMAP_TOP, IOREMAP_TOP + CONFIG_CONSISTENT_SIZE); #endif /* CONFIG_NOT_COHERENT_CACHE */ - pr_info(" * 0x%08lx..0x%08lx : early ioremap\n", - ioremap_bot, IOREMAP_TOP); + if (ioremap_bot != IOREMAP_TOP) + pr_info(" * 0x%08lx..0x%08lx : early ioremap\n", + ioremap_bot, IOREMAP_TOP); pr_info(" * 0x%08lx..0x%08lx : vmalloc & ioremap\n", VMALLOC_START, VMALLOC_END); #endif /* CONFIG_PPC32 */ -- cgit v1.2.3 From d9642117914c9d3f800b3bacc19d7e388b04edb4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 16 Aug 2019 05:41:40 +0000 Subject: powerpc/mm: define empty update_mmu_cache() as static inline Only BOOK3S and FSL_BOOK3E have a usefull update_mmu_cache(). For the others, just define it static inline. In the meantime, simplify the FSL_BOOK3E related ifdef as book3e_hugetlb_preload() only exists when CONFIG_PPC_FSL_BOOK3E is selected. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/668aba4db6b9af6d8a151174e11a4289f1a6bbcd.1565933217.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/book3s/pgtable.h | 11 +++++++++++ arch/powerpc/include/asm/nohash/pgtable.h | 13 +++++++++++++ arch/powerpc/include/asm/pgtable.h | 12 ------------ arch/powerpc/mm/mem.c | 11 +++++++---- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/pgtable.h b/arch/powerpc/include/asm/book3s/pgtable.h index 6436b65ac7bc..0e1263455d73 100644 --- a/arch/powerpc/include/asm/book3s/pgtable.h +++ b/arch/powerpc/include/asm/book3s/pgtable.h @@ -26,5 +26,16 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot); #define __HAVE_PHYS_MEM_ACCESS_PROT +/* + * This gets called at the end of handling a page fault, when + * the kernel has put a new PTE into the page table for the process. + * We use it to ensure coherency between the i-cache and d-cache + * for the page which has just been mapped in. + * On machines which use an MMU hash table, we use this to put a + * corresponding HPTE into the hash table ahead of time, instead of + * waiting for the inevitable extra hash-table miss exception. + */ +void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 1ca1c1864b32..7fed9dc0f147 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -293,5 +293,18 @@ static inline int pgd_huge(pgd_t pgd) #define is_hugepd(hpd) (hugepd_ok(hpd)) #endif +/* + * This gets called at the end of handling a page fault, when + * the kernel has put a new PTE into the page table for the process. + * We use it to ensure coherency between the i-cache and d-cache + * for the page which has just been mapped in. + */ +#if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_HUGETLB_PAGE) +void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); +#else +static inline +void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) {} +#endif + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index c58ba7963688..c70916a7865a 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -77,18 +77,6 @@ extern void paging_init(void); #include - -/* - * This gets called at the end of handling a page fault, when - * the kernel has put a new PTE into the page table for the process. - * We use it to ensure coherency between the i-cache and d-cache - * for the page which has just been mapped in. - * On machines which use an MMU hash table, we use this to put a - * corresponding HPTE into the hash table ahead of time, instead of - * waiting for the inevitable extra hash-table miss exception. - */ -extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t *); - #ifndef CONFIG_TRANSPARENT_HUGEPAGE #define pmd_large(pmd) 0 #endif diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 828c535e8fd2..ee8225cd2cd2 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -415,10 +415,10 @@ EXPORT_SYMBOL(flush_icache_user_range); * * This must always be called with the pte lock held. */ +#ifdef CONFIG_PPC_BOOK3S void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { -#ifdef CONFIG_PPC_BOOK3S /* * We don't need to worry about _PAGE_PRESENT here because we are * called with either mm->page_table_lock held or ptl lock held @@ -456,13 +456,16 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, } hash_preload(vma->vm_mm, address, is_exec, trap); +} #endif /* CONFIG_PPC_BOOK3S */ -#if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \ - && defined(CONFIG_HUGETLB_PAGE) +#if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_HUGETLB_PAGE) +void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep) +{ if (is_vm_hugetlb_page(vma)) book3e_hugetlb_preload(vma, address, *ptep); -#endif } +#endif /* * System memory should not be in /proc/iomem but various tools expect it -- cgit v1.2.3 From 4c1616ef036ffaaea95a29d7b6abf9d3e8eb9d92 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 16 Aug 2019 05:41:41 +0000 Subject: powerpc/mm: move FSL_BOOK3 version of update_mmu_cache() Move FSL_BOOK3E version of update_mmu_cache() at the same place as book3e_hugetlb_preload() as update_mmu_cache() is the only user of book3e_hugetlb_preload(). Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/4d69fdc86df9c74adc71a60331a86f6afb8b5e9e.1565933217.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/hugetlb.h | 3 --- arch/powerpc/mm/mem.c | 8 -------- arch/powerpc/mm/nohash/book3e_hugetlbpage.c | 16 ++++++++++++++-- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 20a101046cff..bd6504c28c2f 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -31,9 +31,6 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, return 0; } -void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, - pte_t pte); - #define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index ee8225cd2cd2..58d11362eee4 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -458,14 +458,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, hash_preload(vma->vm_mm, address, is_exec, trap); } #endif /* CONFIG_PPC_BOOK3S */ -#if defined(CONFIG_PPC_FSL_BOOK3E) && defined(CONFIG_HUGETLB_PAGE) -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, - pte_t *ptep) -{ - if (is_vm_hugetlb_page(vma)) - book3e_hugetlb_preload(vma, address, *ptep); -} -#endif /* * System memory should not be in /proc/iomem but various tools expect it diff --git a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c index 61915f4d3c7f..8b88be91b622 100644 --- a/arch/powerpc/mm/nohash/book3e_hugetlbpage.c +++ b/arch/powerpc/mm/nohash/book3e_hugetlbpage.c @@ -122,8 +122,8 @@ static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid) return found; } -void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, - pte_t pte) +static void +book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte) { unsigned long mas1, mas2; u64 mas7_3; @@ -183,6 +183,18 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, local_irq_restore(flags); } +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a PTE in the linux page tables. + * + * This must always be called with the pte lock held. + */ +void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) +{ + if (is_vm_hugetlb_page(vma)) + book3e_hugetlb_preload(vma, address, *ptep); +} + void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { struct hstate *hstate = hstate_file(vma->vm_file); -- cgit v1.2.3 From e5a1edb9fe4cfa07e37a59475f8f7d0a8939c73e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 16 Aug 2019 05:41:42 +0000 Subject: powerpc/mm: move update_mmu_cache() into book3s hash utils. update_mmu_cache() is only for BOOK3S, and can be simplified for BOOK3S32. Move it out of mem.c into respective BOOK3S32 and BOOK3S64 files containing hash utils. BOOK3S64 version of hash_preload() is only used locally, declare it static. Remove the radix_enabled() stuff in BOOK3S32 version. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/107aaf43583a5f5d09e0d4e84c4c4390ecfcd512.1565933217.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/book3s32/mmu.c | 46 +++++++++++++++++++++++++++++ arch/powerpc/mm/book3s64/hash_utils.c | 55 +++++++++++++++++++++++++++++++++-- arch/powerpc/mm/mem.c | 52 --------------------------------- arch/powerpc/mm/mmu_decl.h | 7 ++--- 4 files changed, 102 insertions(+), 58 deletions(-) diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 8d68f03bf5a4..1e77dca8b497 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -309,6 +309,52 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, add_hash_page(mm->context.id, ea, pmd_val(*pmd)); } +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a PTE in the linux page tables. + * We use it to preload an HPTE into the hash table corresponding to + * the updated linux PTE. + * + * This must always be called with the pte lock held. + */ +void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep) +{ + /* + * We don't need to worry about _PAGE_PRESENT here because we are + * called with either mm->page_table_lock held or ptl lock held + */ + unsigned long trap; + bool is_exec; + + /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ + if (!pte_young(*ptep) || address >= TASK_SIZE) + return; + + /* + * We try to figure out if we are coming from an instruction + * access fault and pass that down to __hash_page so we avoid + * double-faulting on execution of fresh text. We have to test + * for regs NULL since init will get here first thing at boot. + * + * We also avoid filling the hash if not coming from a fault. + */ + + trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL; + switch (trap) { + case 0x300: + is_exec = false; + break; + case 0x400: + is_exec = true; + break; + default: + return; + } + + hash_preload(vma->vm_mm, address, is_exec, trap); +} + /* * Initialize the hash table and patch the instructions in hashtable.S. */ diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index c363e850550e..c3bfef08dcf8 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1519,8 +1519,8 @@ static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) } #endif -void hash_preload(struct mm_struct *mm, unsigned long ea, - bool is_exec, unsigned long trap) +static void hash_preload(struct mm_struct *mm, unsigned long ea, + bool is_exec, unsigned long trap) { int hugepage_shift; unsigned long vsid; @@ -1600,6 +1600,57 @@ out_exit: local_irq_restore(flags); } +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a PTE in the linux page tables. + * We use it to preload an HPTE into the hash table corresponding to + * the updated linux PTE. + * + * This must always be called with the pte lock held. + */ +void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep) +{ + /* + * We don't need to worry about _PAGE_PRESENT here because we are + * called with either mm->page_table_lock held or ptl lock held + */ + unsigned long trap; + bool is_exec; + + if (radix_enabled()) { + prefetch((void *)address); + return; + } + + /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ + if (!pte_young(*ptep) || address >= TASK_SIZE) + return; + + /* + * We try to figure out if we are coming from an instruction + * access fault and pass that down to __hash_page so we avoid + * double-faulting on execution of fresh text. We have to test + * for regs NULL since init will get here first thing at boot. + * + * We also avoid filling the hash if not coming from a fault. + */ + + trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL; + switch (trap) { + case 0x300: + is_exec = false; + break; + case 0x400: + is_exec = true; + break; + default: + return; + } + + hash_preload(vma->vm_mm, address, is_exec, trap); +} + #ifdef CONFIG_PPC_MEM_KEYS /* * Return the protection key associated with the given address and the diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 58d11362eee4..69f99128a8d6 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -407,58 +407,6 @@ void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, } EXPORT_SYMBOL(flush_icache_user_range); -/* - * This is called at the end of handling a user page fault, when the - * fault has been handled by updating a PTE in the linux page tables. - * We use it to preload an HPTE into the hash table corresponding to - * the updated linux PTE. - * - * This must always be called with the pte lock held. - */ -#ifdef CONFIG_PPC_BOOK3S -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, - pte_t *ptep) -{ - /* - * We don't need to worry about _PAGE_PRESENT here because we are - * called with either mm->page_table_lock held or ptl lock held - */ - unsigned long trap; - bool is_exec; - - if (radix_enabled()) { - prefetch((void *)address); - return; - } - - /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ - if (!pte_young(*ptep) || address >= TASK_SIZE) - return; - - /* We try to figure out if we are coming from an instruction - * access fault and pass that down to __hash_page so we avoid - * double-faulting on execution of fresh text. We have to test - * for regs NULL since init will get here first thing at boot - * - * We also avoid filling the hash if not coming from a fault - */ - - trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL; - switch (trap) { - case 0x300: - is_exec = false; - break; - case 0x400: - is_exec = true; - break; - default: - return; - } - - hash_preload(vma->vm_mm, address, is_exec, trap); -} -#endif /* CONFIG_PPC_BOOK3S */ - /* * System memory should not be in /proc/iomem but various tools expect it * (eg kdump). diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 32c1a191c28a..9f325a7a09cb 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -82,10 +82,6 @@ static inline void print_system_hash_info(void) {} #else /* CONFIG_PPC_MMU_NOHASH */ -extern void hash_preload(struct mm_struct *mm, unsigned long ea, - bool is_exec, unsigned long trap); - - extern void _tlbie(unsigned long address); extern void _tlbia(void); @@ -95,6 +91,9 @@ void print_system_hash_info(void); #ifdef CONFIG_PPC32 +void hash_preload(struct mm_struct *mm, unsigned long ea, + bool is_exec, unsigned long trap); + extern void mapin_ram(void); extern void setbat(int index, unsigned long virt, phys_addr_t phys, unsigned int size, pgprot_t prot); -- cgit v1.2.3 From f49f4e2b68b683491263e92c229ff344d44759a7 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 16 Aug 2019 05:41:43 +0000 Subject: powerpc/mm: Simplify update_mmu_cache() on BOOK3S32 On BOOK3S32, hash_preload() neither use is_exec nor trap, so drop those parameters and simplify update_mmu_cached(). Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/35f143c6fe29f9fd25c7f3cd4448ae401029ce3c.1565933217.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/book3s32/mmu.c | 30 +++++++----------------------- arch/powerpc/mm/mmu_decl.h | 3 +-- arch/powerpc/mm/pgtable_32.c | 2 +- 3 files changed, 9 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 1e77dca8b497..5d7d35eb96fb 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -297,8 +297,7 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys, /* * Preload a translation in the hash table */ -void hash_preload(struct mm_struct *mm, unsigned long ea, - bool is_exec, unsigned long trap) +void hash_preload(struct mm_struct *mm, unsigned long ea) { pmd_t *pmd; @@ -324,35 +323,20 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, * We don't need to worry about _PAGE_PRESENT here because we are * called with either mm->page_table_lock held or ptl lock held */ - unsigned long trap; - bool is_exec; /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ if (!pte_young(*ptep) || address >= TASK_SIZE) return; - /* - * We try to figure out if we are coming from an instruction - * access fault and pass that down to __hash_page so we avoid - * double-faulting on execution of fresh text. We have to test - * for regs NULL since init will get here first thing at boot. - * - * We also avoid filling the hash if not coming from a fault. - */ + /* We have to test for regs NULL since init will get here first thing at boot */ + if (!current->thread.regs) + return; - trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL; - switch (trap) { - case 0x300: - is_exec = false; - break; - case 0x400: - is_exec = true; - break; - default: + /* We also avoid filling the hash if not coming from a fault */ + if (TRAP(current->thread.regs) != 0x300 && TRAP(current->thread.regs) != 0x400) return; - } - hash_preload(vma->vm_mm, address, is_exec, trap); + hash_preload(vma->vm_mm, address); } /* diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 9f325a7a09cb..adbaf2167214 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -91,8 +91,7 @@ void print_system_hash_info(void); #ifdef CONFIG_PPC32 -void hash_preload(struct mm_struct *mm, unsigned long ea, - bool is_exec, unsigned long trap); +void hash_preload(struct mm_struct *mm, unsigned long ea); extern void mapin_ram(void); extern void setbat(int index, unsigned long virt, phys_addr_t phys, diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 35cb96cfc258..97f401a06fcc 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -252,7 +252,7 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) map_kernel_page(v, p, ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL); #ifdef CONFIG_PPC_BOOK3S_32 if (ktext) - hash_preload(&init_mm, v, false, 0x300); + hash_preload(&init_mm, v); #endif v += PAGE_SIZE; p += PAGE_SIZE; -- cgit v1.2.3 From f204338f8e29777302042cba578b0da44f69695f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 16 Aug 2019 05:41:44 +0000 Subject: powerpc/mm: ppc 603 doesn't need update_mmu_cache() On powerpc 603, there is no hash table so get out of update_mmu_cache() early. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/6133e0f115d955fac4061536dab0fa7480a1c433.1565933217.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/book3s32/mmu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 5d7d35eb96fb..50a1991d257f 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -319,6 +319,8 @@ void hash_preload(struct mm_struct *mm, unsigned long ea) void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { + if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) + return; /* * We don't need to worry about _PAGE_PRESENT here because we are * called with either mm->page_table_lock held or ptl lock held -- cgit v1.2.3 From 7ab0b7cb8951d4095d73e203759b74d41916e455 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 16 Aug 2019 07:52:20 +0000 Subject: powerpc/32: Add warning on misaligned copy_page() or clear_page() copy_page() and clear_page() expect page aligned destination, and use dcbz instruction to clear entire cache lines based on the assumption that the destination is cache aligned. As shown during analysis of a bug in BTRFS filesystem, a misaligned copy_page() can create bugs that are difficult to locate (see Link). Add an explicit WARNING when copy_page() or clear_page() are called with misaligned destination. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://bugzilla.kernel.org/show_bug.cgi?id=204371 Link: https://lore.kernel.org/r/c6cea38f90480268d439ca44a645647e260fff09.1565941808.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/page_32.h | 4 ++++ arch/powerpc/kernel/misc_32.S | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/arch/powerpc/include/asm/page_32.h b/arch/powerpc/include/asm/page_32.h index 683dfbc67ca8..d64dfe3ac712 100644 --- a/arch/powerpc/include/asm/page_32.h +++ b/arch/powerpc/include/asm/page_32.h @@ -40,6 +40,8 @@ typedef unsigned long long pte_basic_t; typedef unsigned long pte_basic_t; #endif +#include + /* * Clear page using the dcbz instruction, which doesn't cause any * memory traffic (except to write out any cache lines which get @@ -49,6 +51,8 @@ static inline void clear_page(void *addr) { unsigned int i; + WARN_ON((unsigned long)addr & (L1_CACHE_BYTES - 1)); + for (i = 0; i < PAGE_SIZE / L1_CACHE_BYTES; i++, addr += L1_CACHE_BYTES) dcbz(addr); } diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index fe4bd321730e..02d90e1ebf65 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -452,7 +452,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) stwu r9,16(r3) _GLOBAL(copy_page) + rlwinm r5, r3, 0, L1_CACHE_BYTES - 1 addi r3,r3,-4 + +0: twnei r5, 0 /* WARN if r3 is not cache aligned */ + EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING + addi r4,r4,-4 li r5,4 -- cgit v1.2.3 From e354d7dc81d0e81bea33165f381aff1eda45f5d9 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 13 Jun 2019 00:03:17 +1000 Subject: powerpc/64: allow compiler to cache 'current' current may be cached by the compiler, so remove the volatile asm restriction. This results in better generated code, as well as being smaller and fewer dependent loads, it can avoid store-hit-load flushes like this one that shows up in irq_exit(): preempt_count_sub(HARDIRQ_OFFSET); if (!in_interrupt() && ...) Which ends up as: ((struct thread_info *)current)->preempt_count -= HARDIRQ_OFFSET; if (((struct thread_info *)current)->preempt_count ... Evaluating current twice presently means it has to be loaded twice, and here gcc happens to pick a different register each time, then preempt_count is accessed via that base register: 1058: ld r10,2392(r13) <-- current 105c: lwz r9,0(r10) <-- preempt_count 1060: addis r9,r9,-1 1064: stw r9,0(r10) <-- preempt_count 1068: ld r9,2392(r13) <-- current 106c: lwz r9,0(r9) <-- preempt_count 1070: rlwinm. r9,r9,0,11,23 1074: bne 1090 This can frustrate store-hit-load detection heuristics and cause flushes. Allowing the compiler to cache current in a reigster with this patch results in the same base register being used for all accesses, which is more likely to be detected as an alias: 1058: ld r31,2392(r13) ... 1070: lwz r9,0(r31) 1074: addis r9,r9,-1 1078: stw r9,0(r31) 107c: lwz r9,0(r31) 1080: rlwinm. r9,r9,0,11,23 1084: bne 10a0 Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190612140317.24490-1-npiggin@gmail.com --- arch/powerpc/include/asm/current.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/current.h b/arch/powerpc/include/asm/current.h index 297827b76169..bbfb94800415 100644 --- a/arch/powerpc/include/asm/current.h +++ b/arch/powerpc/include/asm/current.h @@ -16,7 +16,8 @@ static inline struct task_struct *get_current(void) { struct task_struct *task; - __asm__ __volatile__("ld %0,%1(13)" + /* get_current can be cached by the compiler, so no volatile */ + asm ("ld %0,%1(13)" : "=r" (task) : "i" (offsetof(struct paca_struct, __current))); -- cgit v1.2.3 From 8f51e3929470942e6a8744061254fdeef646cd36 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 24 Jul 2019 18:46:34 +1000 Subject: powerpc/64s/radix: Fix memory hotplug section page table creation create_physical_mapping expects physical addresses, but creating and splitting these mappings after boot is supplying virtual (effective) addresses. This can be irritated by booting with mem= to limit memory then probing an unused physical memory range: echo > /sys/devices/system/memory/probe This mostly works by accident, firstly because __va(__va(x)) == __va(x) so the virtual address does not get corrupted. Secondly because pfn_pte masks out the upper bits of the pfn beyond the physical address limit, so a pfn constructed with a 0xc000000000000000 virtual linear address will be masked back to the correct physical address in the pte. Fixes: 6cc27341b21a8 ("powerpc/mm: add radix__create_section_mapping()") Signed-off-by: Nicholas Piggin Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190724084638.24982-1-npiggin@gmail.com --- arch/powerpc/mm/book3s64/radix_pgtable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index b4ca9e95e678..c5cc16ab1954 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -902,7 +902,7 @@ int __meminit radix__create_section_mapping(unsigned long start, unsigned long e return -1; } - return create_physical_mapping(start, end, nid); + return create_physical_mapping(__pa(start), __pa(end), nid); } int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) -- cgit v1.2.3 From 31f210cf42d4b308eacef89b6cb0b1459338b8de Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 24 Jul 2019 18:46:35 +1000 Subject: powerpc/64s/radix: Fix memory hot-unplug page table split create_physical_mapping expects physical addresses, but splitting these mapping on hot unplug is supplying virtual (effective) addresses. Fixes: 4dd5f8a99e791 ("powerpc/mm/radix: Split linear mapping on hot-unplug") Signed-off-by: Nicholas Piggin Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190724084638.24982-2-npiggin@gmail.com --- arch/powerpc/mm/book3s64/radix_pgtable.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index c5cc16ab1954..2204d8eeb784 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -737,8 +737,8 @@ static int __meminit stop_machine_change_mapping(void *data) spin_unlock(&init_mm.page_table_lock); pte_clear(&init_mm, params->aligned_start, params->pte); - create_physical_mapping(params->aligned_start, params->start, -1); - create_physical_mapping(params->end, params->aligned_end, -1); + create_physical_mapping(__pa(params->aligned_start), __pa(params->start), -1); + create_physical_mapping(__pa(params->end), __pa(params->aligned_end), -1); spin_lock(&init_mm.page_table_lock); return 0; } -- cgit v1.2.3 From 10c4bd7cd28e77aeb8cfa65b23cb3c632ede2a49 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 24 Jul 2019 18:46:36 +1000 Subject: powerpc/perf: fix imc allocation failure handling The alloc_pages_node return value should be tested for failure before being passed to page_address. Tested-by: Anju T Sudhakar Signed-off-by: Nicholas Piggin Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190724084638.24982-3-npiggin@gmail.com --- arch/powerpc/perf/imc-pmu.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index dea243185ea4..cb50a9e1fd2d 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -577,6 +577,7 @@ static int core_imc_mem_init(int cpu, int size) { int nid, rc = 0, core_id = (cpu / threads_per_core); struct imc_mem_info *mem_info; + struct page *page; /* * alloc_pages_node() will allocate memory for core in the @@ -587,11 +588,12 @@ static int core_imc_mem_init(int cpu, int size) mem_info->id = core_id; /* We need only vbase for core counters */ - mem_info->vbase = page_address(alloc_pages_node(nid, - GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE | - __GFP_NOWARN, get_order(size))); - if (!mem_info->vbase) + page = alloc_pages_node(nid, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE | + __GFP_NOWARN, get_order(size)); + if (!page) return -ENOMEM; + mem_info->vbase = page_address(page); /* Init the mutex */ core_imc_refc[core_id].id = core_id; @@ -849,15 +851,17 @@ static int thread_imc_mem_alloc(int cpu_id, int size) int nid = cpu_to_node(cpu_id); if (!local_mem) { + struct page *page; /* * This case could happen only once at start, since we dont * free the memory in cpu offline path. */ - local_mem = page_address(alloc_pages_node(nid, + page = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE | - __GFP_NOWARN, get_order(size))); - if (!local_mem) + __GFP_NOWARN, get_order(size)); + if (!page) return -ENOMEM; + local_mem = page_address(page); per_cpu(thread_imc_mem, cpu_id) = local_mem; } @@ -1095,11 +1099,14 @@ static int trace_imc_mem_alloc(int cpu_id, int size) int core_id = (cpu_id / threads_per_core); if (!local_mem) { - local_mem = page_address(alloc_pages_node(phys_id, - GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE | - __GFP_NOWARN, get_order(size))); - if (!local_mem) + struct page *page; + + page = alloc_pages_node(phys_id, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE | + __GFP_NOWARN, get_order(size)); + if (!page) return -ENOMEM; + local_mem = page_address(page); per_cpu(trace_imc_mem, cpu_id) = local_mem; /* Initialise the counters for trace mode */ -- cgit v1.2.3 From 4dd7554a6456d124c85e0a4ad156625b71390b5c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 24 Jul 2019 18:46:37 +1000 Subject: powerpc/64: Add VIRTUAL_BUG_ON checks for __va and __pa addresses Ensure __va is given a physical address below PAGE_OFFSET, and __pa is given a virtual address above PAGE_OFFSET. Signed-off-by: Nicholas Piggin Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190724084638.24982-4-npiggin@gmail.com --- arch/powerpc/include/asm/page.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 0d52f57fca04..c8bb14ff4713 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -215,9 +215,19 @@ static inline bool pfn_valid(unsigned long pfn) /* * gcc miscompiles (unsigned long)(&static_var) - PAGE_OFFSET * with -mcmodel=medium, so we use & and | instead of - and + on 64-bit. + * This also results in better code generation. */ -#define __va(x) ((void *)(unsigned long)((phys_addr_t)(x) | PAGE_OFFSET)) -#define __pa(x) ((unsigned long)(x) & 0x0fffffffffffffffUL) +#define __va(x) \ +({ \ + VIRTUAL_BUG_ON((unsigned long)(x) >= PAGE_OFFSET); \ + (void *)(unsigned long)((phys_addr_t)(x) | PAGE_OFFSET); \ +}) + +#define __pa(x) \ +({ \ + VIRTUAL_BUG_ON((unsigned long)(x) < PAGE_OFFSET); \ + (unsigned long)(x) & 0x0fffffffffffffffUL; \ +}) #else /* 32-bit, non book E */ #define __va(x) ((void *)(unsigned long)((phys_addr_t)(x) + PAGE_OFFSET - MEMORY_START)) -- cgit v1.2.3 From 6bb25170d7a44ef0ed9677814600f0785e7421d1 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 24 Jul 2019 18:46:38 +1000 Subject: powerpc/64s/radix: Remove redundant pfn_pte bitop, add VM_BUG_ON pfn_pte is never given a pte above the addressable physical memory limit, so the masking is redundant. In case of a software bug, it is not obviously better to silently truncate the pfn than to corrupt the pte (either one will result in memory corruption or crashes), so there is no reason to add this to the fast path. Add VM_BUG_ON to catch cases where the pfn is invalid. These would catch the create_section_mapping bug fixed by a previous commit. [16885.256466] ------------[ cut here ]------------ [16885.256492] kernel BUG at arch/powerpc/include/asm/book3s/64/pgtable.h:612! cpu 0x0: Vector: 700 (Program Check) at [c0000000ee0a36d0] pc: c000000000080738: __map_kernel_page+0x248/0x6f0 lr: c000000000080ac0: __map_kernel_page+0x5d0/0x6f0 sp: c0000000ee0a3960 msr: 9000000000029033 current = 0xc0000000ec63b400 paca = 0xc0000000017f0000 irqmask: 0x03 irq_happened: 0x01 pid = 85, comm = sh kernel BUG at arch/powerpc/include/asm/book3s/64/pgtable.h:612! Linux version 5.3.0-rc1-00001-g0fe93e5f3394 enter ? for help [c0000000ee0a3a00] c000000000d37378 create_physical_mapping+0x260/0x360 [c0000000ee0a3b10] c000000000d370bc create_section_mapping+0x1c/0x3c [c0000000ee0a3b30] c000000000071f54 arch_add_memory+0x74/0x130 Signed-off-by: Nicholas Piggin Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190724084638.24982-5-npiggin@gmail.com --- arch/powerpc/include/asm/book3s/64/pgtable.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 8308f32e9782..8e47fb85dfa6 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -608,8 +608,10 @@ static inline bool pte_access_permitted(pte_t pte, bool write) */ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) { - return __pte((((pte_basic_t)(pfn) << PAGE_SHIFT) & PTE_RPN_MASK) | - pgprot_val(pgprot)); + VM_BUG_ON(pfn >> (64 - PAGE_SHIFT)); + VM_BUG_ON((pfn << PAGE_SHIFT) & ~PTE_RPN_MASK); + + return __pte(((pte_basic_t)pfn << PAGE_SHIFT) | pgprot_val(pgprot)); } static inline unsigned long pte_pfn(pte_t pte) -- cgit v1.2.3 From 415480dce2ef03bb8335deebd2f402f475443ce0 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 19 Aug 2019 06:40:25 +0000 Subject: powerpc/603: Fix handling of the DIRTY flag If a page is already mapped RW without the DIRTY flag, the DIRTY flag is never set and a TLB store miss exception is taken forever. This is easily reproduced with the following app: void main(void) { volatile char *ptr = mmap(0, 4096, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); *ptr = *ptr; } When DIRTY flag is not set, bail out of TLB miss handler and take a minor page fault which will set the DIRTY flag. Fixes: f8b58c64eaef ("powerpc/603: let's handle PAGE_DIRTY directly") Cc: stable@vger.kernel.org # v5.1+ Reported-by: Doug Crawford Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/80432f71194d7ee75b2f5043ecf1501cf1cca1f3.1566196646.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/head_32.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index c8b4f7ed318c..9e6f01abb31e 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -557,9 +557,9 @@ DataStoreTLBMiss: cmplw 0,r1,r3 mfspr r2, SPRN_SPRG_PGDIR #ifdef CONFIG_SWAP - li r1, _PAGE_RW | _PAGE_PRESENT | _PAGE_ACCESSED + li r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT | _PAGE_ACCESSED #else - li r1, _PAGE_RW | _PAGE_PRESENT + li r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT #endif bge- 112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ -- cgit v1.2.3 From a6717c01ddc259f6f73364779df058e2c67309f8 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Fri, 2 Aug 2019 14:29:24 -0500 Subject: powerpc/rtas: use device model APIs and serialization during LPM The LPAR migration implementation and userspace-initiated cpu hotplug can interleave their executions like so: 1. Set cpu 7 offline via sysfs. 2. Begin a partition migration, whose implementation requires the OS to ensure all present cpus are online; cpu 7 is onlined: rtas_ibm_suspend_me -> rtas_online_cpus_mask -> cpu_up This sets cpu 7 online in all respects except for the cpu's corresponding struct device; dev->offline remains true. 3. Set cpu 7 online via sysfs. _cpu_up() determines that cpu 7 is already online and returns success. The driver core (device_online) sets dev->offline = false. 4. The migration completes and restores cpu 7 to offline state: rtas_ibm_suspend_me -> rtas_offline_cpus_mask -> cpu_down This leaves cpu7 in a state where the driver core considers the cpu device online, but in all other respects it is offline and unused. Attempts to online the cpu via sysfs appear to succeed but the driver core actually does not pass the request to the lower-level cpuhp support code. This makes the cpu unusable until the cpu device is manually set offline and then online again via sysfs. Instead of directly calling cpu_up/cpu_down, the migration code should use the higher-level device core APIs to maintain consistent state and serialize operations. Fixes: 120496ac2d2d ("powerpc: Bring all threads online prior to migration/hibernation") Signed-off-by: Nathan Lynch Reviewed-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802192926.19277-2-nathanl@linux.ibm.com --- arch/powerpc/kernel/rtas.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 49159bb38949..ef290d4036ba 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -871,15 +871,17 @@ static int rtas_cpu_state_change_mask(enum rtas_cpu_state state, return 0; for_each_cpu(cpu, cpus) { + struct device *dev = get_cpu_device(cpu); + switch (state) { case DOWN: - cpuret = cpu_down(cpu); + cpuret = device_offline(dev); break; case UP: - cpuret = cpu_up(cpu); + cpuret = device_online(dev); break; } - if (cpuret) { + if (cpuret < 0) { pr_debug("%s: cpu_%s for cpu#%d returned %d.\n", __func__, ((state == UP) ? "up" : "down"), @@ -966,6 +968,8 @@ int rtas_ibm_suspend_me(u64 handle) data.token = rtas_token("ibm,suspend-me"); data.complete = &done; + lock_device_hotplug(); + /* All present CPUs must be online */ cpumask_andnot(offline_mask, cpu_present_mask, cpu_online_mask); cpuret = rtas_online_cpus_mask(offline_mask); @@ -1004,6 +1008,7 @@ out_hotplug_enable: __func__); out: + unlock_device_hotplug(); free_cpumask_var(offline_mask); return atomic_read(&data.error); } -- cgit v1.2.3 From 10e4850d7c7f2af2e5c40520b8caf73bf9d7e2d1 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Fri, 2 Aug 2019 14:29:25 -0500 Subject: powerpc/rtas: allow rescheduling while changing cpu states rtas_cpu_state_change_mask() potentially operates on scores of cpus, so explicitly allow rescheduling in the loop body. Signed-off-by: Nathan Lynch Reviewed-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802192926.19277-3-nathanl@linux.ibm.com --- arch/powerpc/kernel/rtas.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index ef290d4036ba..c5fa251b8950 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -898,6 +899,7 @@ static int rtas_cpu_state_change_mask(enum rtas_cpu_state state, cpumask_clear_cpu(cpu, cpus); } } + cond_resched(); } return ret; -- cgit v1.2.3 From ccfb5bd71d3d1228090a8633800ae7cdf42a94ac Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Fri, 2 Aug 2019 14:29:26 -0500 Subject: powerpc/pseries/mobility: use cond_resched when updating device tree After a partition migration, pseries_devicetree_update() processes changes to the device tree communicated from the platform to Linux. This is a relatively heavyweight operation, with multiple device tree searches, memory allocations, and conversations with partition firmware. There's a few levels of nested loops which are bounded only by decisions made by the platform, outside of Linux's control, and indeed we have seen RCU stalls on large systems while executing this call graph. Use cond_resched() in these loops so that the cpu is yielded when needed. Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802192926.19277-4-nathanl@linux.ibm.com --- arch/powerpc/platforms/pseries/mobility.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index fe812bebdf5e..b571285f6c14 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -207,7 +208,11 @@ static int update_dt_node(__be32 phandle, s32 scope) prop_data += vd; } + + cond_resched(); } + + cond_resched(); } while (rtas_rc == 1); of_node_put(dn); @@ -310,8 +315,12 @@ int pseries_devicetree_update(s32 scope) add_dt_node(phandle, drc_index); break; } + + cond_resched(); } } + + cond_resched(); } while (rc == 1); kfree(rtas_buf); -- cgit v1.2.3 From b5bda6263cad9a927e1a4edb7493d542da0c1410 Mon Sep 17 00:00:00 2001 From: Santosh Sivaraj Date: Tue, 20 Aug 2019 13:43:46 +0530 Subject: powerpc/mce: Schedule work from irq_work schedule_work() cannot be called from MCE exception context as MCE can interrupt even in interrupt disabled context. Fixes: 733e4a4c4467 ("powerpc/mce: hookup memory_failure for UE errors") Cc: stable@vger.kernel.org # v4.15+ Reviewed-by: Mahesh Salgaonkar Reviewed-by: Nicholas Piggin Acked-by: Balbir Singh Signed-off-by: Santosh Sivaraj Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190820081352.8641-2-santosh@fossix.org --- arch/powerpc/kernel/mce.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index b18df633eae9..cff31d4a501f 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -33,6 +33,7 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_ue_event_queue); static void machine_check_process_queued_event(struct irq_work *work); +static void machine_check_ue_irq_work(struct irq_work *work); void machine_check_ue_event(struct machine_check_event *evt); static void machine_process_ue_event(struct work_struct *work); @@ -40,6 +41,10 @@ static struct irq_work mce_event_process_work = { .func = machine_check_process_queued_event, }; +static struct irq_work mce_ue_event_irq_work = { + .func = machine_check_ue_irq_work, +}; + DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); static void mce_set_error_info(struct machine_check_event *mce, @@ -199,6 +204,10 @@ void release_mce_event(void) get_mce_event(NULL, true); } +static void machine_check_ue_irq_work(struct irq_work *work) +{ + schedule_work(&mce_ue_event_work); +} /* * Queue up the MCE event which then can be handled later. @@ -216,7 +225,7 @@ void machine_check_ue_event(struct machine_check_event *evt) memcpy(this_cpu_ptr(&mce_ue_event_queue[index]), evt, sizeof(*evt)); /* Queue work to process this event later. */ - schedule_work(&mce_ue_event_work); + irq_work_queue(&mce_ue_event_irq_work); } /* -- cgit v1.2.3 From 99ead78afd1128bfcebe7f88f3b102fb2da09aee Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Tue, 20 Aug 2019 13:43:47 +0530 Subject: powerpc/mce: Fix MCE handling for huge pages The current code would fail on huge pages addresses, since the shift would be incorrect. Use the correct page shift value returned by __find_linux_pte() to get the correct physical address. The code is more generic and can handle both regular and compound pages. Fixes: ba41e1e1ccb9 ("powerpc/mce: Hookup derror (load/store) UE errors") Signed-off-by: Balbir Singh [arbab@linux.ibm.com: Fixup pseries_do_memory_failure()] Signed-off-by: Reza Arbab Tested-by: Mahesh Salgaonkar Signed-off-by: Santosh Sivaraj Cc: stable@vger.kernel.org # v4.15+ Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190820081352.8641-3-santosh@fossix.org --- arch/powerpc/kernel/mce_power.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index a814d2dfb5b0..714a98e0927f 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -26,6 +26,7 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) { pte_t *ptep; + unsigned int shift; unsigned long flags; struct mm_struct *mm; @@ -35,13 +36,18 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) mm = &init_mm; local_irq_save(flags); - if (mm == current->mm) - ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL); - else - ptep = find_init_mm_pte(addr, NULL); + ptep = __find_linux_pte(mm->pgd, addr, NULL, &shift); local_irq_restore(flags); + if (!ptep || pte_special(*ptep)) return ULONG_MAX; + + if (shift > PAGE_SHIFT) { + unsigned long rpnmask = (1ul << shift) - PAGE_SIZE; + + return pte_pfn(__pte(pte_val(*ptep) | (addr & rpnmask))); + } + return pte_pfn(*ptep); } @@ -344,7 +350,7 @@ static const struct mce_derror_table mce_p9_derror_table[] = { MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0, false, 0, 0, 0, 0, 0 } }; -static int mce_find_instr_ea_and_pfn(struct pt_regs *regs, uint64_t *addr, +static int mce_find_instr_ea_and_phys(struct pt_regs *regs, uint64_t *addr, uint64_t *phys_addr) { /* @@ -541,7 +547,8 @@ static int mce_handle_derror(struct pt_regs *regs, * kernel/exception-64s.h */ if (get_paca()->in_mce < MAX_MCE_DEPTH) - mce_find_instr_ea_and_pfn(regs, addr, phys_addr); + mce_find_instr_ea_and_phys(regs, addr, + phys_addr); } found = 1; } -- cgit v1.2.3 From 1a1715f516fd7fcfedffee5978ec4c07c5164470 Mon Sep 17 00:00:00 2001 From: Reza Arbab Date: Tue, 20 Aug 2019 13:43:48 +0530 Subject: powerpc/mce: Make machine_check_ue_event() static The function doesn't get used outside this file, so make it static. Signed-off-by: Reza Arbab Signed-off-by: Santosh Sivaraj Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190820081352.8641-4-santosh@fossix.org --- arch/powerpc/kernel/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index cff31d4a501f..a3b122a685a5 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -34,7 +34,7 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], static void machine_check_process_queued_event(struct irq_work *work); static void machine_check_ue_irq_work(struct irq_work *work); -void machine_check_ue_event(struct machine_check_event *evt); +static void machine_check_ue_event(struct machine_check_event *evt); static void machine_process_ue_event(struct work_struct *work); static struct irq_work mce_event_process_work = { @@ -212,7 +212,7 @@ static void machine_check_ue_irq_work(struct irq_work *work) /* * Queue up the MCE event which then can be handled later. */ -void machine_check_ue_event(struct machine_check_event *evt) +static void machine_check_ue_event(struct machine_check_event *evt) { int index; -- cgit v1.2.3 From 49ec9177b8ec9c081850744468811e708a7c9841 Mon Sep 17 00:00:00 2001 From: Santosh Sivaraj Date: Tue, 20 Aug 2019 13:43:49 +0530 Subject: extable: Add function to search only kernel exception table Certain architecture specific operating modes (e.g., in powerpc machine check handler that is unable to access vmalloc memory), the search_exception_tables cannot be called because it also searches the module exception tables if entry is not found in the kernel exception table. Signed-off-by: Santosh Sivaraj Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190820081352.8641-5-santosh@fossix.org --- include/linux/extable.h | 2 ++ kernel/extable.c | 11 +++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/include/linux/extable.h b/include/linux/extable.h index 41c5b3a25f67..81ecfaa83ad3 100644 --- a/include/linux/extable.h +++ b/include/linux/extable.h @@ -19,6 +19,8 @@ void trim_init_extable(struct module *m); /* Given an address, look for it in the exception tables */ const struct exception_table_entry *search_exception_tables(unsigned long add); +const struct exception_table_entry * +search_kernel_exception_table(unsigned long addr); #ifdef CONFIG_MODULES /* For extable.c to search modules' exception tables. */ diff --git a/kernel/extable.c b/kernel/extable.c index e23cce6e6092..f6c9406eec7d 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -40,13 +40,20 @@ void __init sort_main_extable(void) } } +/* Given an address, look for it in the kernel exception table */ +const +struct exception_table_entry *search_kernel_exception_table(unsigned long addr) +{ + return search_extable(__start___ex_table, + __stop___ex_table - __start___ex_table, addr); +} + /* Given an address, look for it in the exception tables. */ const struct exception_table_entry *search_exception_tables(unsigned long addr) { const struct exception_table_entry *e; - e = search_extable(__start___ex_table, - __stop___ex_table - __start___ex_table, addr); + e = search_kernel_exception_table(addr); if (!e) e = search_module_extables(addr); return e; -- cgit v1.2.3 From 895e3dceeb97855dc9990136cbb80a842fe581aa Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Tue, 20 Aug 2019 13:43:50 +0530 Subject: powerpc/mce: Handle UE event for memcpy_mcsafe If we take a UE on one of the instructions with a fixup entry, set nip to continue execution at the fixup entry. Stop processing the event further or print it. Co-developed-by: Reza Arbab Signed-off-by: Reza Arbab Signed-off-by: Balbir Singh Reviewed-by: Mahesh Salgaonkar Reviewed-by: Nicholas Piggin Signed-off-by: Santosh Sivaraj Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190820081352.8641-6-santosh@fossix.org --- arch/powerpc/include/asm/mce.h | 4 +++- arch/powerpc/kernel/mce.c | 16 ++++++++++++++++ arch/powerpc/kernel/mce_power.c | 15 +++++++++++++-- 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index a4c6a74ad2fb..19a33707d5ef 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -122,7 +122,8 @@ struct machine_check_event { enum MCE_UeErrorType ue_error_type:8; u8 effective_address_provided; u8 physical_address_provided; - u8 reserved_1[5]; + u8 ignore_event; + u8 reserved_1[4]; u64 effective_address; u64 physical_address; u8 reserved_2[8]; @@ -193,6 +194,7 @@ struct mce_error_info { enum MCE_Initiator initiator:8; enum MCE_ErrorClass error_class:8; bool sync_error; + bool ignore_event; }; #define MAX_MC_EVT 100 diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index a3b122a685a5..ec4b3e1087be 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -149,6 +149,7 @@ void save_mce_event(struct pt_regs *regs, long handled, if (phys_addr != ULONG_MAX) { mce->u.ue_error.physical_address_provided = true; mce->u.ue_error.physical_address = phys_addr; + mce->u.ue_error.ignore_event = mce_err->ignore_event; machine_check_ue_event(mce); } } @@ -266,8 +267,17 @@ static void machine_process_ue_event(struct work_struct *work) /* * This should probably queued elsewhere, but * oh! well + * + * Don't report this machine check because the caller has a + * asked us to ignore the event, it has a fixup handler which + * will do the appropriate error handling and reporting. */ if (evt->error_type == MCE_ERROR_TYPE_UE) { + if (evt->u.ue_error.ignore_event) { + __this_cpu_dec(mce_ue_count); + continue; + } + if (evt->u.ue_error.physical_address_provided) { unsigned long pfn; @@ -301,6 +311,12 @@ static void machine_check_process_queued_event(struct irq_work *work) while (__this_cpu_read(mce_queue_count) > 0) { index = __this_cpu_read(mce_queue_count) - 1; evt = this_cpu_ptr(&mce_event_queue[index]); + + if (evt->error_type == MCE_ERROR_TYPE_UE && + evt->u.ue_error.ignore_event) { + __this_cpu_dec(mce_queue_count); + continue; + } machine_check_print_event_info(evt, false, false); __this_cpu_dec(mce_queue_count); } diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 714a98e0927f..b6cbe3449358 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -18,6 +19,7 @@ #include #include #include +#include /* * Convert an address related to an mm to a PFN. NOTE: we are in real @@ -565,9 +567,18 @@ static int mce_handle_derror(struct pt_regs *regs, return 0; } -static long mce_handle_ue_error(struct pt_regs *regs) +static long mce_handle_ue_error(struct pt_regs *regs, + struct mce_error_info *mce_err) { long handled = 0; + const struct exception_table_entry *entry; + + entry = search_kernel_exception_table(regs->nip); + if (entry) { + mce_err->ignore_event = true; + regs->nip = extable_fixup(entry); + return 1; + } /* * On specific SCOM read via MMIO we may get a machine check @@ -600,7 +611,7 @@ static long mce_handle_error(struct pt_regs *regs, &phys_addr); if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE) - handled = mce_handle_ue_error(regs); + handled = mce_handle_ue_error(regs, &mce_err); save_mce_event(regs, handled, &mce_err, regs->nip, addr, phys_addr); -- cgit v1.2.3 From 4d4a273854c9eac1a1b163830658dc825e183ad5 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Tue, 20 Aug 2019 13:43:51 +0530 Subject: powerpc/memcpy: Add memcpy_mcsafe for pmem The pmem infrastructure uses memcpy_mcsafe in the pmem layer so as to convert machine check exceptions into a return value on failure in case a machine check exception is encountered during the memcpy. The return value is the number of bytes remaining to be copied. This patch largely borrows from the copyuser_power7 logic and does not add the VMX optimizations, largely to keep the patch simple. If needed those optimizations can be folded in. Signed-off-by: Balbir Singh [arbab@linux.ibm.com: Added symbol export] Co-developed-by: Santosh Sivaraj Signed-off-by: Santosh Sivaraj Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190820081352.8641-7-santosh@fossix.org --- arch/powerpc/include/asm/string.h | 2 + arch/powerpc/lib/Makefile | 2 +- arch/powerpc/lib/memcpy_mcsafe_64.S | 242 ++++++++++++++++++++++++++++++++++++ 3 files changed, 245 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/lib/memcpy_mcsafe_64.S diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h index 9bf6dffb4090..b72692702f35 100644 --- a/arch/powerpc/include/asm/string.h +++ b/arch/powerpc/include/asm/string.h @@ -53,7 +53,9 @@ void *__memmove(void *to, const void *from, __kernel_size_t n); #ifndef CONFIG_KASAN #define __HAVE_ARCH_MEMSET32 #define __HAVE_ARCH_MEMSET64 +#define __HAVE_ARCH_MEMCPY_MCSAFE +extern int memcpy_mcsafe(void *dst, const void *src, __kernel_size_t sz); extern void *__memset16(uint16_t *, uint16_t v, __kernel_size_t); extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t); extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t); diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index f030eea2171d..b8de3be10eb4 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -39,7 +39,7 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \ memcpy_power7.o obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \ - memcpy_64.o + memcpy_64.o memcpy_mcsafe_64.o obj64-$(CONFIG_SMP) += locks.o obj64-$(CONFIG_ALTIVEC) += vmx-helper.o diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S b/arch/powerpc/lib/memcpy_mcsafe_64.S new file mode 100644 index 000000000000..949976dc115d --- /dev/null +++ b/arch/powerpc/lib/memcpy_mcsafe_64.S @@ -0,0 +1,242 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) IBM Corporation, 2011 + * Derived from copyuser_power7.s by Anton Blanchard + * Author - Balbir Singh + */ +#include +#include +#include + + .macro err1 +100: + EX_TABLE(100b,.Ldo_err1) + .endm + + .macro err2 +200: + EX_TABLE(200b,.Ldo_err2) + .endm + + .macro err3 +300: EX_TABLE(300b,.Ldone) + .endm + +.Ldo_err2: + ld r22,STK_REG(R22)(r1) + ld r21,STK_REG(R21)(r1) + ld r20,STK_REG(R20)(r1) + ld r19,STK_REG(R19)(r1) + ld r18,STK_REG(R18)(r1) + ld r17,STK_REG(R17)(r1) + ld r16,STK_REG(R16)(r1) + ld r15,STK_REG(R15)(r1) + ld r14,STK_REG(R14)(r1) + addi r1,r1,STACKFRAMESIZE +.Ldo_err1: + /* Do a byte by byte copy to get the exact remaining size */ + mtctr r7 +46: +err3; lbz r0,0(r4) + addi r4,r4,1 +err3; stb r0,0(r3) + addi r3,r3,1 + bdnz 46b + li r3,0 + blr + +.Ldone: + mfctr r3 + blr + + +_GLOBAL(memcpy_mcsafe) + mr r7,r5 + cmpldi r5,16 + blt .Lshort_copy + +.Lcopy: + /* Get the source 8B aligned */ + neg r6,r4 + mtocrf 0x01,r6 + clrldi r6,r6,(64-3) + + bf cr7*4+3,1f +err1; lbz r0,0(r4) + addi r4,r4,1 +err1; stb r0,0(r3) + addi r3,r3,1 + subi r7,r7,1 + +1: bf cr7*4+2,2f +err1; lhz r0,0(r4) + addi r4,r4,2 +err1; sth r0,0(r3) + addi r3,r3,2 + subi r7,r7,2 + +2: bf cr7*4+1,3f +err1; lwz r0,0(r4) + addi r4,r4,4 +err1; stw r0,0(r3) + addi r3,r3,4 + subi r7,r7,4 + +3: sub r5,r5,r6 + cmpldi r5,128 + blt 5f + + mflr r0 + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + std r17,STK_REG(R17)(r1) + std r18,STK_REG(R18)(r1) + std r19,STK_REG(R19)(r1) + std r20,STK_REG(R20)(r1) + std r21,STK_REG(R21)(r1) + std r22,STK_REG(R22)(r1) + std r0,STACKFRAMESIZE+16(r1) + + srdi r6,r5,7 + mtctr r6 + + /* Now do cacheline (128B) sized loads and stores. */ + .align 5 +4: +err2; ld r0,0(r4) +err2; ld r6,8(r4) +err2; ld r8,16(r4) +err2; ld r9,24(r4) +err2; ld r10,32(r4) +err2; ld r11,40(r4) +err2; ld r12,48(r4) +err2; ld r14,56(r4) +err2; ld r15,64(r4) +err2; ld r16,72(r4) +err2; ld r17,80(r4) +err2; ld r18,88(r4) +err2; ld r19,96(r4) +err2; ld r20,104(r4) +err2; ld r21,112(r4) +err2; ld r22,120(r4) + addi r4,r4,128 +err2; std r0,0(r3) +err2; std r6,8(r3) +err2; std r8,16(r3) +err2; std r9,24(r3) +err2; std r10,32(r3) +err2; std r11,40(r3) +err2; std r12,48(r3) +err2; std r14,56(r3) +err2; std r15,64(r3) +err2; std r16,72(r3) +err2; std r17,80(r3) +err2; std r18,88(r3) +err2; std r19,96(r3) +err2; std r20,104(r3) +err2; std r21,112(r3) +err2; std r22,120(r3) + addi r3,r3,128 + subi r7,r7,128 + bdnz 4b + + clrldi r5,r5,(64-7) + + /* Up to 127B to go */ +5: srdi r6,r5,4 + mtocrf 0x01,r6 + +6: bf cr7*4+1,7f +err2; ld r0,0(r4) +err2; ld r6,8(r4) +err2; ld r8,16(r4) +err2; ld r9,24(r4) +err2; ld r10,32(r4) +err2; ld r11,40(r4) +err2; ld r12,48(r4) +err2; ld r14,56(r4) + addi r4,r4,64 +err2; std r0,0(r3) +err2; std r6,8(r3) +err2; std r8,16(r3) +err2; std r9,24(r3) +err2; std r10,32(r3) +err2; std r11,40(r3) +err2; std r12,48(r3) +err2; std r14,56(r3) + addi r3,r3,64 + subi r7,r7,64 + +7: ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + ld r17,STK_REG(R17)(r1) + ld r18,STK_REG(R18)(r1) + ld r19,STK_REG(R19)(r1) + ld r20,STK_REG(R20)(r1) + ld r21,STK_REG(R21)(r1) + ld r22,STK_REG(R22)(r1) + addi r1,r1,STACKFRAMESIZE + + /* Up to 63B to go */ + bf cr7*4+2,8f +err1; ld r0,0(r4) +err1; ld r6,8(r4) +err1; ld r8,16(r4) +err1; ld r9,24(r4) + addi r4,r4,32 +err1; std r0,0(r3) +err1; std r6,8(r3) +err1; std r8,16(r3) +err1; std r9,24(r3) + addi r3,r3,32 + subi r7,r7,32 + + /* Up to 31B to go */ +8: bf cr7*4+3,9f +err1; ld r0,0(r4) +err1; ld r6,8(r4) + addi r4,r4,16 +err1; std r0,0(r3) +err1; std r6,8(r3) + addi r3,r3,16 + subi r7,r7,16 + +9: clrldi r5,r5,(64-4) + + /* Up to 15B to go */ +.Lshort_copy: + mtocrf 0x01,r5 + bf cr7*4+0,12f +err1; lwz r0,0(r4) /* Less chance of a reject with word ops */ +err1; lwz r6,4(r4) + addi r4,r4,8 +err1; stw r0,0(r3) +err1; stw r6,4(r3) + addi r3,r3,8 + subi r7,r7,8 + +12: bf cr7*4+1,13f +err1; lwz r0,0(r4) + addi r4,r4,4 +err1; stw r0,0(r3) + addi r3,r3,4 + subi r7,r7,4 + +13: bf cr7*4+2,14f +err1; lhz r0,0(r4) + addi r4,r4,2 +err1; sth r0,0(r3) + addi r3,r3,2 + subi r7,r7,2 + +14: bf cr7*4+3,15f +err1; lbz r0,0(r4) +err1; stb r0,0(r3) + +15: li r3,0 + blr + +EXPORT_SYMBOL_GPL(memcpy_mcsafe); -- cgit v1.2.3 From 42ac26d253ebe7a5f409443f2fbd239d66b65f57 Mon Sep 17 00:00:00 2001 From: Santosh Sivaraj Date: Tue, 20 Aug 2019 13:43:52 +0530 Subject: powerpc: add machine check safe copy_to_user Use memcpy_mcsafe() implementation to define copy_to_user_mcsafe() Signed-off-by: Santosh Sivaraj Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190820081352.8641-8-santosh@fossix.org --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/uaccess.h | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 9bc8ac5b8c96..78f52030e76a 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -137,6 +137,7 @@ config PPC select ARCH_HAS_STRICT_KERNEL_RWX if ((PPC_BOOK3S_64 || PPC32) && !RELOCATABLE && !HIBERNATION) select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UACCESS_FLUSHCACHE + select ARCH_HAS_UACCESS_MCSAFE if PPC64 select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_KEEP_MEMBLOCK diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 8b03eb44e876..15002b51ff18 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -387,6 +387,20 @@ static inline unsigned long raw_copy_to_user(void __user *to, return ret; } +static __always_inline unsigned long __must_check +copy_to_user_mcsafe(void __user *to, const void *from, unsigned long n) +{ + if (likely(check_copy_size(from, n, true))) { + if (access_ok(to, n)) { + allow_write_to_user(to, n); + n = memcpy_mcsafe((void *)to, from, n); + prevent_write_to_user(to, n); + } + } + + return n; +} + extern unsigned long __clear_user(void __user *addr, unsigned long size); static inline unsigned long clear_user(void __user *addr, unsigned long size) -- cgit v1.2.3 From 56347074c5307d7bca5db38eb2c764c64ae57514 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 13 Jul 2019 12:21:06 +0900 Subject: powerpc: remove meaningless KBUILD_ARFLAGS addition The KBUILD_ARFLAGS addition in arch/powerpc/Makefile has never worked in a useful way because it is always overridden by the following code in the top Makefile: # use the deterministic mode of AR if available KBUILD_ARFLAGS := $(call ar-option,D) The code in the top Makefile was added in 2011, by commit 40df759e2b9e ("kbuild: Fix build with binutils <= 2.19"). The KBUILD_ARFLAGS addition for ppc has always been dead code from the beginning. Nobody has reported a problem since 43c9127d94d6 ("powerpc: Add option to use thin archives"), so this code was unneeded. Signed-off-by: Masahiro Yamada Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190713032106.8509-1-yamada.masahiro@socionext.com --- arch/powerpc/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index c345b79414a9..46ed198a3aa3 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -112,7 +112,6 @@ ifeq ($(HAS_BIARCH),y) KBUILD_CFLAGS += -m$(BITS) KBUILD_AFLAGS += -m$(BITS) -Wl,-a$(BITS) KBUILD_LDFLAGS += -m elf$(BITS)$(LDEMULATION) -KBUILD_ARFLAGS += --target=elf$(BITS)-$(GNUTARGET) endif cflags-$(CONFIG_STACKPROTECTOR) += -mstack-protector-guard=tls -- cgit v1.2.3 From 3f068aae7a958555533847af88705b5629f31600 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Fri, 16 Aug 2019 14:48:05 +1000 Subject: powerpc/64: Adjust order in pcibios_init() The pcibios_init() function for PowerPC 64 currently calls pci_bus_add_devices() before pcibios_resource_survey(). This means that at boot time, when the pcibios_bus_add_device() hooks are called by pci_bus_add_devices(), device resources have not been allocated and they are unable to perform EEH setup, so a separate pass is needed. This patch adjusts that order so that it will become possible to consolidate the EEH setup work into a single location. The only functional change is to execute pcibios_resource_survey() (excepting ppc_md.pcibios_fixup(), see below) before pci_bus_add_devices() instead of after it. Because pcibios_scan_phb() and pci_bus_add_devices() are called together in a loop, this must be broken into one loop for each call. Then the call to pcibios_resource_survey() is moved up in between them. This changes the ordering but because pcibios_resource_survey() also calls ppc_md.pcibios_fixup(), that call is extracted out into pcibios_init() to where pcibios_resource_survey() was, so that it is not moved. The only other caller of pcibios_resource_survey() is the PowerPC 32 version of pcibios_init(), and therefore, that is modified to call ppc_md.pcibios_fixup() right after pcibios_resource_survey() so that there is no functional change there at all. The re-arrangement will cause very few side-effects because at this stage in the boot, pci_bus_add_devices() does very little: - pci_create_sysfs_dev_files() does nothing (no sysfs yet) - pci_proc_attach_device() does nothing (no proc yet) - device_attach() does nothing (no drivers yet) This leaves only the pci_final_fixup calls, D3 support, and marking the device as added. Of those, only the pci_final_fixup calls have the potential to be affected by resource allocation. The only pci_final_fixup handlers that touch resources seem to be one for x86 (pci_amd_enable_64bit_bar()), and a PowerPC 32 platform driver (quirk_final_uli1575()), neither of which use this pcibios_init() function. Even if they did, it would almost certainly be a bug, under the current ordering, to rely on or make changes to resources before they were allocated. Signed-off-by: Sam Bobroff Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/4506b0489eabd0921a3587d90bd44c7683f3472d.1565930772.git.sbobroff@linux.ibm.com --- arch/powerpc/kernel/pci-common.c | 4 ---- arch/powerpc/kernel/pci_32.c | 4 ++++ arch/powerpc/kernel/pci_64.c | 12 +++++++++--- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index f627e15bb43c..1c448cf25506 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1379,10 +1379,6 @@ void __init pcibios_resource_survey(void) pr_debug("PCI: Assigning unassigned resources...\n"); pci_assign_unassigned_resources(); } - - /* Call machine dependent fixup */ - if (ppc_md.pcibios_fixup) - ppc_md.pcibios_fixup(); } /* This is used by the PCI hotplug driver to allocate resource diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index 50942a1d1a5f..b49e1060a3bf 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -263,6 +263,10 @@ static int __init pcibios_init(void) /* Call common code to handle resource allocation */ pcibios_resource_survey(); + /* Call machine dependent fixup */ + if (ppc_md.pcibios_fixup) + ppc_md.pcibios_fixup(); + /* Call machine dependent post-init code */ if (ppc_md.pcibios_after_init) ppc_md.pcibios_after_init(); diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index b7030b1189d0..f83d1f69b1dd 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -54,14 +54,20 @@ static int __init pcibios_init(void) pci_add_flags(PCI_ENABLE_PROC_DOMAINS | PCI_COMPAT_DOMAIN_0); /* Scan all of the recorded PCI controllers. */ - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) pcibios_scan_phb(hose); - pci_bus_add_devices(hose->bus); - } /* Call common code to handle resource allocation */ pcibios_resource_survey(); + /* Add devices. */ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) + pci_bus_add_devices(hose->bus); + + /* Call machine dependent fixup */ + if (ppc_md.pcibios_fixup) + ppc_md.pcibios_fixup(); + printk(KERN_DEBUG "PCI: Probing PCI hardware done\n"); return 0; -- cgit v1.2.3 From aa06e3d60e245284d1e55497eb3108828092818d Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Fri, 16 Aug 2019 14:48:06 +1000 Subject: powerpc/eeh: Clear stale EEH_DEV_NO_HANDLER flag The EEH_DEV_NO_HANDLER flag is used by the EEH system to prevent the use of driver callbacks in drivers that have been bound part way through the recovery process. This is necessary to prevent later stage handlers from being called when the earlier stage handlers haven't, which can be confusing for drivers. However, the flag is set for all devices that are added after boot time and only cleared at the end of the EEH recovery process. This results in hot plugged devices erroneously having the flag set during the first recovery after they are added (causing their driver's handlers to be incorrectly ignored). To remedy this, clear the flag at the beginning of recovery processing. The flag is still cleared at the end of recovery processing, although it is no longer really necessary. Also clear the flag during eeh_handle_special_event(), for the same reasons. Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b8ca5629d27de74c957d4f4b250177d1b6fc4bbd.1565930772.git.sbobroff@linux.ibm.com --- arch/powerpc/kernel/eeh_driver.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 89623962c727..1fbe541856f5 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -793,6 +793,10 @@ void eeh_handle_normal_event(struct eeh_pe *pe) result = PCI_ERS_RESULT_DISCONNECT; } + eeh_for_each_pe(pe, tmp_pe) + eeh_pe_for_each_dev(tmp_pe, edev, tmp) + edev->mode &= ~EEH_DEV_NO_HANDLER; + /* Walk the various device drivers attached to this slot through * a reset sequence, giving each an opportunity to do what it needs * to accomplish the reset. Each child gets a report of the @@ -981,7 +985,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe) */ void eeh_handle_special_event(void) { - struct eeh_pe *pe, *phb_pe; + struct eeh_pe *pe, *phb_pe, *tmp_pe; + struct eeh_dev *edev, *tmp_edev; struct pci_bus *bus; struct pci_controller *hose; unsigned long flags; @@ -1050,6 +1055,10 @@ void eeh_handle_special_event(void) (phb_pe->state & EEH_PE_RECOVERING)) continue; + eeh_for_each_pe(pe, tmp_pe) + eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev) + edev->mode &= ~EEH_DEV_NO_HANDLER; + /* Notify all devices to be down */ eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); eeh_set_channel_state(pe, pci_channel_io_perm_failure); -- cgit v1.2.3 From 617082a4817a4354fa3de05c80b5f6088e2083b7 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Fri, 16 Aug 2019 14:48:07 +1000 Subject: powerpc/eeh: Improve debug messages around device addition Also remove useless comment. Signed-off-by: Sam Bobroff Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/59db84f4bf94718a12f206bc923ac797d47e4cc1.1565930772.git.sbobroff@linux.ibm.com --- arch/powerpc/kernel/eeh.c | 2 +- arch/powerpc/platforms/powernv/eeh-powernv.c | 14 ++++++++++---- arch/powerpc/platforms/pseries/eeh_pseries.c | 23 +++++++++++++++++------ 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index c0e4b73191f3..d187d2b290a8 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1198,7 +1198,7 @@ void eeh_add_device_late(struct pci_dev *dev) pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn); edev = pdn_to_eeh_dev(pdn); if (edev->pdev == dev) { - pr_debug("EEH: Already referenced !\n"); + pr_debug("EEH: Device %s already referenced!\n", pci_name(dev)); return; } diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 620a986209f5..97461d49254e 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -44,10 +44,7 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev) if (!pdev->is_virtfn) return; - /* - * The following operations will fail if VF's sysfs files - * aren't created or its resources aren't finalized. - */ + pr_debug("%s: EEH: Setting up device %s.\n", __func__, pci_name(pdev)); eeh_add_device_early(pdn); eeh_add_device_late(pdev); eeh_sysfs_add_device(pdev); @@ -364,6 +361,10 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) int ret; int config_addr = (pdn->busno << 8) | (pdn->devfn); + pr_debug("%s: probing %04x:%02x:%02x.%01x\n", + __func__, hose->global_number, pdn->busno, + PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); + /* * When probing the root bridge, which doesn't have any * subordinate PCI devices. We don't have OF node for @@ -458,6 +459,11 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) /* Save memory bars */ eeh_save_bars(edev); + pr_debug("%s: EEH enabled on %02x:%02x.%01x PHB#%x-PE#%x\n", + __func__, pdn->busno, PCI_SLOT(pdn->devfn), + PCI_FUNC(pdn->devfn), edev->pe->phb->global_number, + edev->pe->addr); + return NULL; } diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 9edae1863e2f..83ee69d22022 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -52,6 +52,8 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev) if (!pdev->is_virtfn) return; + pr_debug("%s: EEH: Setting up device %s.\n", __func__, pci_name(pdev)); + pdn->device_id = pdev->device; pdn->vendor_id = pdev->vendor; pdn->class_code = pdev->class; @@ -238,6 +240,10 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) int enable = 0; int ret; + pr_debug("%s: probing %04x:%02x:%02x.%01x\n", + __func__, pdn->phb->global_number, pdn->busno, + PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); + /* Retrieve OF node and eeh device */ edev = pdn_to_eeh_dev(pdn); if (!edev || edev->pe) @@ -281,7 +287,12 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) /* Enable EEH on the device */ ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE); - if (!ret) { + if (ret) { + pr_debug("%s: EEH failed to enable on %02x:%02x.%01x PHB#%x-PE#%x (code %d)\n", + __func__, pdn->busno, PCI_SLOT(pdn->devfn), + PCI_FUNC(pdn->devfn), pe.phb->global_number, + pe.addr, ret); + } else { /* Retrieve PE address */ edev->pe_config_addr = eeh_ops->get_pe_addr(&pe); pe.addr = edev->pe_config_addr; @@ -297,11 +308,6 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) if (enable) { eeh_add_flag(EEH_ENABLED); eeh_add_to_parent_pe(edev); - - pr_debug("%s: EEH enabled on %02x:%02x.%01x PHB#%x-PE#%x\n", - __func__, pdn->busno, PCI_SLOT(pdn->devfn), - PCI_FUNC(pdn->devfn), pe.phb->global_number, - pe.addr); } else if (pdn->parent && pdn_to_eeh_dev(pdn->parent) && (pdn_to_eeh_dev(pdn->parent))->pe) { /* This device doesn't support EEH, but it may have an @@ -310,6 +316,11 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) edev->pe_config_addr = pdn_to_eeh_dev(pdn->parent)->pe_config_addr; eeh_add_to_parent_pe(edev); } + pr_debug("%s: EEH %s on %02x:%02x.%01x PHB#%x-PE#%x (code %d)\n", + __func__, (enable ? "enabled" : "unsupported"), + pdn->busno, PCI_SLOT(pdn->devfn), + PCI_FUNC(pdn->devfn), pe.phb->global_number, + pe.addr, ret); } /* Save memory bars */ -- cgit v1.2.3 From 685a0bc00abcf1d40d160eaafab9989f565ab2b5 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Fri, 16 Aug 2019 14:48:08 +1000 Subject: powerpc/eeh: Initialize EEH address cache earlier The EEH address cache is currently initialized and populated by a single function: eeh_addr_cache_build(). While the initial population of the cache can only be done once resources are allocated, initialization (just setting up a spinlock) could be done much earlier. So move the initialization step into a separate function and call it from a core_initcall (rather than a subsys initcall). This will allow future work to make use of the cache during boot time PCI scanning. Signed-off-by: Sam Bobroff Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/0557206741bffee76cdfff042f65321f6f7a5b41.1565930772.git.sbobroff@linux.ibm.com --- arch/powerpc/include/asm/eeh.h | 3 +++ arch/powerpc/kernel/eeh.c | 2 ++ arch/powerpc/kernel/eeh_cache.c | 13 +++++++++++-- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 8aa7c76c2130..33e797eb0e6a 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -274,6 +274,7 @@ int __init eeh_ops_register(struct eeh_ops *ops); int __exit eeh_ops_unregister(const char *name); int eeh_check_failure(const volatile void __iomem *token); int eeh_dev_check_failure(struct eeh_dev *edev); +void eeh_addr_cache_init(void); void eeh_addr_cache_build(void); void eeh_add_device_early(struct pci_dn *); void eeh_add_device_tree_early(struct pci_dn *); @@ -332,6 +333,8 @@ static inline int eeh_check_failure(const volatile void __iomem *token) #define eeh_dev_check_failure(x) (0) +static inline void eeh_addr_cache_init(void) { } + static inline void eeh_addr_cache_build(void) { } static inline void eeh_add_device_early(struct pci_dn *pdn) { } diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index d187d2b290a8..22f646176abb 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1120,6 +1120,8 @@ static int eeh_init(void) list_for_each_entry_safe(hose, tmp, &hose_list, list_node) eeh_dev_phb_init_dynamic(hose); + eeh_addr_cache_init(); + /* Initialize EEH event */ return eeh_event_init(); } diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c index 05ffd32b3416..75f0d66dee4b 100644 --- a/arch/powerpc/kernel/eeh_cache.c +++ b/arch/powerpc/kernel/eeh_cache.c @@ -257,6 +257,17 @@ void eeh_addr_cache_rmv_dev(struct pci_dev *dev) spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); } +/** + * eeh_addr_cache_init - Initialize a cache of I/O addresses + * + * Initialize a cache of pci i/o addresses. This cache will be used to + * find the pci device that corresponds to a given address. + */ +void eeh_addr_cache_init(void) +{ + spin_lock_init(&pci_io_addr_cache_root.piar_lock); +} + /** * eeh_addr_cache_build - Build a cache of I/O addresses * @@ -272,8 +283,6 @@ void eeh_addr_cache_build(void) struct eeh_dev *edev; struct pci_dev *dev = NULL; - spin_lock_init(&pci_io_addr_cache_root.piar_lock); - for_each_pci_dev(dev) { pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn); if (!pdn) -- cgit v1.2.3 From b905f8cdca7725e750a84f7188ea6821750124c3 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Fri, 16 Aug 2019 14:48:09 +1000 Subject: powerpc/eeh: EEH for pSeries hot plug On PowerNV and pSeries, devices currently acquire EEH support from several different places: Boot-time devices from eeh_probe_devices() and eeh_addr_cache_build(), Virtual Function devices from the pcibios bus add device hooks and hot plugged devices from pci_hp_add_devices() (with other platforms using other methods as well). Unfortunately, pSeries machines currently discover hot plugged devices using pci_rescan_bus(), not pci_hp_add_devices(), and so those devices do not receive EEH support. Rather than adding another case for pci_rescan_bus(), this change widens the scope of the pcibios bus add device hooks so that they can handle all devices. As a side effect this also supports devices discovered after manually rescanning via /sys/bus/pci/rescan. Note that on PowerNV, this change allows the EEH subsystem to become enabled after boot as long as it has not been forced off, which was not previously possible (it was already possible on pSeries). Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/72ae8ae9c54097158894a52de23690448de38ea9.1565930772.git.sbobroff@linux.ibm.com --- arch/powerpc/kernel/eeh.c | 4 ++- arch/powerpc/platforms/powernv/eeh-powernv.c | 39 +++++++++++++------- arch/powerpc/platforms/pseries/eeh_pseries.c | 54 ++++++++++++++-------------- 3 files changed, 56 insertions(+), 41 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 22f646176abb..abf4c1bb1fab 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1192,7 +1192,7 @@ void eeh_add_device_late(struct pci_dev *dev) struct pci_dn *pdn; struct eeh_dev *edev; - if (!dev || !eeh_enabled()) + if (!dev) return; pr_debug("EEH: Adding device %s\n", pci_name(dev)); @@ -1248,6 +1248,8 @@ void eeh_add_device_tree_late(struct pci_bus *bus) { struct pci_dev *dev; + if (eeh_has_flag(EEH_FORCE_DISABLED)) + return; list_for_each_entry(dev, &bus->devices, bus_list) { eeh_add_device_late(dev); if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 97461d49254e..00c0b81fd61d 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -41,7 +41,7 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev) { struct pci_dn *pdn = pci_get_pdn(pdev); - if (!pdev->is_virtfn) + if (eeh_has_flag(EEH_FORCE_DISABLED)) return; pr_debug("%s: EEH: Setting up device %s.\n", __func__, pci_name(pdev)); @@ -196,6 +196,25 @@ PNV_EEH_DBGFS_ENTRY(inbB, 0xE10); #endif /* CONFIG_DEBUG_FS */ +void pnv_eeh_enable_phbs(void) +{ + struct pci_controller *hose; + struct pnv_phb *phb; + + list_for_each_entry(hose, &hose_list, list_node) { + phb = hose->private_data; + /* + * If EEH is enabled, we're going to rely on that. + * Otherwise, we restore to conventional mechanism + * to clear frozen PE during PCI config access. + */ + if (eeh_enabled()) + phb->flags |= PNV_PHB_FLAG_EEH; + else + phb->flags &= ~PNV_PHB_FLAG_EEH; + } +} + /** * pnv_eeh_post_init - EEH platform dependent post initialization * @@ -234,19 +253,11 @@ int pnv_eeh_post_init(void) if (!eeh_enabled()) disable_irq(eeh_event_irq); + pnv_eeh_enable_phbs(); + list_for_each_entry(hose, &hose_list, list_node) { phb = hose->private_data; - /* - * If EEH is enabled, we're going to rely on that. - * Otherwise, we restore to conventional mechanism - * to clear frozen PE during PCI config access. - */ - if (eeh_enabled()) - phb->flags |= PNV_PHB_FLAG_EEH; - else - phb->flags &= ~PNV_PHB_FLAG_EEH; - /* Create debugfs entries */ #ifdef CONFIG_DEBUG_FS if (phb->has_dbgfs || !phb->dbgfs) @@ -454,7 +465,11 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) * Enable EEH explicitly so that we will do EEH check * while accessing I/O stuff */ - eeh_add_flag(EEH_ENABLED); + if (!eeh_has_flag(EEH_ENABLED)) { + enable_irq(eeh_event_irq); + pnv_eeh_enable_phbs(); + eeh_add_flag(EEH_ENABLED); + } /* Save memory bars */ eeh_save_bars(edev); diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 83ee69d22022..02454c737d11 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -42,44 +42,44 @@ static int ibm_get_config_addr_info; static int ibm_get_config_addr_info2; static int ibm_configure_pe; -#ifdef CONFIG_PCI_IOV void pseries_pcibios_bus_add_device(struct pci_dev *pdev) { struct pci_dn *pdn = pci_get_pdn(pdev); - struct pci_dn *physfn_pdn; - struct eeh_dev *edev; - if (!pdev->is_virtfn) + if (eeh_has_flag(EEH_FORCE_DISABLED)) return; pr_debug("%s: EEH: Setting up device %s.\n", __func__, pci_name(pdev)); +#ifdef CONFIG_PCI_IOV + if (pdev->is_virtfn) { + struct pci_dn *physfn_pdn; - pdn->device_id = pdev->device; - pdn->vendor_id = pdev->vendor; - pdn->class_code = pdev->class; - /* - * Last allow unfreeze return code used for retrieval - * by user space in eeh-sysfs to show the last command - * completion from platform. - */ - pdn->last_allow_rc = 0; - physfn_pdn = pci_get_pdn(pdev->physfn); - pdn->pe_number = physfn_pdn->pe_num_map[pdn->vf_index]; - edev = pdn_to_eeh_dev(pdn); - - /* - * The following operations will fail if VF's sysfs files - * aren't created or its resources aren't finalized. - */ + pdn->device_id = pdev->device; + pdn->vendor_id = pdev->vendor; + pdn->class_code = pdev->class; + /* + * Last allow unfreeze return code used for retrieval + * by user space in eeh-sysfs to show the last command + * completion from platform. + */ + pdn->last_allow_rc = 0; + physfn_pdn = pci_get_pdn(pdev->physfn); + pdn->pe_number = physfn_pdn->pe_num_map[pdn->vf_index]; + } +#endif eeh_add_device_early(pdn); eeh_add_device_late(pdev); - edev->pe_config_addr = (pdn->busno << 16) | (pdn->devfn << 8); - eeh_rmv_from_parent_pe(edev); /* Remove as it is adding to bus pe */ - eeh_add_to_parent_pe(edev); /* Add as VF PE type */ - eeh_sysfs_add_device(pdev); +#ifdef CONFIG_PCI_IOV + if (pdev->is_virtfn) { + struct eeh_dev *edev = pdn_to_eeh_dev(pdn); -} + edev->pe_config_addr = (pdn->busno << 16) | (pdn->devfn << 8); + eeh_rmv_from_parent_pe(edev); /* Remove as it is adding to bus pe */ + eeh_add_to_parent_pe(edev); /* Add as VF PE type */ + } #endif + eeh_sysfs_add_device(pdev); +} /* * Buffer for reporting slot-error-detail rtas calls. Its here @@ -146,10 +146,8 @@ static int pseries_eeh_init(void) /* Set EEH probe mode */ eeh_add_flag(EEH_PROBE_MODE_DEVTREE | EEH_ENABLE_IO_FOR_LOG); -#ifdef CONFIG_PCI_IOV /* Set EEH machine dependent code */ ppc_md.pcibios_bus_add_device = pseries_pcibios_bus_add_device; -#endif return 0; } -- cgit v1.2.3 From c44e4ccadaca5884ac82b6dfffbd693bec3b583e Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Fri, 16 Aug 2019 14:48:10 +1000 Subject: powerpc/eeh: Refactor around eeh_probe_devices() Now that EEH support for all devices (on PowerNV and pSeries) is provided by the pcibios bus add device hooks, eeh_probe_devices() and eeh_addr_cache_build() are redundant and can be removed. Move the EEH enabled message into it's own function so that it can be called from multiple places. Note that previously on pSeries, useless EEH sysfs files were created for some devices that did not have EEH support and this change prevents them from being created. Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/33b0a6339d5ac88693de092d6fba984f2a5add66.1565930772.git.sbobroff@linux.ibm.com --- arch/powerpc/include/asm/eeh.h | 7 ++---- arch/powerpc/kernel/eeh.c | 27 +++++++++-------------- arch/powerpc/kernel/eeh_cache.c | 32 ---------------------------- arch/powerpc/platforms/powernv/eeh-powernv.c | 4 +--- arch/powerpc/platforms/pseries/pci.c | 3 +-- 5 files changed, 14 insertions(+), 59 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 33e797eb0e6a..1fc2b5e40822 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -269,13 +269,12 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe); struct eeh_dev *eeh_dev_init(struct pci_dn *pdn); void eeh_dev_phb_init_dynamic(struct pci_controller *phb); -void eeh_probe_devices(void); +void eeh_show_enabled(void); int __init eeh_ops_register(struct eeh_ops *ops); int __exit eeh_ops_unregister(const char *name); int eeh_check_failure(const volatile void __iomem *token); int eeh_dev_check_failure(struct eeh_dev *edev); void eeh_addr_cache_init(void); -void eeh_addr_cache_build(void); void eeh_add_device_early(struct pci_dn *); void eeh_add_device_tree_early(struct pci_dn *); void eeh_add_device_late(struct pci_dev *); @@ -317,7 +316,7 @@ static inline bool eeh_enabled(void) return false; } -static inline void eeh_probe_devices(void) { } +static inline void eeh_show_enabled(void) { } static inline void *eeh_dev_init(struct pci_dn *pdn, void *data) { @@ -335,8 +334,6 @@ static inline int eeh_check_failure(const volatile void __iomem *token) static inline void eeh_addr_cache_init(void) { } -static inline void eeh_addr_cache_build(void) { } - static inline void eeh_add_device_early(struct pci_dn *pdn) { } static inline void eeh_add_device_tree_early(struct pci_dn *pdn) { } diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index abf4c1bb1fab..fc975342e242 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -150,6 +150,16 @@ static int __init eeh_setup(char *str) } __setup("eeh=", eeh_setup); +void eeh_show_enabled(void) +{ + if (eeh_has_flag(EEH_FORCE_DISABLED)) + pr_info("EEH: Recovery disabled by kernel parameter.\n"); + else if (eeh_has_flag(EEH_ENABLED)) + pr_info("EEH: Capable adapter found: recovery enabled.\n"); + else + pr_info("EEH: No capable adapters found: recovery disabled.\n"); +} + /* * This routine captures assorted PCI configuration space data * for the indicated PCI device, and puts them into a buffer @@ -1063,23 +1073,6 @@ static struct notifier_block eeh_reboot_nb = { .notifier_call = eeh_reboot_notifier, }; -void eeh_probe_devices(void) -{ - struct pci_controller *hose, *tmp; - struct pci_dn *pdn; - - /* Enable EEH for all adapters */ - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { - pdn = hose->pci_data; - traverse_pci_dn(pdn, eeh_ops->probe, NULL); - } - if (eeh_enabled()) - pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); - else - pr_info("EEH: No capable adapters found\n"); - -} - /** * eeh_init - EEH initialization * diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c index 75f0d66dee4b..1e47cd04a1bd 100644 --- a/arch/powerpc/kernel/eeh_cache.c +++ b/arch/powerpc/kernel/eeh_cache.c @@ -268,38 +268,6 @@ void eeh_addr_cache_init(void) spin_lock_init(&pci_io_addr_cache_root.piar_lock); } -/** - * eeh_addr_cache_build - Build a cache of I/O addresses - * - * Build a cache of pci i/o addresses. This cache will be used to - * find the pci device that corresponds to a given address. - * This routine scans all pci busses to build the cache. - * Must be run late in boot process, after the pci controllers - * have been scanned for devices (after all device resources are known). - */ -void eeh_addr_cache_build(void) -{ - struct pci_dn *pdn; - struct eeh_dev *edev; - struct pci_dev *dev = NULL; - - for_each_pci_dev(dev) { - pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn); - if (!pdn) - continue; - - edev = pdn_to_eeh_dev(pdn); - if (!edev) - continue; - - dev->dev.archdata.edev = edev; - edev->pdev = dev; - - eeh_addr_cache_insert_dev(dev); - eeh_sysfs_add_device(dev); - } -} - static int eeh_addr_cache_show(struct seq_file *s, void *v) { struct pci_io_addr_range *piar; diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 00c0b81fd61d..6ed9b6ccafcc 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -229,9 +229,7 @@ int pnv_eeh_post_init(void) struct pnv_phb *phb; int ret = 0; - /* Probe devices & build address cache */ - eeh_probe_devices(); - eeh_addr_cache_build(); + eeh_show_enabled(); /* Register OPAL event notifier */ eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR)); diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c index 1eae1d09980c..722830978639 100644 --- a/arch/powerpc/platforms/pseries/pci.c +++ b/arch/powerpc/platforms/pseries/pci.c @@ -229,8 +229,7 @@ void __init pSeries_final_fixup(void) pSeries_request_regions(); - eeh_probe_devices(); - eeh_addr_cache_build(); + eeh_show_enabled(); #ifdef CONFIG_PCI_IOV ppc_md.pcibios_sriov_enable = pseries_pcibios_sriov_enable; -- cgit v1.2.3 From 7c33a994d32d89937b23673e7507e8ec1daad893 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Fri, 16 Aug 2019 14:48:11 +1000 Subject: powerpc/eeh: Add bdfn field to eeh_dev Preparation for removing pci_dn from the powernv EEH code. The only thing we really use pci_dn for is to get the bdfn of the device for config space accesses, so adding that information to eeh_dev reduces the need to carry around the pci_dn. Signed-off-by: Oliver O'Halloran [SB: Re-wrapped commit message, fixed whitespace damage.] Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/e458eb69a1f591d8a120782f23a8506b15d3c654.1565930772.git.sbobroff@linux.ibm.com --- arch/powerpc/include/asm/eeh.h | 2 ++ arch/powerpc/include/asm/ppc-pci.h | 2 ++ arch/powerpc/kernel/eeh_dev.c | 2 ++ 3 files changed, 6 insertions(+) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 1fc2b5e40822..294efeaf939e 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -121,6 +121,8 @@ static inline bool eeh_pe_passed(struct eeh_pe *pe) struct eeh_dev { int mode; /* EEH mode */ int class_code; /* Class code of the device */ + int bdfn; /* bdfn of device (for cfg ops) */ + struct pci_controller *controller; int pe_config_addr; /* PE config address */ u32 config_space[16]; /* Saved PCI config space */ int pcix_cap; /* Saved PCIx capability */ diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index cec2d6409515..72860de205a0 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -74,6 +74,8 @@ static inline const char *eeh_driver_name(struct pci_dev *pdev) #endif /* CONFIG_EEH */ +#define PCI_BUSNO(bdfn) ((bdfn >> 8) & 0xff) + #else /* CONFIG_PCI */ static inline void init_pci_config_tokens(void) { } #endif /* !CONFIG_PCI */ diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c index c4317c452d98..7370185c7a05 100644 --- a/arch/powerpc/kernel/eeh_dev.c +++ b/arch/powerpc/kernel/eeh_dev.c @@ -47,6 +47,8 @@ struct eeh_dev *eeh_dev_init(struct pci_dn *pdn) /* Associate EEH device with OF node */ pdn->edev = edev; edev->pdn = pdn; + edev->bdfn = (pdn->busno << 8) | pdn->devfn; + edev->controller = pdn->phb; return edev; } -- cgit v1.2.3 From b093f2cbedfbaba6bf1c520fbfcb46403f3c7802 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Fri, 16 Aug 2019 14:48:12 +1000 Subject: powerpc/eeh: Introduce EEH edev logging macros Now that struct eeh_dev includes the BDFN of it's PCI device, make use of it to replace eeh_edev_info() with a set of dev_dbg()-style macros that only need a struct edev. With the BDFN available without the struct pci_dev, eeh_pci_name() is now unnecessary, so remove it. While only the "info" level function is used here, the others will be used in followup work. Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/f90ae9a53d762be7b0ccbad79e62b5a1b4f4996e.1565930772.git.sbobroff@linux.ibm.com --- arch/powerpc/include/asm/eeh.h | 11 +++++++++++ arch/powerpc/kernel/eeh_driver.c | 17 ----------------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 294efeaf939e..56c1b4b0987a 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -138,6 +138,17 @@ struct eeh_dev { struct pci_dev *physfn; /* Associated SRIOV PF */ }; +/* "fmt" must be a simple literal string */ +#define EEH_EDEV_PRINT(level, edev, fmt, ...) \ + pr_##level("PCI %04x:%02x:%02x.%x#%04x: EEH: " fmt, \ + (edev)->controller->global_number, PCI_BUSNO((edev)->bdfn), \ + PCI_SLOT((edev)->bdfn), PCI_FUNC((edev)->bdfn), \ + ((edev)->pe ? (edev)->pe_config_addr : 0xffff), ##__VA_ARGS__) +#define eeh_edev_dbg(edev, fmt, ...) EEH_EDEV_PRINT(debug, (edev), fmt, ##__VA_ARGS__) +#define eeh_edev_info(edev, fmt, ...) EEH_EDEV_PRINT(info, (edev), fmt, ##__VA_ARGS__) +#define eeh_edev_warn(edev, fmt, ...) EEH_EDEV_PRINT(warn, (edev), fmt, ##__VA_ARGS__) +#define eeh_edev_err(edev, fmt, ...) EEH_EDEV_PRINT(err, (edev), fmt, ##__VA_ARGS__) + static inline struct pci_dn *eeh_dev_to_pdn(struct eeh_dev *edev) { return edev ? edev->pdn : NULL; diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 1fbe541856f5..ab576bcbe4dd 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -81,23 +81,6 @@ static const char *pci_ers_result_name(enum pci_ers_result result) } }; -static __printf(2, 3) void eeh_edev_info(const struct eeh_dev *edev, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - printk(KERN_INFO "EEH: PE#%x (PCI %s): %pV\n", edev->pe_config_addr, - edev->pdev ? dev_name(&edev->pdev->dev) : "none", &vaf); - - va_end(args); -} - static enum pci_ers_result pci_ers_merge_result(enum pci_ers_result old, enum pci_ers_result new) { -- cgit v1.2.3 From 1ff8f36fc770dd2b3eb294312f270db8cf94cc13 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Fri, 16 Aug 2019 14:48:13 +1000 Subject: powerpc/eeh: Convert log messages to eeh_edev_* macros Convert existing messages, where appropriate, to use the eeh_edev_* logging macros. The only effect should be minor adjustments to the log messages, apart from: - A new message in pseries_eeh_probe() "Probing device" to match the powernv case. - The "Probing device" message in pnv_eeh_probe() is now generated slightly later, which will mean that it is no longer emitted for devices that aren't probed due to the initial checks. Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/ce505a0a7a4a5b0367f0f40f8b26e7c0a9cf4cb7.1565930772.git.sbobroff@linux.ibm.com --- arch/powerpc/include/asm/ppc-pci.h | 5 --- arch/powerpc/kernel/eeh.c | 19 +++++------ arch/powerpc/kernel/eeh_cache.c | 8 ++--- arch/powerpc/kernel/eeh_driver.c | 7 ++-- arch/powerpc/kernel/eeh_pe.c | 51 +++++++--------------------- arch/powerpc/platforms/powernv/eeh-powernv.c | 17 +++------- arch/powerpc/platforms/pseries/eeh_pseries.c | 21 ++++-------- 7 files changed, 39 insertions(+), 89 deletions(-) diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index 72860de205a0..7f4be5a05eb3 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -62,11 +62,6 @@ void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode); void eeh_sysfs_add_device(struct pci_dev *pdev); void eeh_sysfs_remove_device(struct pci_dev *pdev); -static inline const char *eeh_pci_name(struct pci_dev *pdev) -{ - return pdev ? pci_name(pdev) : ""; -} - static inline const char *eeh_driver_name(struct pci_dev *pdev) { return (pdev && pdev->driver) ? pdev->driver->name : ""; diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index fc975342e242..958e03ca1db6 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -470,8 +470,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev) /* Access to IO BARs might get this far and still not want checking. */ if (!pe) { eeh_stats.ignored_check++; - pr_debug("EEH: Ignored check for %s\n", - eeh_pci_name(dev)); + eeh_edev_dbg(edev, "Ignored check\n"); return 0; } @@ -511,12 +510,11 @@ int eeh_dev_check_failure(struct eeh_dev *edev) if (dn) location = of_get_property(dn, "ibm,loc-code", NULL); - printk(KERN_ERR "EEH: %d reads ignored for recovering device at " - "location=%s driver=%s pci addr=%s\n", + eeh_edev_err(edev, "%d reads ignored for recovering device at location=%s driver=%s\n", pe->check_count, location ? location : "unknown", - eeh_driver_name(dev), eeh_pci_name(dev)); - printk(KERN_ERR "EEH: Might be infinite loop in %s driver\n", + eeh_driver_name(dev)); + eeh_edev_err(edev, "Might be infinite loop in %s driver\n", eeh_driver_name(dev)); dump_stack(); } @@ -1188,12 +1186,11 @@ void eeh_add_device_late(struct pci_dev *dev) if (!dev) return; - pr_debug("EEH: Adding device %s\n", pci_name(dev)); - pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn); edev = pdn_to_eeh_dev(pdn); + eeh_edev_dbg(edev, "Adding device\n"); if (edev->pdev == dev) { - pr_debug("EEH: Device %s already referenced!\n", pci_name(dev)); + eeh_edev_dbg(edev, "Device already referenced!\n"); return; } @@ -1296,10 +1293,10 @@ void eeh_remove_device(struct pci_dev *dev) edev = pci_dev_to_eeh_dev(dev); /* Unregister the device with the EEH/PCI address search system */ - pr_debug("EEH: Removing device %s\n", pci_name(dev)); + dev_dbg(&dev->dev, "EEH: Removing device\n"); if (!edev || !edev->pdev || !edev->pe) { - pr_debug("EEH: Not referenced !\n"); + dev_dbg(&dev->dev, "EEH: Device not referenced!\n"); return; } diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c index 1e47cd04a1bd..cf11277ebd02 100644 --- a/arch/powerpc/kernel/eeh_cache.c +++ b/arch/powerpc/kernel/eeh_cache.c @@ -148,8 +148,8 @@ eeh_addr_cache_insert(struct pci_dev *dev, resource_size_t alo, piar->pcidev = dev; piar->flags = flags; - pr_debug("PIAR: insert range=[%pap:%pap] dev=%s\n", - &alo, &ahi, pci_name(dev)); + eeh_edev_dbg(piar->edev, "PIAR: insert range=[%pap:%pap]\n", + &alo, &ahi); rb_link_node(&piar->rb_node, parent, p); rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root); @@ -229,8 +229,8 @@ restart: piar = rb_entry(n, struct pci_io_addr_range, rb_node); if (piar->pcidev == dev) { - pr_debug("PIAR: remove range=[%pap:%pap] dev=%s\n", - &piar->addr_lo, &piar->addr_hi, pci_name(dev)); + eeh_edev_dbg(piar->edev, "PIAR: remove range=[%pap:%pap]\n", + &piar->addr_lo, &piar->addr_hi); rb_erase(n, &pci_io_addr_cache_root.rb_root); kfree(piar); goto restart; diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index ab576bcbe4dd..274075a814b6 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -456,12 +456,9 @@ static void *eeh_add_virt_device(struct eeh_dev *edev) { struct pci_driver *driver; struct pci_dev *dev = eeh_dev_to_pci_dev(edev); - struct pci_dn *pdn = eeh_dev_to_pdn(edev); if (!(edev->physfn)) { - pr_warn("%s: EEH dev %04x:%02x:%02x.%01x not for VF\n", - __func__, pdn->phb->global_number, pdn->busno, - PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); + eeh_edev_warn(edev, "Not for VF\n"); return NULL; } @@ -475,7 +472,7 @@ static void *eeh_add_virt_device(struct eeh_dev *edev) } #ifdef CONFIG_PCI_IOV - pci_iov_add_virtfn(edev->physfn, pdn->vf_index); + pci_iov_add_virtfn(edev->physfn, eeh_dev_to_pdn(edev)->vf_index); #endif return NULL; } diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 854cef7b18f4..317a31624526 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -379,8 +379,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) /* Check if the PE number is valid */ if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr) { - pr_err("%s: Invalid PE#0 for edev 0x%x on PHB#%x\n", - __func__, config_addr, pdn->phb->global_number); + eeh_edev_err(edev, "PE#0 is invalid for this PHB!\n"); return -EINVAL; } @@ -398,12 +397,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) /* Put the edev to PE */ list_add_tail(&edev->entry, &pe->edevs); - pr_debug("EEH: Add %04x:%02x:%02x.%01x to Bus PE#%x\n", - pdn->phb->global_number, - pdn->busno, - PCI_SLOT(pdn->devfn), - PCI_FUNC(pdn->devfn), - pe->addr); + eeh_edev_dbg(edev, "Added to bus PE\n"); return 0; } else if (pe && (pe->type & EEH_PE_INVALID)) { list_add_tail(&edev->entry, &pe->edevs); @@ -420,13 +414,8 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) parent = parent->parent; } - pr_debug("EEH: Add %04x:%02x:%02x.%01x to Device " - "PE#%x, Parent PE#%x\n", - pdn->phb->global_number, - pdn->busno, - PCI_SLOT(pdn->devfn), - PCI_FUNC(pdn->devfn), - pe->addr, pe->parent->addr); + eeh_edev_dbg(edev, "Added to device PE (parent: PE#%x)\n", + pe->parent->addr); return 0; } @@ -468,13 +457,8 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) list_add_tail(&pe->child, &parent->child_list); list_add_tail(&edev->entry, &pe->edevs); edev->pe = pe; - pr_debug("EEH: Add %04x:%02x:%02x.%01x to " - "Device PE#%x, Parent PE#%x\n", - pdn->phb->global_number, - pdn->busno, - PCI_SLOT(pdn->devfn), - PCI_FUNC(pdn->devfn), - pe->addr, pe->parent->addr); + eeh_edev_dbg(edev, "Added to device PE (parent: PE#%x)\n", + pe->parent->addr); return 0; } @@ -492,15 +476,10 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev) { struct eeh_pe *pe, *parent, *child; int cnt; - struct pci_dn *pdn = eeh_dev_to_pdn(edev); pe = eeh_dev_to_pe(edev); if (!pe) { - pr_debug("%s: No PE found for device %04x:%02x:%02x.%01x\n", - __func__, pdn->phb->global_number, - pdn->busno, - PCI_SLOT(pdn->devfn), - PCI_FUNC(pdn->devfn)); + eeh_edev_dbg(edev, "No PE found for device.\n"); return -EEXIST; } @@ -717,17 +696,13 @@ static void eeh_bridge_check_link(struct eeh_dev *edev) if (!(edev->mode & (EEH_DEV_ROOT_PORT | EEH_DEV_DS_PORT))) return; - pr_debug("%s: Check PCIe link for %04x:%02x:%02x.%01x ...\n", - __func__, pdn->phb->global_number, - pdn->busno, - PCI_SLOT(pdn->devfn), - PCI_FUNC(pdn->devfn)); + eeh_edev_dbg(edev, "Checking PCIe link...\n"); /* Check slot status */ cap = edev->pcie_cap; eeh_ops->read_config(pdn, cap + PCI_EXP_SLTSTA, 2, &val); if (!(val & PCI_EXP_SLTSTA_PDS)) { - pr_debug(" No card in the slot (0x%04x) !\n", val); + eeh_edev_dbg(edev, "No card in the slot (0x%04x) !\n", val); return; } @@ -736,7 +711,7 @@ static void eeh_bridge_check_link(struct eeh_dev *edev) if (val & PCI_EXP_SLTCAP_PCP) { eeh_ops->read_config(pdn, cap + PCI_EXP_SLTCTL, 2, &val); if (val & PCI_EXP_SLTCTL_PCC) { - pr_debug(" In power-off state, power it on ...\n"); + eeh_edev_dbg(edev, "In power-off state, power it on ...\n"); val &= ~(PCI_EXP_SLTCTL_PCC | PCI_EXP_SLTCTL_PIC); val |= (0x0100 & PCI_EXP_SLTCTL_PIC); eeh_ops->write_config(pdn, cap + PCI_EXP_SLTCTL, 2, val); @@ -752,7 +727,7 @@ static void eeh_bridge_check_link(struct eeh_dev *edev) /* Check link */ eeh_ops->read_config(pdn, cap + PCI_EXP_LNKCAP, 4, &val); if (!(val & PCI_EXP_LNKCAP_DLLLARC)) { - pr_debug(" No link reporting capability (0x%08x) \n", val); + eeh_edev_dbg(edev, "No link reporting capability (0x%08x) \n", val); msleep(1000); return; } @@ -769,10 +744,10 @@ static void eeh_bridge_check_link(struct eeh_dev *edev) } if (val & PCI_EXP_LNKSTA_DLLLA) - pr_debug(" Link up (%s)\n", + eeh_edev_dbg(edev, "Link up (%s)\n", (val & PCI_EXP_LNKSTA_CLS_2_5GB) ? "2.5GB" : "5GB"); else - pr_debug(" Link not ready (0x%04x)\n", val); + eeh_edev_dbg(edev, "Link not ready (0x%04x)\n", val); } #define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF)) diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 6ed9b6ccafcc..e7b867912f24 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -44,7 +44,7 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev) if (eeh_has_flag(EEH_FORCE_DISABLED)) return; - pr_debug("%s: EEH: Setting up device %s.\n", __func__, pci_name(pdev)); + dev_dbg(&pdev->dev, "EEH: Setting up device\n"); eeh_add_device_early(pdn); eeh_add_device_late(pdev); eeh_sysfs_add_device(pdev); @@ -370,10 +370,6 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) int ret; int config_addr = (pdn->busno << 8) | (pdn->devfn); - pr_debug("%s: probing %04x:%02x:%02x.%01x\n", - __func__, hose->global_number, pdn->busno, - PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); - /* * When probing the root bridge, which doesn't have any * subordinate PCI devices. We don't have OF node for @@ -387,6 +383,8 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) return NULL; + eeh_edev_dbg(edev, "Probing device\n"); + /* Initialize eeh device */ edev->class_code = pdn->class_code; edev->mode &= 0xFFFFFF00; @@ -412,9 +410,7 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) /* Create PE */ ret = eeh_add_to_parent_pe(edev); if (ret) { - pr_warn("%s: Can't add PCI dev %04x:%02x:%02x.%01x to parent PE (%x)\n", - __func__, hose->global_number, pdn->busno, - PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn), ret); + eeh_edev_warn(edev, "Failed to add device to PE (code %d)\n", ret); return NULL; } @@ -472,10 +468,7 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) /* Save memory bars */ eeh_save_bars(edev); - pr_debug("%s: EEH enabled on %02x:%02x.%01x PHB#%x-PE#%x\n", - __func__, pdn->busno, PCI_SLOT(pdn->devfn), - PCI_FUNC(pdn->devfn), edev->pe->phb->global_number, - edev->pe->addr); + eeh_edev_dbg(edev, "EEH enabled on device\n"); return NULL; } diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 02454c737d11..893ba3f562c4 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -49,7 +49,7 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev) if (eeh_has_flag(EEH_FORCE_DISABLED)) return; - pr_debug("%s: EEH: Setting up device %s.\n", __func__, pci_name(pdev)); + dev_dbg(&pdev->dev, "EEH: Setting up device\n"); #ifdef CONFIG_PCI_IOV if (pdev->is_virtfn) { struct pci_dn *physfn_pdn; @@ -238,10 +238,6 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) int enable = 0; int ret; - pr_debug("%s: probing %04x:%02x:%02x.%01x\n", - __func__, pdn->phb->global_number, pdn->busno, - PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); - /* Retrieve OF node and eeh device */ edev = pdn_to_eeh_dev(pdn); if (!edev || edev->pe) @@ -255,6 +251,8 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) return NULL; + eeh_edev_dbg(edev, "Probing device\n"); + /* * Update class code and mode of eeh device. We need * correctly reflects that current device is root port @@ -284,12 +282,10 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) pe.config_addr = (pdn->busno << 16) | (pdn->devfn << 8); /* Enable EEH on the device */ + eeh_edev_dbg(edev, "Enabling EEH on device\n"); ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE); if (ret) { - pr_debug("%s: EEH failed to enable on %02x:%02x.%01x PHB#%x-PE#%x (code %d)\n", - __func__, pdn->busno, PCI_SLOT(pdn->devfn), - PCI_FUNC(pdn->devfn), pe.phb->global_number, - pe.addr, ret); + eeh_edev_dbg(edev, "EEH failed to enable on device (code %d)\n", ret); } else { /* Retrieve PE address */ edev->pe_config_addr = eeh_ops->get_pe_addr(&pe); @@ -314,11 +310,8 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) edev->pe_config_addr = pdn_to_eeh_dev(pdn->parent)->pe_config_addr; eeh_add_to_parent_pe(edev); } - pr_debug("%s: EEH %s on %02x:%02x.%01x PHB#%x-PE#%x (code %d)\n", - __func__, (enable ? "enabled" : "unsupported"), - pdn->busno, PCI_SLOT(pdn->devfn), - PCI_FUNC(pdn->devfn), pe.phb->global_number, - pe.addr, ret); + eeh_edev_dbg(edev, "EEH is %s on device (code %d)\n", + (enable ? "enabled" : "unsupported"), ret); } /* Save memory bars */ -- cgit v1.2.3 From 2e25505147b8acf6510b9d5d951fd4c75f2e9bf2 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Fri, 16 Aug 2019 14:48:14 +1000 Subject: powerpc/eeh: Fix crash when edev->pdev changes If a PCI device is removed during eeh_pe_report_edev(), between the calls to device_lock() and device_unlock(), edev->pdev will change and cause a crash as the wrong mutex is released. To correct this, hold the PCI rescan/remove lock while taking a copy of edev->pdev and performing a get_device() on it. Use this value to release the mutex, but also pass it through to the device driver's EEH handlers so that they always see the same device. Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/3c590579a0faa24d20c826dcd26c739eb4d454e6.1565930772.git.sbobroff@linux.ibm.com --- arch/powerpc/kernel/eeh_driver.c | 44 ++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 274075a814b6..e817d78fe52d 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -257,20 +257,27 @@ static void eeh_set_irq_state(struct eeh_pe *root, bool enable) } typedef enum pci_ers_result (*eeh_report_fn)(struct eeh_dev *, + struct pci_dev *, struct pci_driver *); static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn, enum pci_ers_result *result) { + struct pci_dev *pdev; struct pci_driver *driver; enum pci_ers_result new_result; - if (!edev->pdev) { + pci_lock_rescan_remove(); + pdev = edev->pdev; + if (pdev) + get_device(&pdev->dev); + pci_unlock_rescan_remove(); + if (!pdev) { eeh_edev_info(edev, "no device"); return; } - device_lock(&edev->pdev->dev); + device_lock(&pdev->dev); if (eeh_edev_actionable(edev)) { - driver = eeh_pcid_get(edev->pdev); + driver = eeh_pcid_get(pdev); if (!driver) eeh_edev_info(edev, "no driver"); @@ -279,7 +286,7 @@ static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn, else if (edev->mode & EEH_DEV_NO_HANDLER) eeh_edev_info(edev, "driver bound too late"); else { - new_result = fn(edev, driver); + new_result = fn(edev, pdev, driver); eeh_edev_info(edev, "%s driver reports: '%s'", driver->name, pci_ers_result_name(new_result)); @@ -288,12 +295,15 @@ static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn, new_result); } if (driver) - eeh_pcid_put(edev->pdev); + eeh_pcid_put(pdev); } else { - eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!edev->pdev, + eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!pdev, !eeh_dev_removed(edev), !eeh_pe_passed(edev->pe)); } - device_unlock(&edev->pdev->dev); + device_unlock(&pdev->dev); + if (edev->pdev != pdev) + eeh_edev_warn(edev, "Device changed during processing!\n"); + put_device(&pdev->dev); } static void eeh_pe_report(const char *name, struct eeh_pe *root, @@ -320,20 +330,20 @@ static void eeh_pe_report(const char *name, struct eeh_pe *root, * Report an EEH error to each device driver. */ static enum pci_ers_result eeh_report_error(struct eeh_dev *edev, + struct pci_dev *pdev, struct pci_driver *driver) { enum pci_ers_result rc; - struct pci_dev *dev = edev->pdev; if (!driver->err_handler->error_detected) return PCI_ERS_RESULT_NONE; eeh_edev_info(edev, "Invoking %s->error_detected(IO frozen)", driver->name); - rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen); + rc = driver->err_handler->error_detected(pdev, pci_channel_io_frozen); edev->in_error = true; - pci_uevent_ers(dev, PCI_ERS_RESULT_NONE); + pci_uevent_ers(pdev, PCI_ERS_RESULT_NONE); return rc; } @@ -346,12 +356,13 @@ static enum pci_ers_result eeh_report_error(struct eeh_dev *edev, * are now enabled. */ static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev, + struct pci_dev *pdev, struct pci_driver *driver) { if (!driver->err_handler->mmio_enabled) return PCI_ERS_RESULT_NONE; eeh_edev_info(edev, "Invoking %s->mmio_enabled()", driver->name); - return driver->err_handler->mmio_enabled(edev->pdev); + return driver->err_handler->mmio_enabled(pdev); } /** @@ -365,12 +376,13 @@ static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev, * driver can work again while the device is recovered. */ static enum pci_ers_result eeh_report_reset(struct eeh_dev *edev, + struct pci_dev *pdev, struct pci_driver *driver) { if (!driver->err_handler->slot_reset || !edev->in_error) return PCI_ERS_RESULT_NONE; eeh_edev_info(edev, "Invoking %s->slot_reset()", driver->name); - return driver->err_handler->slot_reset(edev->pdev); + return driver->err_handler->slot_reset(pdev); } static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) @@ -411,13 +423,14 @@ static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) * to make the recovered device work again. */ static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev, + struct pci_dev *pdev, struct pci_driver *driver) { if (!driver->err_handler->resume || !edev->in_error) return PCI_ERS_RESULT_NONE; eeh_edev_info(edev, "Invoking %s->resume()", driver->name); - driver->err_handler->resume(edev->pdev); + driver->err_handler->resume(pdev); pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_RECOVERED); #ifdef CONFIG_PCI_IOV @@ -436,6 +449,7 @@ static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev, * dead, and that no further recovery attempts will be made on it. */ static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev, + struct pci_dev *pdev, struct pci_driver *driver) { enum pci_ers_result rc; @@ -445,10 +459,10 @@ static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev, eeh_edev_info(edev, "Invoking %s->error_detected(permanent failure)", driver->name); - rc = driver->err_handler->error_detected(edev->pdev, + rc = driver->err_handler->error_detected(pdev, pci_channel_io_perm_failure); - pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_DISCONNECT); + pci_uevent_ers(pdev, PCI_ERS_RESULT_DISCONNECT); return rc; } -- cgit v1.2.3 From cef50c67c1d511bbbc974cead2bebeb6f83730ce Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Fri, 16 Aug 2019 14:48:15 +1000 Subject: powerpc/eeh: Remove unused return path from eeh_pe_dev_traverse() There are no users of the early-out return value from eeh_pe_dev_traverse(), so remove it. Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c648070f5b28fe8ca1880b48e64b267959ffd369.1565930772.git.sbobroff@linux.ibm.com --- arch/powerpc/include/asm/eeh.h | 6 +++--- arch/powerpc/kernel/eeh.c | 16 +++++----------- arch/powerpc/kernel/eeh_driver.c | 26 +++++++++++--------------- arch/powerpc/kernel/eeh_pe.c | 25 +++++++------------------ 4 files changed, 26 insertions(+), 47 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 56c1b4b0987a..c13119a5e69b 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -260,7 +260,7 @@ static inline bool eeh_state_active(int state) == (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); } -typedef void *(*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag); +typedef void (*eeh_edev_traverse_func)(struct eeh_dev *edev, void *flag); typedef void *(*eeh_pe_traverse_func)(struct eeh_pe *pe, void *flag); void eeh_set_pe_aux_size(int size); int eeh_phb_pe_create(struct pci_controller *phb); @@ -274,8 +274,8 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev); void eeh_pe_update_time_stamp(struct eeh_pe *pe); void *eeh_pe_traverse(struct eeh_pe *root, eeh_pe_traverse_func fn, void *flag); -void *eeh_pe_dev_traverse(struct eeh_pe *root, - eeh_edev_traverse_func fn, void *flag); +void eeh_pe_dev_traverse(struct eeh_pe *root, + eeh_edev_traverse_func fn, void *flag); void eeh_pe_restore_bars(struct eeh_pe *pe); const char *eeh_pe_loc_get(struct eeh_pe *pe); struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 958e03ca1db6..7b2755f5c6fd 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -705,7 +705,7 @@ int eeh_pci_enable(struct eeh_pe *pe, int function) return rc; } -static void *eeh_disable_and_save_dev_state(struct eeh_dev *edev, +static void eeh_disable_and_save_dev_state(struct eeh_dev *edev, void *userdata) { struct pci_dev *pdev = eeh_dev_to_pci_dev(edev); @@ -716,7 +716,7 @@ static void *eeh_disable_and_save_dev_state(struct eeh_dev *edev, * state for the specified device */ if (!pdev || pdev == dev) - return NULL; + return; /* Ensure we have D0 power state */ pci_set_power_state(pdev, PCI_D0); @@ -729,18 +729,16 @@ static void *eeh_disable_and_save_dev_state(struct eeh_dev *edev, * interrupt from the device */ pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); - - return NULL; } -static void *eeh_restore_dev_state(struct eeh_dev *edev, void *userdata) +static void eeh_restore_dev_state(struct eeh_dev *edev, void *userdata) { struct pci_dn *pdn = eeh_dev_to_pdn(edev); struct pci_dev *pdev = eeh_dev_to_pci_dev(edev); struct pci_dev *dev = userdata; if (!pdev) - return NULL; + return; /* Apply customization from firmware */ if (pdn && eeh_ops->restore_config) @@ -749,8 +747,6 @@ static void *eeh_restore_dev_state(struct eeh_dev *edev, void *userdata) /* The caller should restore state for the specified device */ if (pdev != dev) pci_restore_state(pdev); - - return NULL; } int eeh_restore_vf_config(struct pci_dn *pdn) @@ -876,7 +872,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat * the indicated device and its children so that the bunch of the * devices could be reset properly. */ -static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag) +static void eeh_set_dev_freset(struct eeh_dev *edev, void *flag) { struct pci_dev *dev; unsigned int *freset = (unsigned int *)flag; @@ -884,8 +880,6 @@ static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag) dev = eeh_dev_to_pci_dev(edev); if (dev) *freset |= dev->needs_freset; - - return NULL; } static void eeh_pe_refreeze_passed(struct eeh_pe *root) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index e817d78fe52d..a31cd32c4ce9 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -197,12 +197,12 @@ static void eeh_enable_irq(struct eeh_dev *edev) } } -static void *eeh_dev_save_state(struct eeh_dev *edev, void *userdata) +static void eeh_dev_save_state(struct eeh_dev *edev, void *userdata) { struct pci_dev *pdev; if (!edev) - return NULL; + return; /* * We cannot access the config space on some adapters. @@ -212,14 +212,13 @@ static void *eeh_dev_save_state(struct eeh_dev *edev, void *userdata) * device is created. */ if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) - return NULL; + return; pdev = eeh_dev_to_pci_dev(edev); if (!pdev) - return NULL; + return; pci_save_state(pdev); - return NULL; } static void eeh_set_channel_state(struct eeh_pe *root, enum pci_channel_state s) @@ -385,12 +384,12 @@ static enum pci_ers_result eeh_report_reset(struct eeh_dev *edev, return driver->err_handler->slot_reset(pdev); } -static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) +static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) { struct pci_dev *pdev; if (!edev) - return NULL; + return; /* * The content in the config space isn't saved because @@ -402,15 +401,14 @@ static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) if (list_is_last(&edev->entry, &edev->pe->edevs)) eeh_pe_restore_bars(edev->pe); - return NULL; + return; } pdev = eeh_dev_to_pci_dev(edev); if (!pdev) - return NULL; + return; pci_restore_state(pdev); - return NULL; } /** @@ -491,7 +489,7 @@ static void *eeh_add_virt_device(struct eeh_dev *edev) return NULL; } -static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) +static void eeh_rmv_device(struct eeh_dev *edev, void *userdata) { struct pci_driver *driver; struct pci_dev *dev = eeh_dev_to_pci_dev(edev); @@ -506,7 +504,7 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) */ if (!eeh_edev_actionable(edev) || (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) - return NULL; + return; if (rmv_data) { driver = eeh_pcid_get(dev); @@ -515,7 +513,7 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) driver->err_handler->error_detected && driver->err_handler->slot_reset) { eeh_pcid_put(dev); - return NULL; + return; } eeh_pcid_put(dev); } @@ -548,8 +546,6 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) pci_stop_and_remove_bus_device(dev); pci_unlock_rescan_remove(); } - - return NULL; } static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata) diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 317a31624526..9c38fa7c33aa 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -231,29 +231,22 @@ void *eeh_pe_traverse(struct eeh_pe *root, * The function is used to traverse the devices of the specified * PE and its child PEs. */ -void *eeh_pe_dev_traverse(struct eeh_pe *root, +void eeh_pe_dev_traverse(struct eeh_pe *root, eeh_edev_traverse_func fn, void *flag) { struct eeh_pe *pe; struct eeh_dev *edev, *tmp; - void *ret; if (!root) { pr_warn("%s: Invalid PE %p\n", __func__, root); - return NULL; + return; } /* Traverse root PE */ - eeh_for_each_pe(root, pe) { - eeh_pe_for_each_dev(pe, edev, tmp) { - ret = fn(edev, flag); - if (ret) - return ret; - } - } - - return NULL; + eeh_for_each_pe(root, pe) + eeh_pe_for_each_dev(pe, edev, tmp) + fn(edev, flag); } /** @@ -602,13 +595,11 @@ void eeh_pe_mark_isolated(struct eeh_pe *root) } EXPORT_SYMBOL_GPL(eeh_pe_mark_isolated); -static void *__eeh_pe_dev_mode_mark(struct eeh_dev *edev, void *flag) +static void __eeh_pe_dev_mode_mark(struct eeh_dev *edev, void *flag) { int mode = *((int *)flag); edev->mode |= mode; - - return NULL; } /** @@ -827,7 +818,7 @@ static void eeh_restore_device_bars(struct eeh_dev *edev) * the expansion ROM base address, the latency timer, and etc. * from the saved values in the device node. */ -static void *eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag) +static void eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag) { struct pci_dn *pdn = eeh_dev_to_pdn(edev); @@ -839,8 +830,6 @@ static void *eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag) if (eeh_ops->restore_config && pdn) eeh_ops->restore_config(pdn); - - return NULL; } /** -- cgit v1.2.3 From 27d4396ed5a1c70f894557fd5732725eb1d94542 Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Fri, 16 Aug 2019 14:48:16 +1000 Subject: powerpc/eeh: Slightly simplify eeh_add_to_parent_pe() Simplify some needlessly complicated boolean logic in eeh_add_to_parent_pe(). Signed-off-by: Sam Bobroff Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/09259a50308f10aa764695912bc87dc1d1cf654c.1565930772.git.sbobroff@linux.ibm.com --- arch/powerpc/kernel/eeh_pe.c | 52 +++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 9c38fa7c33aa..1a6254bcf056 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -383,32 +383,34 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) * components. */ pe = eeh_pe_get(pdn->phb, edev->pe_config_addr, config_addr); - if (pe && !(pe->type & EEH_PE_INVALID)) { - /* Mark the PE as type of PCI bus */ - pe->type = EEH_PE_BUS; - edev->pe = pe; - - /* Put the edev to PE */ - list_add_tail(&edev->entry, &pe->edevs); - eeh_edev_dbg(edev, "Added to bus PE\n"); - return 0; - } else if (pe && (pe->type & EEH_PE_INVALID)) { - list_add_tail(&edev->entry, &pe->edevs); - edev->pe = pe; - /* - * We're running to here because of PCI hotplug caused by - * EEH recovery. We need clear EEH_PE_INVALID until the top. - */ - parent = pe; - while (parent) { - if (!(parent->type & EEH_PE_INVALID)) - break; - parent->type &= ~EEH_PE_INVALID; - parent = parent->parent; - } + if (pe) { + if (pe->type & EEH_PE_INVALID) { + list_add_tail(&edev->entry, &pe->edevs); + edev->pe = pe; + /* + * We're running to here because of PCI hotplug caused by + * EEH recovery. We need clear EEH_PE_INVALID until the top. + */ + parent = pe; + while (parent) { + if (!(parent->type & EEH_PE_INVALID)) + break; + parent->type &= ~EEH_PE_INVALID; + parent = parent->parent; + } - eeh_edev_dbg(edev, "Added to device PE (parent: PE#%x)\n", - pe->parent->addr); + eeh_edev_dbg(edev, + "Added to device PE (parent: PE#%x)\n", + pe->parent->addr); + } else { + /* Mark the PE as type of PCI bus */ + pe->type = EEH_PE_BUS; + edev->pe = pe; + + /* Put the edev to PE */ + list_add_tail(&edev->entry, &pe->edevs); + eeh_edev_dbg(edev, "Added to bus PE\n"); + } return 0; } -- cgit v1.2.3 From 6278f55ba5ed20f486ea2049da0cf9a9c1d9a5d5 Mon Sep 17 00:00:00 2001 From: Gustavo Romero Date: Wed, 14 Aug 2019 15:56:37 -0500 Subject: powerpc: Document xmon options Document all options currently supported by xmon debugger. Signed-off-by: Gustavo Romero Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190814205638.25322-1-gromero@linux.ibm.com --- Documentation/admin-guide/kernel-parameters.txt | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 7ccd158b3894..6d495aab4d0b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5305,3 +5305,22 @@ A hex value specifying bitmask with supplemental xhci host controller quirks. Meaning of each bit can be consulted in header drivers/usb/host/xhci.h. + + xmon [PPC] + Format: { early | on | rw | ro | off } + Controls if xmon debugger is enabled. Default is off. + Passing only "xmon" is equivalent to "xmon=early". + early Call xmon as early as possible on boot; xmon + debugger is called from setup_arch(). + on xmon debugger hooks will be installed so xmon + is only called on a kernel crash. Default mode, + i.e. either "ro" or "rw" mode, is controlled + with CONFIG_XMON_DEFAULT_RO_MODE. + rw xmon debugger hooks will be installed so xmon + is called only on a kernel crash, mode is write, + meaning SPR registers, memory and, other data + can be written using xmon commands. + ro same as "rw" option above but SPR registers, + memory, and other data can't be written using + xmon commands. + off xmon is disabled. -- cgit v1.2.3 From b8baa05a0e505f8311fb03f5c72e75fa0e13fd95 Mon Sep 17 00:00:00 2001 From: Gustavo Romero Date: Wed, 14 Aug 2019 15:56:38 -0500 Subject: selftests/powerpc: Ignore generated files Currently some binary files which are generated when tests are compiled are not ignored by git, so 'git status' catch them. For copyloops test, fix wrong binary names already in .gitignore. For ptrace, security, and stringloops tests add missing binary names to the .gitignore file. Signed-off-by: Gustavo Romero Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190814205638.25322-2-gromero@linux.ibm.com --- tools/testing/selftests/powerpc/copyloops/.gitignore | 8 ++++---- tools/testing/selftests/powerpc/ptrace/.gitignore | 3 +++ tools/testing/selftests/powerpc/security/.gitignore | 1 + tools/testing/selftests/powerpc/stringloops/.gitignore | 5 ++++- 4 files changed, 12 insertions(+), 5 deletions(-) create mode 100644 tools/testing/selftests/powerpc/security/.gitignore diff --git a/tools/testing/selftests/powerpc/copyloops/.gitignore b/tools/testing/selftests/powerpc/copyloops/.gitignore index ce12cd0e2967..de158104912a 100644 --- a/tools/testing/selftests/powerpc/copyloops/.gitignore +++ b/tools/testing/selftests/powerpc/copyloops/.gitignore @@ -1,13 +1,13 @@ copyuser_64_t0 copyuser_64_t1 copyuser_64_t2 -copyuser_power7_t0 -copyuser_power7_t1 +copyuser_p7_t0 +copyuser_p7_t1 memcpy_64_t0 memcpy_64_t1 memcpy_64_t2 -memcpy_power7_t0 -memcpy_power7_t1 +memcpy_p7_t0 +memcpy_p7_t1 copyuser_64_exc_t0 copyuser_64_exc_t1 copyuser_64_exc_t2 diff --git a/tools/testing/selftests/powerpc/ptrace/.gitignore b/tools/testing/selftests/powerpc/ptrace/.gitignore index 07ec449a2767..dce19f221c46 100644 --- a/tools/testing/selftests/powerpc/ptrace/.gitignore +++ b/tools/testing/selftests/powerpc/ptrace/.gitignore @@ -10,3 +10,6 @@ ptrace-tm-spd-vsx ptrace-tm-spr ptrace-hwbreak perf-hwbreak +core-pkey +ptrace-pkey +ptrace-syscall diff --git a/tools/testing/selftests/powerpc/security/.gitignore b/tools/testing/selftests/powerpc/security/.gitignore new file mode 100644 index 000000000000..0b969fba3beb --- /dev/null +++ b/tools/testing/selftests/powerpc/security/.gitignore @@ -0,0 +1 @@ +rfi_flush diff --git a/tools/testing/selftests/powerpc/stringloops/.gitignore b/tools/testing/selftests/powerpc/stringloops/.gitignore index 0b43da74ee46..31a17e0ba884 100644 --- a/tools/testing/selftests/powerpc/stringloops/.gitignore +++ b/tools/testing/selftests/powerpc/stringloops/.gitignore @@ -1 +1,4 @@ -memcmp +memcmp_64 +memcmp_32 +strlen +strlen_32 -- cgit v1.2.3 From 7e04a46d84f73ef0f21d2257d6ba2a194c0f1511 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Thu, 27 Jun 2019 15:30:08 +1000 Subject: powerpc/configs: Disable /dev/port in skiroot defconfig While reviewing lockdown patches, I discovered that we still enable /dev/port (CONFIG_DEVPORT) in skiroot. /dev/port is used for old x86 style IO accesses. It's set up in drivers/char/mem.c, and is only created if arch_has_dev_port() returns true. Per arch/powerpc/include/asm/io.h, on PPC64 with PCI, this is only true if there's a legacy ISA bridge. Even if a system has a legacy ISA bridge installed, we have no business accessing it in skiroot. Deselect CONFIG_DEVPORT for skiroot. Signed-off-by: Daniel Axtens [mpe: Incorporate emailed comments into the change log] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190627053008.29315-1-dja@axtens.net --- arch/powerpc/configs/skiroot_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index 557b530b2f70..1253482a67c0 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -213,6 +213,7 @@ CONFIG_IPMI_WATCHDOG=y CONFIG_HW_RANDOM=y CONFIG_TCG_TPM=y CONFIG_TCG_TIS_I2C_NUVOTON=y +# CONFIG_DEVPORT is not set CONFIG_I2C=y # CONFIG_I2C_COMPAT is not set CONFIG_I2C_CHARDEV=y -- cgit v1.2.3 From 9d535e200f09ce347afc38c81ec7f2901187e5f0 Mon Sep 17 00:00:00 2001 From: Gustavo Romero Date: Wed, 14 Aug 2019 15:52:11 -0500 Subject: selftests/powerpc: Fix and enhance TM signal context tests Currently TM signal context tests for GPR, FPR, VMX, and VSX registers print wrong register numbers (wrongly starting from register 0 instead of the first register in the non-volatile subset). Besides it the output when a mismatch happens is poor giving not much information about which context and which register mismatches, because it prints both contexts at the same time and not a comparison between the value that mismatches and the value expected and, moreover, it stops printing on the first mismatch, but it's important to know if there are other mismatches happening beyond the first one. For instance, this is the current output when a mismatch happens: test: tm_signal_context_chk_gpr tags: git_version:v5.2-8249-g02e970fae465-dirty Failed on 0 GPR 1 or 18446744073709551615 failure: tm_signal_context_chk_gpr test: tm_signal_context_chk_fpu tags: git_version:v5.2-8248-g09c289e3ef80 Failed on 0 FP -1 or -1 failure: tm_signal_context_chk_fpu test: tm_signal_context_chk_vmx tags: git_version:v5.2-8248-g09c289e3ef80 Failed on 0 vmx 0xfffffffffffffffefffffffdfffffffc vs 0xfffffffffffffffefffffffdfffffffc failure: tm_signal_context_chk_vmx test: tm_signal_context_chk_vsx tags: git_version:v5.2-8248-g09c289e3ef80 Failed on 0 vsx 0xfffffffffefffffffdfffffffcffffff vs 0xfffffffffefffffffdfffffffcffffff failure: tm_signal_context_chk_vsx This commit fixes the register numbers printed and enhances the error output by providing a full list of mismatching registers separated by the context (non-speculative or speculative context), for example: test: tm_signal_context_chk_gpr tags: git_version:v5.2-8249-g02e970fae465-dirty GPR14 (1st context) == 1 instead of -1 (expected) GPR15 (1st context) == 2 instead of -2 (expected) GPR14 (2nd context) == 0 instead of 18446744073709551615 (expected) GPR15 (2nd context) == 0 instead of 18446744073709551614 (expected) failure: tm_signal_context_chk_gpr test: tm_signal_context_chk_fpu tags: git_version:v5.2-8249-g02e970fae465-dirty FPR14 (1st context) == -1 instead of 1 (expected) FPR15 (1st context) == -2 instead of 2 (expected) failure: tm_signal_context_chk_fpu test: tm_signal_context_chk_vmx tags: git_version:v5.2-8249-g02e970fae465-dirty VMX20 (1st context) == 0xfffffffffffffffefffffffdfffffffc instead of 0x00000001000000020000000300000004 (expected) VMX21 (1st context) == 0xfffffffbfffffffafffffff9fffffff8 instead of 0x00000005000000060000000700000008 (expected) failure: tm_signal_context_chk_vmx test: tm_signal_context_chk_vsx tags: git_version:v5.2-8249-g02e970fae465-dirty VSX20 (1st context) == 0xfffffffffefffffffdfffffffcffffff instead of 0x00000001000000020000000300000004 (expected) VSX21 (1st context) == 0xfbfffffffafffffff9fffffff8ffffff instead of 0x00000005000000060000000700000008 (expected) failure: tm_signal_context_chk_vsx Finally, this commit adds comments to the tests in the hope that it will help people not so familiar with TM understand the tests. Signed-off-by: Gustavo Romero Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190814205211.24840-1-gromero@linux.ibm.com --- .../powerpc/tm/tm-signal-context-chk-fpu.c | 49 +++++--- .../powerpc/tm/tm-signal-context-chk-gpr.c | 59 +++++++--- .../powerpc/tm/tm-signal-context-chk-vmx.c | 74 ++++++++---- .../powerpc/tm/tm-signal-context-chk-vsx.c | 130 +++++++++++++++------ 4 files changed, 228 insertions(+), 84 deletions(-) diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c index d57c2d2ab6ec..254f912ad611 100644 --- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c +++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c @@ -5,10 +5,11 @@ * Test the kernel's signal frame code. * * The kernel sets up two sets of ucontexts if the signal was to be - * delivered while the thread was in a transaction. + * delivered while the thread was in a transaction (referred too as + * first and second contexts). * Expected behaviour is that the checkpointed state is in the user - * context passed to the signal handler. The speculated state can be - * accessed with the uc_link pointer. + * context passed to the signal handler (first context). The speculated + * state can be accessed with the uc_link pointer (second context). * * The rationale for this is that if TM unaware code (which linked * against TM libs) installs a signal handler it will not know of the @@ -28,17 +29,20 @@ #define MAX_ATTEMPT 500000 -#define NV_FPU_REGS 18 +#define NV_FPU_REGS 18 /* Number of non-volatile FP registers */ +#define FPR14 14 /* First non-volatile FP register to check in f14-31 subset */ long tm_signal_self_context_load(pid_t pid, long *gprs, double *fps, vector int *vms, vector int *vss); -/* Be sure there are 2x as many as there are NV FPU regs (2x18) */ +/* Test only non-volatile registers, i.e. 18 fpr registers from f14 to f31 */ static double fps[] = { + /* First context will be set with these values, i.e. non-speculative */ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + /* Second context will be set with these values, i.e. speculative */ -1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18 }; -static sig_atomic_t fail; +static sig_atomic_t fail, broken; static void signal_usr1(int signum, siginfo_t *info, void *uc) { @@ -46,11 +50,24 @@ static void signal_usr1(int signum, siginfo_t *info, void *uc) ucontext_t *ucp = uc; ucontext_t *tm_ucp = ucp->uc_link; - for (i = 0; i < NV_FPU_REGS && !fail; i++) { - fail = (ucp->uc_mcontext.fp_regs[i + 14] != fps[i]); - fail |= (tm_ucp->uc_mcontext.fp_regs[i + 14] != fps[i + NV_FPU_REGS]); - if (fail) - printf("Failed on %d FP %g or %g\n", i, ucp->uc_mcontext.fp_regs[i + 14], tm_ucp->uc_mcontext.fp_regs[i + 14]); + for (i = 0; i < NV_FPU_REGS; i++) { + /* Check first context. Print all mismatches. */ + fail = (ucp->uc_mcontext.fp_regs[FPR14 + i] != fps[i]); + if (fail) { + broken = 1; + printf("FPR%d (1st context) == %g instead of %g (expected)\n", + FPR14 + i, ucp->uc_mcontext.fp_regs[FPR14 + i], fps[i]); + } + } + + for (i = 0; i < NV_FPU_REGS; i++) { + /* Check second context. Print all mismatches. */ + fail = (tm_ucp->uc_mcontext.fp_regs[FPR14 + i] != fps[NV_FPU_REGS + i]); + if (fail) { + broken = 1; + printf("FPR%d (2nd context) == %g instead of %g (expected)\n", + FPR14 + i, tm_ucp->uc_mcontext.fp_regs[FPR14 + i], fps[NV_FPU_REGS + i]); + } } } @@ -72,13 +89,19 @@ static int tm_signal_context_chk_fpu() } i = 0; - while (i < MAX_ATTEMPT && !fail) { + while (i < MAX_ATTEMPT && !broken) { + /* + * tm_signal_self_context_load will set both first and second + * contexts accordingly to the values passed through non-NULL + * array pointers to it, in that case 'fps', and invoke the + * signal handler installed for SIGUSR1. + */ rc = tm_signal_self_context_load(pid, NULL, fps, NULL, NULL); FAIL_IF(rc != pid); i++; } - return fail; + return (broken); } int main(void) diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c index 4d05f8b0254c..0cc680f61828 100644 --- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c +++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c @@ -5,10 +5,11 @@ * Test the kernel's signal frame code. * * The kernel sets up two sets of ucontexts if the signal was to be - * delivered while the thread was in a transaction. + * delivered while the thread was in a transaction (referred too as + * first and second contexts). * Expected behaviour is that the checkpointed state is in the user - * context passed to the signal handler. The speculated state can be - * accessed with the uc_link pointer. + * context passed to the signal handler (first context). The speculated + * state can be accessed with the uc_link pointer (second context). * * The rationale for this is that if TM unaware code (which linked * against TM libs) installs a signal handler it will not know of the @@ -28,14 +29,22 @@ #define MAX_ATTEMPT 500000 -#define NV_GPR_REGS 18 +#define NV_GPR_REGS 18 /* Number of non-volatile GPR registers */ +#define R14 14 /* First non-volatile register to check in r14-r31 subset */ long tm_signal_self_context_load(pid_t pid, long *gprs, double *fps, vector int *vms, vector int *vss); -static sig_atomic_t fail; +static sig_atomic_t fail, broken; -static long gps[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, - -1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18}; +/* Test only non-volatile general purpose registers, i.e. r14-r31 */ +static long gprs[] = { + /* First context will be set with these values, i.e. non-speculative */ + /* R14, R15, ... */ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + /* Second context will be set with these values, i.e. speculative */ + /* R14, R15, ... */ + -1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18 +}; static void signal_usr1(int signum, siginfo_t *info, void *uc) { @@ -43,12 +52,24 @@ static void signal_usr1(int signum, siginfo_t *info, void *uc) ucontext_t *ucp = uc; ucontext_t *tm_ucp = ucp->uc_link; - for (i = 0; i < NV_GPR_REGS && !fail; i++) { - fail = (ucp->uc_mcontext.gp_regs[i + 14] != gps[i]); - fail |= (tm_ucp->uc_mcontext.gp_regs[i + 14] != gps[i + NV_GPR_REGS]); - if (fail) - printf("Failed on %d GPR %lu or %lu\n", i, - ucp->uc_mcontext.gp_regs[i + 14], tm_ucp->uc_mcontext.gp_regs[i + 14]); + /* Check first context. Print all mismatches. */ + for (i = 0; i < NV_GPR_REGS; i++) { + fail = (ucp->uc_mcontext.gp_regs[R14 + i] != gprs[i]); + if (fail) { + broken = 1; + printf("GPR%d (1st context) == %lu instead of %lu (expected)\n", + R14 + i, ucp->uc_mcontext.gp_regs[R14 + i], gprs[i]); + } + } + + /* Check second context. Print all mismatches. */ + for (i = 0; i < NV_GPR_REGS; i++) { + fail = (tm_ucp->uc_mcontext.gp_regs[R14 + i] != gprs[NV_GPR_REGS + i]); + if (fail) { + broken = 1; + printf("GPR%d (2nd context) == %lu instead of %lu (expected)\n", + R14 + i, tm_ucp->uc_mcontext.gp_regs[R14 + i], gprs[NV_GPR_REGS + i]); + } } } @@ -70,13 +91,19 @@ static int tm_signal_context_chk_gpr() } i = 0; - while (i < MAX_ATTEMPT && !fail) { - rc = tm_signal_self_context_load(pid, gps, NULL, NULL, NULL); + while (i < MAX_ATTEMPT && !broken) { + /* + * tm_signal_self_context_load will set both first and second + * contexts accordingly to the values passed through non-NULL + * array pointers to it, in that case 'gprs', and invoke the + * signal handler installed for SIGUSR1. + */ + rc = tm_signal_self_context_load(pid, gprs, NULL, NULL, NULL); FAIL_IF(rc != pid); i++; } - return fail; + return broken; } int main(void) diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c index 48ad01499b1a..b6d52730a0d8 100644 --- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c +++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c @@ -5,10 +5,11 @@ * Test the kernel's signal frame code. * * The kernel sets up two sets of ucontexts if the signal was to be - * delivered while the thread was in a transaction. + * delivered while the thread was in a transaction (referred too as + * first and second contexts). * Expected behaviour is that the checkpointed state is in the user - * context passed to the signal handler. The speculated state can be - * accessed with the uc_link pointer. + * context passed to the signal handler (first context). The speculated + * state can be accessed with the uc_link pointer (second context). * * The rationale for this is that if TM unaware code (which linked * against TM libs) installs a signal handler it will not know of the @@ -29,18 +30,24 @@ #define MAX_ATTEMPT 500000 -#define NV_VMX_REGS 12 +#define NV_VMX_REGS 12 /* Number of non-volatile VMX registers */ +#define VMX20 20 /* First non-volatile register to check in vr20-31 subset */ long tm_signal_self_context_load(pid_t pid, long *gprs, double *fps, vector int *vms, vector int *vss); -static sig_atomic_t fail; +static sig_atomic_t fail, broken; +/* Test only non-volatile registers, i.e. 12 vmx registers from vr20 to vr31 */ vector int vms[] = { - {1, 2, 3, 4 },{5, 6, 7, 8 },{9, 10,11,12}, + /* First context will be set with these values, i.e. non-speculative */ + /* VMX20 , VMX21 , ... */ + { 1, 2, 3, 4},{ 5, 6, 7, 8},{ 9,10,11,12}, {13,14,15,16},{17,18,19,20},{21,22,23,24}, {25,26,27,28},{29,30,31,32},{33,34,35,36}, {37,38,39,40},{41,42,43,44},{45,46,47,48}, - {-1, -2, -3, -4}, {-5, -6, -7, -8}, {-9, -10,-11,-12}, + /* Second context will be set with these values, i.e. speculative */ + /* VMX20 , VMX21 , ... */ + { -1, -2, -3, -4},{ -5, -6, -7, -8},{ -9,-10,-11,-12}, {-13,-14,-15,-16},{-17,-18,-19,-20},{-21,-22,-23,-24}, {-25,-26,-27,-28},{-29,-30,-31,-32},{-33,-34,-35,-36}, {-37,-38,-39,-40},{-41,-42,-43,-44},{-45,-46,-47,-48} @@ -48,26 +55,43 @@ vector int vms[] = { static void signal_usr1(int signum, siginfo_t *info, void *uc) { - int i; + int i, j; ucontext_t *ucp = uc; ucontext_t *tm_ucp = ucp->uc_link; - for (i = 0; i < NV_VMX_REGS && !fail; i++) { - fail = memcmp(ucp->uc_mcontext.v_regs->vrregs[i + 20], + for (i = 0; i < NV_VMX_REGS; i++) { + /* Check first context. Print all mismatches. */ + fail = memcmp(ucp->uc_mcontext.v_regs->vrregs[VMX20 + i], &vms[i], sizeof(vector int)); - fail |= memcmp(tm_ucp->uc_mcontext.v_regs->vrregs[i + 20], - &vms[i + NV_VMX_REGS], sizeof (vector int)); - if (fail) { - int j; + broken = 1; + printf("VMX%d (1st context) == 0x", VMX20 + i); + /* Print actual value in first context. */ + for (j = 0; j < 4; j++) + printf("%08x", ucp->uc_mcontext.v_regs->vrregs[VMX20 + i][j]); + printf(" instead of 0x"); + /* Print expected value. */ + for (j = 0; j < 4; j++) + printf("%08x", vms[i][j]); + printf(" (expected)\n"); + } + } - fprintf(stderr, "Failed on %d vmx 0x", i); + for (i = 0; i < NV_VMX_REGS; i++) { + /* Check second context. Print all mismatches. */ + fail = memcmp(tm_ucp->uc_mcontext.v_regs->vrregs[VMX20 + i], + &vms[NV_VMX_REGS + i], sizeof (vector int)); + if (fail) { + broken = 1; + printf("VMX%d (2nd context) == 0x", NV_VMX_REGS + i); + /* Print actual value in second context. */ + for (j = 0; j < 4; j++) + printf("%08x", tm_ucp->uc_mcontext.v_regs->vrregs[VMX20 + i][j]); + printf(" instead of 0x"); + /* Print expected value. */ for (j = 0; j < 4; j++) - fprintf(stderr, "%04x", ucp->uc_mcontext.v_regs->vrregs[i + 20][j]); - fprintf(stderr, " vs 0x"); - for (j = 0 ; j < 4; j++) - fprintf(stderr, "%04x", tm_ucp->uc_mcontext.v_regs->vrregs[i + 20][j]); - fprintf(stderr, "\n"); + printf("%08x", vms[NV_VMX_REGS + i][j]); + printf(" (expected)\n"); } } } @@ -90,13 +114,19 @@ static int tm_signal_context_chk() } i = 0; - while (i < MAX_ATTEMPT && !fail) { + while (i < MAX_ATTEMPT && !broken) { + /* + * tm_signal_self_context_load will set both first and second + * contexts accordingly to the values passed through non-NULL + * array pointers to it, in that case 'vms', and invoke the + * signal handler installed for SIGUSR1. + */ rc = tm_signal_self_context_load(pid, NULL, NULL, vms, NULL); FAIL_IF(rc != pid); i++; } - return fail; + return (broken); } int main(void) diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c index 8c8677a408bb..8e25e2072ecd 100644 --- a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c +++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c @@ -5,10 +5,11 @@ * Test the kernel's signal frame code. * * The kernel sets up two sets of ucontexts if the signal was to be - * delivered while the thread was in a transaction. + * delivered while the thread was in a transaction (referred too as + * first and second contexts). * Expected behaviour is that the checkpointed state is in the user - * context passed to the signal handler. The speculated state can be - * accessed with the uc_link pointer. + * context passed to the signal handler (first context). The speculated + * state can be accessed with the uc_link pointer (second context). * * The rationale for this is that if TM unaware code (which linked * against TM libs) installs a signal handler it will not know of the @@ -29,17 +30,24 @@ #define MAX_ATTEMPT 500000 -#define NV_VSX_REGS 12 +#define NV_VSX_REGS 12 /* Number of VSX registers to check. */ +#define VSX20 20 /* First VSX register to check in vsr20-vsr31 subset */ +#define FPR20 20 /* FPR20 overlaps VSX20 most significant doubleword */ long tm_signal_self_context_load(pid_t pid, long *gprs, double *fps, vector int *vms, vector int *vss); -static sig_atomic_t fail; +static sig_atomic_t fail, broken; -vector int vss[] = { - {1, 2, 3, 4 },{5, 6, 7, 8 },{9, 10,11,12}, +/* Test only 12 vsx registers from vsr20 to vsr31 */ +vector int vsxs[] = { + /* First context will be set with these values, i.e. non-speculative */ + /* VSX20 , VSX21 , ... */ + { 1, 2, 3, 4},{ 5, 6, 7, 8},{ 9,10,11,12}, {13,14,15,16},{17,18,19,20},{21,22,23,24}, {25,26,27,28},{29,30,31,32},{33,34,35,36}, {37,38,39,40},{41,42,43,44},{45,46,47,48}, + /* Second context will be set with these values, i.e. speculative */ + /* VSX20 , VSX21 , ... */ {-1, -2, -3, -4 },{-5, -6, -7, -8 },{-9, -10,-11,-12}, {-13,-14,-15,-16},{-17,-18,-19,-20},{-21,-22,-23,-24}, {-25,-26,-27,-28},{-29,-30,-31,-32},{-33,-34,-35,-36}, @@ -48,41 +56,91 @@ vector int vss[] = { static void signal_usr1(int signum, siginfo_t *info, void *uc) { - int i; - uint8_t vsc[sizeof(vector int)]; - uint8_t vst[sizeof(vector int)]; + int i, j; + uint8_t vsx[sizeof(vector int)]; + uint8_t vsx_tm[sizeof(vector int)]; ucontext_t *ucp = uc; ucontext_t *tm_ucp = ucp->uc_link; /* - * The other half of the VSX regs will be after v_regs. + * FP registers and VMX registers overlap the VSX registers. + * + * FP registers (f0-31) overlap the most significant 64 bits of VSX + * registers vsr0-31, whilst VMX registers vr0-31, being 128-bit like + * the VSX registers, overlap fully the other half of VSX registers, + * i.e. vr0-31 overlaps fully vsr32-63. + * + * Due to compatibility and historical reasons (VMX/Altivec support + * appeared first on the architecture), VMX registers vr0-31 (so VSX + * half vsr32-63 too) are stored right after the v_regs pointer, in an + * area allocated for 'vmx_reverse' array (please see + * arch/powerpc/include/uapi/asm/sigcontext.h for details about the + * mcontext_t structure on Power). + * + * The other VSX half (vsr0-31) is hence stored below vr0-31/vsr32-63 + * registers, but only the least significant 64 bits of vsr0-31. The + * most significant 64 bits of vsr0-31 (f0-31), as it overlaps the FP + * registers, is kept in fp_regs. + * + * v_regs is a 16 byte aligned pointer at the start of vmx_reserve + * (vmx_reserve may or may not be 16 aligned) where the v_regs structure + * exists, so v_regs points to where vr0-31 / vsr32-63 registers are + * fully stored. Since v_regs type is elf_vrregset_t, v_regs + 1 + * skips all the slots used to store vr0-31 / vsr32-64 and points to + * part of one VSX half, i.e. v_regs + 1 points to the least significant + * 64 bits of vsr0-31. The other part of this half (the most significant + * part of vsr0-31) is stored in fp_regs. * - * In short, vmx_reserve array holds everything. v_regs is a 16 - * byte aligned pointer at the start of vmx_reserve (vmx_reserve - * may or may not be 16 aligned) where the v_regs structure exists. - * (half of) The VSX regsters are directly after v_regs so the - * easiest way to find them below. */ + /* Get pointer to least significant doubleword of vsr0-31 */ long *vsx_ptr = (long *)(ucp->uc_mcontext.v_regs + 1); long *tm_vsx_ptr = (long *)(tm_ucp->uc_mcontext.v_regs + 1); - for (i = 0; i < NV_VSX_REGS && !fail; i++) { - memcpy(vsc, &ucp->uc_mcontext.fp_regs[i + 20], 8); - memcpy(vsc + 8, &vsx_ptr[20 + i], 8); - fail = memcmp(vsc, &vss[i], sizeof(vector int)); - memcpy(vst, &tm_ucp->uc_mcontext.fp_regs[i + 20], 8); - memcpy(vst + 8, &tm_vsx_ptr[20 + i], 8); - fail |= memcmp(vst, &vss[i + NV_VSX_REGS], sizeof(vector int)); - if (fail) { - int j; + /* Check first context. Print all mismatches. */ + for (i = 0; i < NV_VSX_REGS; i++) { + /* + * Copy VSX most significant doubleword from fp_regs and + * copy VSX least significant one from 64-bit slots below + * saved VMX registers. + */ + memcpy(vsx, &ucp->uc_mcontext.fp_regs[FPR20 + i], 8); + memcpy(vsx + 8, &vsx_ptr[VSX20 + i], 8); + + fail = memcmp(vsx, &vsxs[i], sizeof(vector int)); - fprintf(stderr, "Failed on %d vsx 0x", i); + if (fail) { + broken = 1; + printf("VSX%d (1st context) == 0x", VSX20 + i); for (j = 0; j < 16; j++) - fprintf(stderr, "%02x", vsc[j]); - fprintf(stderr, " vs 0x"); + printf("%02x", vsx[j]); + printf(" instead of 0x"); + for (j = 0; j < 4; j++) + printf("%08x", vsxs[i][j]); + printf(" (expected)\n"); + } + } + + /* Check second context. Print all mismatches. */ + for (i = 0; i < NV_VSX_REGS; i++) { + /* + * Copy VSX most significant doubleword from fp_regs and + * copy VSX least significant one from 64-bit slots below + * saved VMX registers. + */ + memcpy(vsx_tm, &tm_ucp->uc_mcontext.fp_regs[FPR20 + i], 8); + memcpy(vsx_tm + 8, &tm_vsx_ptr[VSX20 + i], 8); + + fail = memcmp(vsx_tm, &vsxs[NV_VSX_REGS + i], sizeof(vector int)); + + if (fail) { + broken = 1; + printf("VSX%d (2nd context) == 0x", VSX20 + i); for (j = 0; j < 16; j++) - fprintf(stderr, "%02x", vst[j]); - fprintf(stderr, "\n"); + printf("%02x", vsx_tm[j]); + printf(" instead of 0x"); + for (j = 0; j < 4; j++) + printf("%08x", vsxs[NV_VSX_REGS + i][j]); + printf("(expected)\n"); } } } @@ -105,13 +163,19 @@ static int tm_signal_context_chk() } i = 0; - while (i < MAX_ATTEMPT && !fail) { - rc = tm_signal_self_context_load(pid, NULL, NULL, NULL, vss); + while (i < MAX_ATTEMPT && !broken) { + /* + * tm_signal_self_context_load will set both first and second + * contexts accordingly to the values passed through non-NULL + * array pointers to it, in that case 'vsxs', and invoke the + * signal handler installed for SIGUSR1. + */ + rc = tm_signal_self_context_load(pid, NULL, NULL, NULL, vsxs); FAIL_IF(rc != pid); i++; } - return fail; + return (broken); } int main(void) -- cgit v1.2.3 From 6652bf6408895b09d31fd4128a1589a1a0672823 Mon Sep 17 00:00:00 2001 From: Gustavo Romero Date: Tue, 20 Aug 2019 18:54:11 -0400 Subject: selftests/powerpc: Retry on host facility unavailable TM test tm-unavailable must take into account aborts due to host aborting a transactin because of a facility unavailable exception, just like it already does for aborts on reschedules (TM_CAUSE_KVM_RESCHED). Reported-by: Desnes A. Nunes do Rosario Tested-by: Desnes A. Nunes do Rosario Signed-off-by: Gustavo Romero Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1566341651-19747-1-git-send-email-gromero@linux.vnet.ibm.com --- tools/testing/selftests/powerpc/tm/tm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/tm/tm.h b/tools/testing/selftests/powerpc/tm/tm.h index 97f9f491c541..c402464b038f 100644 --- a/tools/testing/selftests/powerpc/tm/tm.h +++ b/tools/testing/selftests/powerpc/tm/tm.h @@ -55,7 +55,8 @@ static inline bool failure_is_unavailable(void) static inline bool failure_is_reschedule(void) { if ((failure_code() & TM_CAUSE_RESCHED) == TM_CAUSE_RESCHED || - (failure_code() & TM_CAUSE_KVM_RESCHED) == TM_CAUSE_KVM_RESCHED) + (failure_code() & TM_CAUSE_KVM_RESCHED) == TM_CAUSE_KVM_RESCHED || + (failure_code() & TM_CAUSE_KVM_FAC_UNAV) == TM_CAUSE_KVM_FAC_UNAV) return true; return false; -- cgit v1.2.3 From d8f0e0b073e1ec52a05f0c2a56318b47387d2f10 Mon Sep 17 00:00:00 2001 From: "Christopher M. Riedl" Date: Thu, 23 May 2019 21:46:48 -0500 Subject: powerpc/64s: support nospectre_v2 cmdline option Add support for disabling the kernel implemented spectre v2 mitigation (count cache flush on context switch) via the nospectre_v2 and mitigations=off cmdline options. Suggested-by: Michael Ellerman Signed-off-by: Christopher M. Riedl Reviewed-by: Andrew Donnellan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190524024647.381-1-cmr@informatik.wtf --- arch/powerpc/kernel/security.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index e1c9cf079503..7cfcb294b11c 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -28,7 +28,7 @@ static enum count_cache_flush_type count_cache_flush_type = COUNT_CACHE_FLUSH_NO bool barrier_nospec_enabled; static bool no_nospec; static bool btb_flush_enabled; -#ifdef CONFIG_PPC_FSL_BOOK3E +#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3S_64) static bool no_spectrev2; #endif @@ -114,7 +114,7 @@ static __init int security_feature_debugfs_init(void) device_initcall(security_feature_debugfs_init); #endif /* CONFIG_DEBUG_FS */ -#ifdef CONFIG_PPC_FSL_BOOK3E +#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3S_64) static int __init handle_nospectre_v2(char *p) { no_spectrev2 = true; @@ -122,6 +122,9 @@ static int __init handle_nospectre_v2(char *p) return 0; } early_param("nospectre_v2", handle_nospectre_v2); +#endif /* CONFIG_PPC_FSL_BOOK3E || CONFIG_PPC_BOOK3S_64 */ + +#ifdef CONFIG_PPC_FSL_BOOK3E void setup_spectre_v2(void) { if (no_spectrev2 || cpu_mitigations_off()) @@ -399,7 +402,17 @@ static void toggle_count_cache_flush(bool enable) void setup_count_cache_flush(void) { - toggle_count_cache_flush(true); + bool enable = true; + + if (no_spectrev2 || cpu_mitigations_off()) { + if (security_ftr_enabled(SEC_FTR_BCCTRL_SERIALISED) || + security_ftr_enabled(SEC_FTR_COUNT_CACHE_DISABLED)) + pr_warn("Spectre v2 mitigations not under software control, can't disable\n"); + + enable = false; + } + + toggle_count_cache_flush(enable); } #ifdef CONFIG_DEBUG_FS -- cgit v1.2.3 From b4645ffc49cfe34f67feda20c34bd7a859c78312 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 22 Aug 2019 16:44:05 +0000 Subject: powerpc/64: don't select ARCH_HAS_SCALED_CPUTIME on book3E Book3E doesn't have SPRN_SPURR/SPRN_PURR. Activating ARCH_HAS_SCALED_CPUTIME is just wasting CPU time. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://github.com/linuxppc/issues/issues/171 Link: https://lore.kernel.org/r/a8b567c569aa521a7cf1beb061d43d79070e850c.1566492229.git.christophe.leroy@c-s.fr --- arch/powerpc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 78f52030e76a..3d65084f4066 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -133,7 +133,7 @@ config PPC select ARCH_HAS_PTE_DEVMAP if PPC_BOOK3S_64 select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_MEMBARRIER_CALLBACKS - select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE && PPC64 + select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE && PPC_BOOK3S_64 select ARCH_HAS_STRICT_KERNEL_RWX if ((PPC_BOOK3S_64 || PPC32) && !RELOCATABLE && !HIBERNATION) select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UACCESS_FLUSHCACHE -- cgit v1.2.3 From f0f8d7ae3924ed93453e30123e4aaf6f888ca555 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Aug 2019 14:07:09 +0000 Subject: powerpc: remove the ppc44x ocm.c file The on chip memory allocator is entirely unused in the kernel tree. Signed-off-by: Christoph Hellwig Acked-by: Christophe Leroy Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/7b1668941ad1041d08b19167030868de5840b153.1566309262.git.christophe.leroy@c-s.fr --- arch/powerpc/configs/ppc40x_defconfig | 1 - arch/powerpc/include/asm/ppc4xx_ocm.h | 31 --- arch/powerpc/platforms/44x/Kconfig | 8 - arch/powerpc/platforms/4xx/Makefile | 1 - arch/powerpc/platforms/4xx/ocm.c | 390 ---------------------------------- 5 files changed, 431 deletions(-) delete mode 100644 arch/powerpc/include/asm/ppc4xx_ocm.h delete mode 100644 arch/powerpc/platforms/4xx/ocm.c diff --git a/arch/powerpc/configs/ppc40x_defconfig b/arch/powerpc/configs/ppc40x_defconfig index 8f136b52198b..a5f683aed328 100644 --- a/arch/powerpc/configs/ppc40x_defconfig +++ b/arch/powerpc/configs/ppc40x_defconfig @@ -84,4 +84,3 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_PCBC=y CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_DES=y -CONFIG_PPC4xx_OCM=y diff --git a/arch/powerpc/include/asm/ppc4xx_ocm.h b/arch/powerpc/include/asm/ppc4xx_ocm.h deleted file mode 100644 index fc4db6dcde84..000000000000 --- a/arch/powerpc/include/asm/ppc4xx_ocm.h +++ /dev/null @@ -1,31 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * PowerPC 4xx OCM memory allocation support - * - * (C) Copyright 2009, Applied Micro Circuits Corporation - * Victor Gallardo (vgallardo@amcc.com) - * - * See file CREDITS for list of people who contributed to this - * project. - */ - -#ifndef __ASM_POWERPC_PPC4XX_OCM_H__ -#define __ASM_POWERPC_PPC4XX_OCM_H__ - -#define PPC4XX_OCM_NON_CACHED 0 -#define PPC4XX_OCM_CACHED 1 - -#if defined(CONFIG_PPC4xx_OCM) - -void *ppc4xx_ocm_alloc(phys_addr_t *phys, int size, int align, - int flags, const char *owner); -void ppc4xx_ocm_free(const void *virt); - -#else - -#define ppc4xx_ocm_alloc(phys, size, align, flags, owner) NULL -#define ppc4xx_ocm_free(addr) ((void)0) - -#endif /* CONFIG_PPC4xx_OCM */ - -#endif /* __ASM_POWERPC_PPC4XX_OCM_H__ */ diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig index b369ed4e3675..25ebe634a661 100644 --- a/arch/powerpc/platforms/44x/Kconfig +++ b/arch/powerpc/platforms/44x/Kconfig @@ -272,14 +272,6 @@ config PPC4xx_GPIO help Enable gpiolib support for ppc440 based boards -config PPC4xx_OCM - bool "PPC4xx On Chip Memory (OCM) support" - depends on 4xx - select PPC_LIB_RHEAP - help - Enable OCM support for PowerPC 4xx platforms with on chip memory, - OCM provides the fast place for memory access to improve performance. - # 44x specific CPU modules, selected based on the board above. config 440EP bool diff --git a/arch/powerpc/platforms/4xx/Makefile b/arch/powerpc/platforms/4xx/Makefile index f5ae27ca131b..d009d2e0b9e8 100644 --- a/arch/powerpc/platforms/4xx/Makefile +++ b/arch/powerpc/platforms/4xx/Makefile @@ -1,6 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y += uic.o machine_check.o -obj-$(CONFIG_PPC4xx_OCM) += ocm.o obj-$(CONFIG_4xx_SOC) += soc.o obj-$(CONFIG_PCI) += pci.o obj-$(CONFIG_PPC4xx_HSTA_MSI) += hsta_msi.o diff --git a/arch/powerpc/platforms/4xx/ocm.c b/arch/powerpc/platforms/4xx/ocm.c deleted file mode 100644 index ba3257406ced..000000000000 --- a/arch/powerpc/platforms/4xx/ocm.c +++ /dev/null @@ -1,390 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * PowerPC 4xx OCM memory allocation support - * - * (C) Copyright 2009, Applied Micro Circuits Corporation - * Victor Gallardo (vgallardo@amcc.com) - * - * See file CREDITS for list of people who contributed to this - * project. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#define OCM_DISABLED 0 -#define OCM_ENABLED 1 - -struct ocm_block { - struct list_head list; - void __iomem *addr; - int size; - const char *owner; -}; - -/* non-cached or cached region */ -struct ocm_region { - phys_addr_t phys; - void __iomem *virt; - - int memtotal; - int memfree; - - rh_info_t *rh; - struct list_head list; -}; - -struct ocm_info { - int index; - int status; - int ready; - - phys_addr_t phys; - - int alignment; - int memtotal; - int cache_size; - - struct ocm_region nc; /* non-cached region */ - struct ocm_region c; /* cached region */ -}; - -static struct ocm_info *ocm_nodes; -static int ocm_count; - -static struct ocm_info *ocm_get_node(unsigned int index) -{ - if (index >= ocm_count) { - printk(KERN_ERR "PPC4XX OCM: invalid index"); - return NULL; - } - - return &ocm_nodes[index]; -} - -static int ocm_free_region(struct ocm_region *ocm_reg, const void *addr) -{ - struct ocm_block *blk, *tmp; - unsigned long offset; - - if (!ocm_reg->virt) - return 0; - - list_for_each_entry_safe(blk, tmp, &ocm_reg->list, list) { - if (blk->addr == addr) { - offset = addr - ocm_reg->virt; - ocm_reg->memfree += blk->size; - rh_free(ocm_reg->rh, offset); - list_del(&blk->list); - kfree(blk); - return 1; - } - } - - return 0; -} - -static void __init ocm_init_node(int count, struct device_node *node) -{ - struct ocm_info *ocm; - - const unsigned int *cell_index; - const unsigned int *cache_size; - int len; - - struct resource rsrc; - - ocm = ocm_get_node(count); - - cell_index = of_get_property(node, "cell-index", &len); - if (!cell_index) { - printk(KERN_ERR "PPC4XX OCM: missing cell-index property"); - return; - } - ocm->index = *cell_index; - - if (of_device_is_available(node)) - ocm->status = OCM_ENABLED; - - cache_size = of_get_property(node, "cached-region-size", &len); - if (cache_size) - ocm->cache_size = *cache_size; - - if (of_address_to_resource(node, 0, &rsrc)) { - printk(KERN_ERR "PPC4XX OCM%d: could not get resource address\n", - ocm->index); - return; - } - - ocm->phys = rsrc.start; - ocm->memtotal = (rsrc.end - rsrc.start + 1); - - printk(KERN_INFO "PPC4XX OCM%d: %d Bytes (%s)\n", - ocm->index, ocm->memtotal, - (ocm->status == OCM_DISABLED) ? "disabled" : "enabled"); - - if (ocm->status == OCM_DISABLED) - return; - - /* request region */ - - if (!request_mem_region(ocm->phys, ocm->memtotal, "ppc4xx_ocm")) { - printk(KERN_ERR "PPC4XX OCM%d: could not request region\n", - ocm->index); - return; - } - - /* Configure non-cached and cached regions */ - - ocm->nc.phys = ocm->phys; - ocm->nc.memtotal = ocm->memtotal - ocm->cache_size; - ocm->nc.memfree = ocm->nc.memtotal; - - ocm->c.phys = ocm->phys + ocm->nc.memtotal; - ocm->c.memtotal = ocm->cache_size; - ocm->c.memfree = ocm->c.memtotal; - - if (ocm->nc.memtotal == 0) - ocm->nc.phys = 0; - - if (ocm->c.memtotal == 0) - ocm->c.phys = 0; - - printk(KERN_INFO "PPC4XX OCM%d: %d Bytes (non-cached)\n", - ocm->index, ocm->nc.memtotal); - - printk(KERN_INFO "PPC4XX OCM%d: %d Bytes (cached)\n", - ocm->index, ocm->c.memtotal); - - /* ioremap the non-cached region */ - if (ocm->nc.memtotal) { - ocm->nc.virt = __ioremap(ocm->nc.phys, ocm->nc.memtotal, - _PAGE_EXEC | pgprot_val(PAGE_KERNEL_NCG)); - - if (!ocm->nc.virt) { - printk(KERN_ERR - "PPC4XX OCM%d: failed to ioremap non-cached memory\n", - ocm->index); - ocm->nc.memfree = 0; - return; - } - } - - /* ioremap the cached region */ - - if (ocm->c.memtotal) { - ocm->c.virt = __ioremap(ocm->c.phys, ocm->c.memtotal, - _PAGE_EXEC | pgprot_val(PAGE_KERNEL)); - - if (!ocm->c.virt) { - printk(KERN_ERR - "PPC4XX OCM%d: failed to ioremap cached memory\n", - ocm->index); - ocm->c.memfree = 0; - return; - } - } - - /* Create Remote Heaps */ - - ocm->alignment = 4; /* default 4 byte alignment */ - - if (ocm->nc.virt) { - ocm->nc.rh = rh_create(ocm->alignment); - rh_attach_region(ocm->nc.rh, 0, ocm->nc.memtotal); - } - - if (ocm->c.virt) { - ocm->c.rh = rh_create(ocm->alignment); - rh_attach_region(ocm->c.rh, 0, ocm->c.memtotal); - } - - INIT_LIST_HEAD(&ocm->nc.list); - INIT_LIST_HEAD(&ocm->c.list); - - ocm->ready = 1; -} - -static int ocm_debugfs_show(struct seq_file *m, void *v) -{ - struct ocm_block *blk, *tmp; - unsigned int i; - - for (i = 0; i < ocm_count; i++) { - struct ocm_info *ocm = ocm_get_node(i); - - if (!ocm || !ocm->ready) - continue; - - seq_printf(m, "PPC4XX OCM : %d\n", ocm->index); - seq_printf(m, "PhysAddr : %pa\n", &(ocm->phys)); - seq_printf(m, "MemTotal : %d Bytes\n", ocm->memtotal); - seq_printf(m, "MemTotal(NC) : %d Bytes\n", ocm->nc.memtotal); - seq_printf(m, "MemTotal(C) : %d Bytes\n\n", ocm->c.memtotal); - - seq_printf(m, "NC.PhysAddr : %pa\n", &(ocm->nc.phys)); - seq_printf(m, "NC.VirtAddr : 0x%p\n", ocm->nc.virt); - seq_printf(m, "NC.MemTotal : %d Bytes\n", ocm->nc.memtotal); - seq_printf(m, "NC.MemFree : %d Bytes\n", ocm->nc.memfree); - - list_for_each_entry_safe(blk, tmp, &ocm->nc.list, list) { - seq_printf(m, "NC.MemUsed : %d Bytes (%s)\n", - blk->size, blk->owner); - } - - seq_printf(m, "\nC.PhysAddr : %pa\n", &(ocm->c.phys)); - seq_printf(m, "C.VirtAddr : 0x%p\n", ocm->c.virt); - seq_printf(m, "C.MemTotal : %d Bytes\n", ocm->c.memtotal); - seq_printf(m, "C.MemFree : %d Bytes\n", ocm->c.memfree); - - list_for_each_entry_safe(blk, tmp, &ocm->c.list, list) { - seq_printf(m, "C.MemUsed : %d Bytes (%s)\n", - blk->size, blk->owner); - } - - seq_putc(m, '\n'); - } - - return 0; -} - -static int ocm_debugfs_open(struct inode *inode, struct file *file) -{ - return single_open(file, ocm_debugfs_show, NULL); -} - -static const struct file_operations ocm_debugfs_fops = { - .open = ocm_debugfs_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int ocm_debugfs_init(void) -{ - struct dentry *junk; - - junk = debugfs_create_dir("ppc4xx_ocm", 0); - if (!junk) { - printk(KERN_ALERT "debugfs ppc4xx ocm: failed to create dir\n"); - return -1; - } - - if (debugfs_create_file("info", 0644, junk, NULL, &ocm_debugfs_fops)) { - printk(KERN_ALERT "debugfs ppc4xx ocm: failed to create file\n"); - return -1; - } - - return 0; -} - -void *ppc4xx_ocm_alloc(phys_addr_t *phys, int size, int align, - int flags, const char *owner) -{ - void __iomem *addr = NULL; - unsigned long offset; - struct ocm_info *ocm; - struct ocm_region *ocm_reg; - struct ocm_block *ocm_blk; - int i; - - for (i = 0; i < ocm_count; i++) { - ocm = ocm_get_node(i); - - if (!ocm || !ocm->ready) - continue; - - if (flags == PPC4XX_OCM_NON_CACHED) - ocm_reg = &ocm->nc; - else - ocm_reg = &ocm->c; - - if (!ocm_reg->virt) - continue; - - if (align < ocm->alignment) - align = ocm->alignment; - - offset = rh_alloc_align(ocm_reg->rh, size, align, NULL); - - if (IS_ERR_VALUE(offset)) - continue; - - ocm_blk = kzalloc(sizeof(*ocm_blk), GFP_KERNEL); - if (!ocm_blk) { - rh_free(ocm_reg->rh, offset); - break; - } - - *phys = ocm_reg->phys + offset; - addr = ocm_reg->virt + offset; - size = ALIGN(size, align); - - ocm_blk->addr = addr; - ocm_blk->size = size; - ocm_blk->owner = owner; - list_add_tail(&ocm_blk->list, &ocm_reg->list); - - ocm_reg->memfree -= size; - - break; - } - - return addr; -} - -void ppc4xx_ocm_free(const void *addr) -{ - int i; - - if (!addr) - return; - - for (i = 0; i < ocm_count; i++) { - struct ocm_info *ocm = ocm_get_node(i); - - if (!ocm || !ocm->ready) - continue; - - if (ocm_free_region(&ocm->nc, addr) || - ocm_free_region(&ocm->c, addr)) - return; - } -} - -static int __init ppc4xx_ocm_init(void) -{ - struct device_node *np; - int count; - - count = 0; - for_each_compatible_node(np, NULL, "ibm,ocm") - count++; - - if (!count) - return 0; - - ocm_nodes = kzalloc((count * sizeof(struct ocm_info)), GFP_KERNEL); - if (!ocm_nodes) - return -ENOMEM; - - ocm_count = count; - count = 0; - - for_each_compatible_node(np, NULL, "ibm,ocm") { - ocm_init_node(count, np); - count++; - } - - ocm_debugfs_init(); - - return 0; -} - -arch_initcall(ppc4xx_ocm_init); -- cgit v1.2.3 From 6f57e6631d7c6ae34a5e811f7f49cb63647a9cab Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 20 Aug 2019 14:07:10 +0000 Subject: powerpc/ps3: replace __ioremap() by ioremap_prot() __ioremap() is similar to ioremap_prot() except that ioremap_prot() does a few sanity changes in addition. The flags used by PS3 are not impacted by those changes so for PS3 both functions are equivalent. At the same time, drop parts of the comment that have been invalid since commit e58e87adc8bf ("powerpc/mm: Update _PAGE_KERNEL_RO") Suggested-by: Christoph Hellwig Signed-off-by: Christophe Leroy Reviewed-by: Christoph Hellwig Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/36bff5d875ff562889c5e12dab63e5d7c5d1fbd8.1566309262.git.christophe.leroy@c-s.fr --- arch/powerpc/platforms/ps3/spu.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/platforms/ps3/spu.c b/arch/powerpc/platforms/ps3/spu.c index bdaeaecdc06b..1193c294b8d0 100644 --- a/arch/powerpc/platforms/ps3/spu.c +++ b/arch/powerpc/platforms/ps3/spu.c @@ -184,10 +184,7 @@ static void spu_unmap(struct spu *spu) * setup_areas - Map the spu regions into the address space. * * The current HV requires the spu shadow regs to be mapped with the - * PTE page protection bits set as read-only (PP=3). This implementation - * uses the low level __ioremap() to bypass the page protection settings - * inforced by ioremap_prot() to get the needed PTE bits set for the - * shadow regs. + * PTE page protection bits set as read-only. */ static int __init setup_areas(struct spu *spu) @@ -195,9 +192,8 @@ static int __init setup_areas(struct spu *spu) struct table {char* name; unsigned long addr; unsigned long size;}; unsigned long shadow_flags = pgprot_val(pgprot_noncached_wc(PAGE_KERNEL_RO)); - spu_pdata(spu)->shadow = __ioremap(spu_pdata(spu)->shadow_addr, - sizeof(struct spe_shadow), - shadow_flags); + spu_pdata(spu)->shadow = ioremap_prot(spu_pdata(spu)->shadow_addr, + sizeof(struct spe_shadow), shadow_flags); if (!spu_pdata(spu)->shadow) { pr_debug("%s:%d: ioremap shadow failed\n", __func__, __LINE__); goto fail_ioremap; -- cgit v1.2.3 From 8aee077292a34eaca717445290f12d23f8bd7732 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 20 Aug 2019 14:07:11 +0000 Subject: powerpc/mm: drop ppc_md.iounmap() and __iounmap() ppc_md.iounmap() is never set, drop it. Once ppc_md.iounmap() is gone, iounmap() remains the only user of __iounmap() and iounmap() does nothing else than calling __iounmap(). So drop iounmap() and make __iounmap() the new iounmap(). Reviewed-by: Christoph Hellwig Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/d73ba92bb7a387cc58cc34666d7f5158a45851b0.1566309262.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/io.h | 5 ----- arch/powerpc/include/asm/machdep.h | 2 -- arch/powerpc/mm/pgtable_64.c | 11 +---------- 3 files changed, 1 insertion(+), 17 deletions(-) diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 23e5d5d16c7e..02d6256fe1ea 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -712,9 +712,6 @@ static inline void iosync(void) * * __ioremap_caller is the same as above but takes an explicit caller * reference rather than using __builtin_return_address(0) * - * * __iounmap, is the low level implementation used by iounmap and cannot - * be hooked (but can be used by a hook on iounmap) - * */ extern void __iomem *ioremap(phys_addr_t address, unsigned long size); extern void __iomem *ioremap_prot(phys_addr_t address, unsigned long size, @@ -734,8 +731,6 @@ extern void __iomem *__ioremap(phys_addr_t, unsigned long size, extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size, pgprot_t prot, void *caller); -extern void __iounmap(volatile void __iomem *addr); - extern void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot); extern void __iounmap_at(void *ea, unsigned long size); diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index c43d6eca9edd..3370df4bdaa0 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -33,8 +33,6 @@ struct machdep_calls { #ifdef CONFIG_PPC64 void __iomem * (*ioremap)(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller); - void (*iounmap)(volatile void __iomem *token); - #ifdef CONFIG_PM void (*iommu_save)(void); void (*iommu_restore)(void); diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 9ad59b733984..57cdd6182932 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -266,7 +266,7 @@ void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, * Unmap an IO region and remove it from imalloc'd list. * Access to IO memory should be serialized by driver. */ -void __iounmap(volatile void __iomem *token) +void iounmap(volatile void __iomem *token) { void *addr; @@ -283,21 +283,12 @@ void __iounmap(volatile void __iomem *token) vunmap(addr); } -void iounmap(volatile void __iomem *token) -{ - if (ppc_md.iounmap) - ppc_md.iounmap(token); - else - __iounmap(token); -} - EXPORT_SYMBOL(ioremap); EXPORT_SYMBOL(ioremap_wc); EXPORT_SYMBOL(ioremap_prot); EXPORT_SYMBOL(__ioremap); EXPORT_SYMBOL(__ioremap_at); EXPORT_SYMBOL(iounmap); -EXPORT_SYMBOL(__iounmap); EXPORT_SYMBOL(__iounmap_at); #ifndef __PAGETABLE_PUD_FOLDED -- cgit v1.2.3 From 492643e81e58e7585cccb22c37814060e3f59268 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 20 Aug 2019 14:07:12 +0000 Subject: powerpc/mm: drop function __ioremap() __ioremap() is not used anymore, drop it. Suggested-by: Christoph Hellwig Signed-off-by: Christophe Leroy Reviewed-by: Christoph Hellwig Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/ccc439f481a0884e00a6be1bab44bab2a4477fea.1566309262.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/io.h | 6 ------ arch/powerpc/mm/pgtable_32.c | 11 ++--------- arch/powerpc/mm/pgtable_64.c | 7 ------- 3 files changed, 2 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 02d6256fe1ea..8e65ba59f06a 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -705,10 +705,6 @@ static inline void iosync(void) * create hand-made mappings for use only by the PCI code and cannot * currently be hooked. Must be page aligned. * - * * __ioremap is the low level implementation used by ioremap and - * ioremap_prot and cannot be hooked (but can be used by a hook on one - * of the previous ones) - * * * __ioremap_caller is the same as above but takes an explicit caller * reference rather than using __builtin_return_address(0) * @@ -726,8 +722,6 @@ void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size); extern void iounmap(volatile void __iomem *addr); -extern void __iomem *__ioremap(phys_addr_t, unsigned long size, - unsigned long flags); extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size, pgprot_t prot, void *caller); diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 97f401a06fcc..8cc5e9e83fc3 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -91,12 +91,6 @@ ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) } EXPORT_SYMBOL(ioremap_prot); -void __iomem * -__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags) -{ - return __ioremap_caller(addr, size, __pgprot(flags), __builtin_return_address(0)); -} - void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller) { @@ -127,8 +121,8 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call */ if (slab_is_available() && p <= virt_to_phys(high_memory - 1) && page_is_ram(__phys_to_pfn(p))) { - printk("__ioremap(): phys addr 0x%llx is RAM lr %ps\n", - (unsigned long long)p, __builtin_return_address(0)); + pr_warn("%s(): phys addr 0x%llx is RAM lr %ps\n", __func__, + (unsigned long long)p, __builtin_return_address(0)); return NULL; } #endif @@ -171,7 +165,6 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call out: return (void __iomem *) (v + ((unsigned long)addr & ~PAGE_MASK)); } -EXPORT_SYMBOL(__ioremap); void iounmap(volatile void __iomem *addr) { diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 57cdd6182932..2882419737b9 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -203,12 +203,6 @@ void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size, return ret; } -void __iomem * __ioremap(phys_addr_t addr, unsigned long size, - unsigned long flags) -{ - return __ioremap_caller(addr, size, __pgprot(flags), __builtin_return_address(0)); -} - void __iomem * ioremap(phys_addr_t addr, unsigned long size) { pgprot_t prot = pgprot_noncached(PAGE_KERNEL); @@ -286,7 +280,6 @@ void iounmap(volatile void __iomem *token) EXPORT_SYMBOL(ioremap); EXPORT_SYMBOL(ioremap_wc); EXPORT_SYMBOL(ioremap_prot); -EXPORT_SYMBOL(__ioremap); EXPORT_SYMBOL(__ioremap_at); EXPORT_SYMBOL(iounmap); EXPORT_SYMBOL(__iounmap_at); -- cgit v1.2.3 From 14b4d97669b79d1ac83e64d6795098394e15ab1b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 20 Aug 2019 14:07:13 +0000 Subject: powerpc/mm: rework io-workaround invocation. ppc_md.ioremap() is only used for I/O workaround on CELL platform, so indirect function call can be avoided. This patch reworks the io-workaround and ioremap() functions to use the global 'io_workaround_inited' flag for the activation of io-workaround. When CONFIG_PPC_IO_WORKAROUNDS or CONFIG_PPC_INDIRECT_MMIO are not selected, the I/O workaround ioremap() voids and the global flag is not used. Signed-off-by: Christophe Leroy Reviewed-by: Christoph Hellwig Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/5fa3ef069fbd0f152512afaae19e7a60161454cf.1566309262.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/io-workarounds.h | 20 ++++++++++++++++++++ arch/powerpc/include/asm/machdep.h | 2 -- arch/powerpc/kernel/io-workarounds.c | 13 +++++-------- arch/powerpc/mm/pgtable_64.c | 17 +++++++++-------- 4 files changed, 34 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/include/asm/io-workarounds.h b/arch/powerpc/include/asm/io-workarounds.h index 01567ea4ceaf..3cce499fbe27 100644 --- a/arch/powerpc/include/asm/io-workarounds.h +++ b/arch/powerpc/include/asm/io-workarounds.h @@ -8,6 +8,7 @@ #ifndef _IO_WORKAROUNDS_H #define _IO_WORKAROUNDS_H +#ifdef CONFIG_PPC_IO_WORKAROUNDS #include #include @@ -32,4 +33,23 @@ extern int spiderpci_iowa_init(struct iowa_bus *, void *); #define SPIDER_PCI_DUMMY_READ 0x0810 #define SPIDER_PCI_DUMMY_READ_BASE 0x0814 +#endif + +#if defined(CONFIG_PPC_IO_WORKAROUNDS) && defined(CONFIG_PPC_INDIRECT_MMIO) +extern bool io_workaround_inited; + +static inline bool iowa_is_active(void) +{ + return unlikely(io_workaround_inited); +} +#else +static inline bool iowa_is_active(void) +{ + return false; +} +#endif + +void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size, + pgprot_t prot, void *caller); + #endif /* _IO_WORKAROUNDS_H */ diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 3370df4bdaa0..657ec893bdcb 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -31,8 +31,6 @@ struct pci_host_bridge; struct machdep_calls { char *name; #ifdef CONFIG_PPC64 - void __iomem * (*ioremap)(phys_addr_t addr, unsigned long size, - pgprot_t prot, void *caller); #ifdef CONFIG_PM void (*iommu_save)(void); void (*iommu_restore)(void); diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c index fbd2d0007c52..0276bc8c8969 100644 --- a/arch/powerpc/kernel/io-workarounds.c +++ b/arch/powerpc/kernel/io-workarounds.c @@ -149,8 +149,8 @@ static const struct ppc_pci_io iowa_pci_io = { }; #ifdef CONFIG_PPC_INDIRECT_MMIO -static void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size, - pgprot_t prot, void *caller) +void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size, + pgprot_t prot, void *caller) { struct iowa_bus *bus; void __iomem *res = __ioremap_caller(addr, size, prot, caller); @@ -163,20 +163,17 @@ static void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size, } return res; } -#else /* CONFIG_PPC_INDIRECT_MMIO */ -#define iowa_ioremap NULL #endif /* !CONFIG_PPC_INDIRECT_MMIO */ +bool io_workaround_inited; + /* Enable IO workaround */ static void io_workaround_init(void) { - static int io_workaround_inited; - if (io_workaround_inited) return; ppc_pci_io = iowa_pci_io; - ppc_md.ioremap = iowa_ioremap; - io_workaround_inited = 1; + io_workaround_inited = true; } /* Register new bus to support workaround */ diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 2882419737b9..0a147daeb0f2 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -208,8 +209,8 @@ void __iomem * ioremap(phys_addr_t addr, unsigned long size) pgprot_t prot = pgprot_noncached(PAGE_KERNEL); void *caller = __builtin_return_address(0); - if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, prot, caller); + if (iowa_is_active()) + return iowa_ioremap(addr, size, prot, caller); return __ioremap_caller(addr, size, prot, caller); } @@ -218,8 +219,8 @@ void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size) pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL); void *caller = __builtin_return_address(0); - if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, prot, caller); + if (iowa_is_active()) + return iowa_ioremap(addr, size, prot, caller); return __ioremap_caller(addr, size, prot, caller); } @@ -228,8 +229,8 @@ void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size) pgprot_t prot = pgprot_cached(PAGE_KERNEL); void *caller = __builtin_return_address(0); - if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, prot, caller); + if (iowa_is_active()) + return iowa_ioremap(addr, size, prot, caller); return __ioremap_caller(addr, size, prot, caller); } @@ -250,8 +251,8 @@ void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, */ pte = pte_mkprivileged(pte); - if (ppc_md.ioremap) - return ppc_md.ioremap(addr, size, pte_pgprot(pte), caller); + if (iowa_is_active()) + return iowa_ioremap(addr, size, pte_pgprot(pte), caller); return __ioremap_caller(addr, size, pte_pgprot(pte), caller); } -- cgit v1.2.3 From 4634c375db7a082b2522621519a5fb6eba977584 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 20 Aug 2019 14:07:14 +0000 Subject: powerpc/mm: move common 32/64 bits ioremap functions into ioremap.c ioremap(), ioremap_wc() and ioremap_coherent() are now identical on PPC32 and PPC64 as iowa_is_active() will always return false on PPC32. Move them into a new common location called ioremap.c Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/6223803ce024d6ab4dfaa919f44098aed5b4bc33.1566309262.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/Makefile | 2 +- arch/powerpc/mm/ioremap.c | 36 ++++++++++++++++++++++++++++++++++++ arch/powerpc/mm/pgtable_32.c | 27 --------------------------- arch/powerpc/mm/pgtable_64.c | 33 --------------------------------- 4 files changed, 37 insertions(+), 61 deletions(-) create mode 100644 arch/powerpc/mm/ioremap.c diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 0f499db315d6..29c682fe9144 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -7,7 +7,7 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ - pgtable-frag.o \ + pgtable-frag.o ioremap.o \ init-common.o mmu_context.o drmem.o obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/ obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/ diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c new file mode 100644 index 000000000000..7f1d462e7745 --- /dev/null +++ b/arch/powerpc/mm/ioremap.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include + +void __iomem *ioremap(phys_addr_t addr, unsigned long size) +{ + pgprot_t prot = pgprot_noncached(PAGE_KERNEL); + void *caller = __builtin_return_address(0); + + if (iowa_is_active()) + return iowa_ioremap(addr, size, prot, caller); + return __ioremap_caller(addr, size, prot, caller); +} +EXPORT_SYMBOL(ioremap); + +void __iomem *ioremap_wc(phys_addr_t addr, unsigned long size) +{ + pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL); + void *caller = __builtin_return_address(0); + + if (iowa_is_active()) + return iowa_ioremap(addr, size, prot, caller); + return __ioremap_caller(addr, size, prot, caller); +} +EXPORT_SYMBOL(ioremap_wc); + +void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size) +{ + pgprot_t prot = pgprot_cached(PAGE_KERNEL); + void *caller = __builtin_return_address(0); + + if (iowa_is_active()) + return iowa_ioremap(addr, size, prot, caller); + return __ioremap_caller(addr, size, prot, caller); +} diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 8cc5e9e83fc3..a363d87d0a9d 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -38,24 +38,6 @@ EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */ extern char etext[], _stext[], _sinittext[], _einittext[]; -void __iomem * -ioremap(phys_addr_t addr, unsigned long size) -{ - pgprot_t prot = pgprot_noncached(PAGE_KERNEL); - - return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); -} -EXPORT_SYMBOL(ioremap); - -void __iomem * -ioremap_wc(phys_addr_t addr, unsigned long size) -{ - pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL); - - return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); -} -EXPORT_SYMBOL(ioremap_wc); - void __iomem * ioremap_wt(phys_addr_t addr, unsigned long size) { @@ -65,15 +47,6 @@ ioremap_wt(phys_addr_t addr, unsigned long size) } EXPORT_SYMBOL(ioremap_wt); -void __iomem * -ioremap_coherent(phys_addr_t addr, unsigned long size) -{ - pgprot_t prot = pgprot_cached(PAGE_KERNEL); - - return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); -} -EXPORT_SYMBOL(ioremap_coherent); - void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) { diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 0a147daeb0f2..358233ea8d85 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -204,36 +203,6 @@ void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size, return ret; } -void __iomem * ioremap(phys_addr_t addr, unsigned long size) -{ - pgprot_t prot = pgprot_noncached(PAGE_KERNEL); - void *caller = __builtin_return_address(0); - - if (iowa_is_active()) - return iowa_ioremap(addr, size, prot, caller); - return __ioremap_caller(addr, size, prot, caller); -} - -void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size) -{ - pgprot_t prot = pgprot_noncached_wc(PAGE_KERNEL); - void *caller = __builtin_return_address(0); - - if (iowa_is_active()) - return iowa_ioremap(addr, size, prot, caller); - return __ioremap_caller(addr, size, prot, caller); -} - -void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size) -{ - pgprot_t prot = pgprot_cached(PAGE_KERNEL); - void *caller = __builtin_return_address(0); - - if (iowa_is_active()) - return iowa_ioremap(addr, size, prot, caller); - return __ioremap_caller(addr, size, prot, caller); -} - void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) { @@ -278,8 +247,6 @@ void iounmap(volatile void __iomem *token) vunmap(addr); } -EXPORT_SYMBOL(ioremap); -EXPORT_SYMBOL(ioremap_wc); EXPORT_SYMBOL(ioremap_prot); EXPORT_SYMBOL(__ioremap_at); EXPORT_SYMBOL(iounmap); -- cgit v1.2.3 From edfe1a5679263827ff94eb478c6ee0d65d467adf Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 20 Aug 2019 14:07:15 +0000 Subject: powerpc/mm: move ioremap_prot() into ioremap.c Both ioremap_prot() are idenfical, move them into ioremap.c Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/0b3eb0e0f1490a99fd6c983e166fb8946233f151.1566309262.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/ioremap.c | 19 +++++++++++++++++++ arch/powerpc/mm/pgtable_32.c | 17 ----------------- arch/powerpc/mm/pgtable_64.c | 24 ------------------------ 3 files changed, 19 insertions(+), 41 deletions(-) diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c index 7f1d462e7745..3d388aaa3ee6 100644 --- a/arch/powerpc/mm/ioremap.c +++ b/arch/powerpc/mm/ioremap.c @@ -34,3 +34,22 @@ void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size) return iowa_ioremap(addr, size, prot, caller); return __ioremap_caller(addr, size, prot, caller); } + +void __iomem *ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) +{ + pte_t pte = __pte(flags); + void *caller = __builtin_return_address(0); + + /* writeable implies dirty for kernel addresses */ + if (pte_write(pte)) + pte = pte_mkdirty(pte); + + /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ + pte = pte_exprotect(pte); + pte = pte_mkprivileged(pte); + + if (iowa_is_active()) + return iowa_ioremap(addr, size, pte_pgprot(pte), caller); + return __ioremap_caller(addr, size, pte_pgprot(pte), caller); +} +EXPORT_SYMBOL(ioremap_prot); diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index a363d87d0a9d..ac5a267467fa 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -47,23 +47,6 @@ ioremap_wt(phys_addr_t addr, unsigned long size) } EXPORT_SYMBOL(ioremap_wt); -void __iomem * -ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) -{ - pte_t pte = __pte(flags); - - /* writeable implies dirty for kernel addresses */ - if (pte_write(pte)) - pte = pte_mkdirty(pte); - - /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ - pte = pte_exprotect(pte); - pte = pte_mkprivileged(pte); - - return __ioremap_caller(addr, size, pte_pgprot(pte), __builtin_return_address(0)); -} -EXPORT_SYMBOL(ioremap_prot); - void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller) { diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 358233ea8d85..2b9078e1bc43 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -203,29 +203,6 @@ void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size, return ret; } -void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, - unsigned long flags) -{ - pte_t pte = __pte(flags); - void *caller = __builtin_return_address(0); - - /* writeable implies dirty for kernel addresses */ - if (pte_write(pte)) - pte = pte_mkdirty(pte); - - /* we don't want to let _PAGE_EXEC leak out */ - pte = pte_exprotect(pte); - /* - * Force kernel mapping. - */ - pte = pte_mkprivileged(pte); - - if (iowa_is_active()) - return iowa_ioremap(addr, size, pte_pgprot(pte), caller); - return __ioremap_caller(addr, size, pte_pgprot(pte), caller); -} - - /* * Unmap an IO region and remove it from imalloc'd list. * Access to IO memory should be serialized by driver. @@ -247,7 +224,6 @@ void iounmap(volatile void __iomem *token) vunmap(addr); } -EXPORT_SYMBOL(ioremap_prot); EXPORT_SYMBOL(__ioremap_at); EXPORT_SYMBOL(iounmap); EXPORT_SYMBOL(__iounmap_at); -- cgit v1.2.3 From 7cd9b317b630683b0c8eb2dfcfb046003ad6b97b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 20 Aug 2019 14:07:16 +0000 Subject: powerpc/mm: make ioremap_bot common to all Drop multiple definitions of ioremap_bot and make one common to all subarches. Only CONFIG_PPC_BOOK3E_64 had a global static init value for ioremap_bot. Now ioremap_bot is set in early_init_mmu_global(). Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/920eebfd9f36f14c79d1755847f5bf7c83703bdd.1566309262.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/book3s/32/pgtable.h | 2 -- arch/powerpc/include/asm/book3s/64/pgtable.h | 1 - arch/powerpc/include/asm/nohash/32/pgtable.h | 2 -- arch/powerpc/include/asm/pgtable.h | 2 ++ arch/powerpc/mm/ioremap.c | 3 +++ arch/powerpc/mm/mmu_decl.h | 1 - arch/powerpc/mm/nohash/tlb.c | 2 ++ arch/powerpc/mm/pgtable_32.c | 3 --- arch/powerpc/mm/pgtable_64.c | 3 --- 9 files changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 838de59f6754..aa1bc5f8da90 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -201,8 +201,6 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); #include #include -extern unsigned long ioremap_bot; - /* Bits to mask out from a PGD to get to the PUD page */ #define PGD_MASKED_BITS 0 diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 8e47fb85dfa6..03c9a14dd902 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -289,7 +289,6 @@ extern unsigned long __kernel_io_end; #define KERN_IO_END __kernel_io_end extern struct page *vmemmap; -extern unsigned long ioremap_bot; extern unsigned long pci_io_base; #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 0284f8f5305f..7ce2a7c9fade 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -11,8 +11,6 @@ #include /* For sub-arch specific PPC_PIN_SIZE */ #include -extern unsigned long ioremap_bot; - #ifdef CONFIG_44x extern int icache_44x_need_flush; #endif diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index c70916a7865a..8b7865a2d576 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -68,6 +68,8 @@ extern pgd_t swapper_pg_dir[]; extern void paging_init(void); +extern unsigned long ioremap_bot; + /* * kern_addr_valid is intended to indicate whether an address is a valid * kernel address. Most 32-bit archs define it as always true (like this) diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c index 3d388aaa3ee6..eaf5f8a4a63f 100644 --- a/arch/powerpc/mm/ioremap.c +++ b/arch/powerpc/mm/ioremap.c @@ -3,6 +3,9 @@ #include #include +unsigned long ioremap_bot; +EXPORT_SYMBOL(ioremap_bot); + void __iomem *ioremap(phys_addr_t addr, unsigned long size) { pgprot_t prot = pgprot_noncached(PAGE_KERNEL); diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index adbaf2167214..c750ac9ec713 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -106,7 +106,6 @@ extern u8 early_hash[]; #endif /* CONFIG_PPC32 */ -extern unsigned long ioremap_bot; extern unsigned long __max_low_memory; extern phys_addr_t __initial_memory_limit_addr; extern phys_addr_t total_memory; diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index bf60983a58c7..696f568253a0 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -703,6 +703,8 @@ static void __init early_init_mmu_global(void) * for use by the TLB miss code */ linear_map_top = memblock_end_of_DRAM(); + + ioremap_bot = IOREMAP_BASE; } static void __init early_mmu_set_memory_limit(void) diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index ac5a267467fa..102901a19f3c 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -33,9 +33,6 @@ #include -unsigned long ioremap_bot; -EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */ - extern char etext[], _stext[], _sinittext[], _einittext[]; void __iomem * diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 2b9078e1bc43..d865e053052d 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -98,9 +98,6 @@ unsigned long __pte_frag_nr; EXPORT_SYMBOL(__pte_frag_nr); unsigned long __pte_frag_size_shift; EXPORT_SYMBOL(__pte_frag_size_shift); -unsigned long ioremap_bot; -#else /* !CONFIG_PPC_BOOK3S_64 */ -unsigned long ioremap_bot = IOREMAP_BASE; #endif int __weak ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot, int nid) -- cgit v1.2.3 From f381d5711f091facd8847a54a2377cc0d1519df2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 20 Aug 2019 14:07:17 +0000 Subject: powerpc/mm: Move ioremap functions out of pgtable_32/64.c Create ioremap_32.c and ioremap_64.c and move respective ioremap functions out of pgtable_32.c and pgtable_64.c In the meantime, fix a few comments and changes a printk() to pr_warn(). Also fix a few oversplitted lines. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b5c8b02ccefd4ede64c61b53cf64fb5dacb35740.1566309263.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/Makefile | 2 +- arch/powerpc/mm/ioremap_32.c | 104 +++++++++++++++++++++++++++++++++++ arch/powerpc/mm/ioremap_64.c | 123 +++++++++++++++++++++++++++++++++++++++++ arch/powerpc/mm/pgtable_32.c | 99 --------------------------------- arch/powerpc/mm/pgtable_64.c | 128 +------------------------------------------ 5 files changed, 229 insertions(+), 227 deletions(-) create mode 100644 arch/powerpc/mm/ioremap_32.c create mode 100644 arch/powerpc/mm/ioremap_64.c diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 29c682fe9144..5e147986400d 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -7,7 +7,7 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) obj-y := fault.o mem.o pgtable.o mmap.o \ init_$(BITS).o pgtable_$(BITS).o \ - pgtable-frag.o ioremap.o \ + pgtable-frag.o ioremap.o ioremap_$(BITS).o \ init-common.o mmu_context.o drmem.o obj-$(CONFIG_PPC_MMU_NOHASH) += nohash/ obj-$(CONFIG_PPC_BOOK3S_32) += book3s32/ diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c new file mode 100644 index 000000000000..fb43ba71aa54 --- /dev/null +++ b/arch/powerpc/mm/ioremap_32.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include + +#include + +void __iomem *ioremap_wt(phys_addr_t addr, unsigned long size) +{ + pgprot_t prot = pgprot_cached_wthru(PAGE_KERNEL); + + return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_wt); + +void __iomem * +__ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller) +{ + unsigned long v, i; + phys_addr_t p; + int err; + + /* + * Choose an address to map it to. + * Once the vmalloc system is running, we use it. + * Before then, we use space going down from IOREMAP_TOP + * (ioremap_bot records where we're up to). + */ + p = addr & PAGE_MASK; + size = PAGE_ALIGN(addr + size) - p; + + /* + * If the address lies within the first 16 MB, assume it's in ISA + * memory space + */ + if (p < 16 * 1024 * 1024) + p += _ISA_MEM_BASE; + +#ifndef CONFIG_CRASH_DUMP + /* + * Don't allow anybody to remap normal RAM that we're using. + * mem_init() sets high_memory so only do the check after that. + */ + if (slab_is_available() && p <= virt_to_phys(high_memory - 1) && + page_is_ram(__phys_to_pfn(p))) { + pr_warn("%s(): phys addr 0x%llx is RAM lr %ps\n", __func__, + (unsigned long long)p, __builtin_return_address(0)); + return NULL; + } +#endif + + if (size == 0) + return NULL; + + /* + * Is it already mapped? Perhaps overlapped by a previous + * mapping. + */ + v = p_block_mapped(p); + if (v) + goto out; + + if (slab_is_available()) { + struct vm_struct *area; + area = get_vm_area_caller(size, VM_IOREMAP, caller); + if (area == 0) + return NULL; + area->phys_addr = p; + v = (unsigned long)area->addr; + } else { + v = (ioremap_bot -= size); + } + + /* + * Should check if it is a candidate for a BAT mapping + */ + + err = 0; + for (i = 0; i < size && err == 0; i += PAGE_SIZE) + err = map_kernel_page(v + i, p + i, prot); + if (err) { + if (slab_is_available()) + vunmap((void *)v); + return NULL; + } + +out: + return (void __iomem *)(v + ((unsigned long)addr & ~PAGE_MASK)); +} + +void iounmap(volatile void __iomem *addr) +{ + /* + * If mapped by BATs then there is nothing to do. + * Calling vfree() generates a benign warning. + */ + if (v_block_mapped((unsigned long)addr)) + return; + + if (addr > high_memory && (unsigned long)addr < ioremap_bot) + vunmap((void *)(PAGE_MASK & (unsigned long)addr)); +} +EXPORT_SYMBOL(iounmap); diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c new file mode 100644 index 000000000000..57f3b096143c --- /dev/null +++ b/arch/powerpc/mm/ioremap_64.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include + +int __weak ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, + pgprot_t prot, int nid) +{ + unsigned long i; + + for (i = 0; i < size; i += PAGE_SIZE) { + int err = map_kernel_page(ea + i, pa + i, prot); + if (err) { + if (slab_is_available()) + unmap_kernel_range(ea, size); + else + WARN_ON_ONCE(1); /* Should clean up */ + return err; + } + } + + return 0; +} + +/** + * Low level function to establish the page tables for an IO mapping + */ +void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot) +{ + /* We don't support the 4K PFN hack with ioremap */ + if (pgprot_val(prot) & H_PAGE_4K_PFN) + return NULL; + + if ((ea + size) >= (void *)IOREMAP_END) { + pr_warn("Outside the supported range\n"); + return NULL; + } + + WARN_ON(pa & ~PAGE_MASK); + WARN_ON(((unsigned long)ea) & ~PAGE_MASK); + WARN_ON(size & ~PAGE_MASK); + + if (ioremap_range((unsigned long)ea, pa, size, prot, NUMA_NO_NODE)) + return NULL; + + return (void __iomem *)ea; +} +EXPORT_SYMBOL(__ioremap_at); + +/** + * Low level function to tear down the page tables for an IO mapping. This is + * used for mappings that are manipulated manually, like partial unmapping of + * PCI IOs or ISA space. + */ +void __iounmap_at(void *ea, unsigned long size) +{ + WARN_ON(((unsigned long)ea) & ~PAGE_MASK); + WARN_ON(size & ~PAGE_MASK); + + unmap_kernel_range((unsigned long)ea, size); +} +EXPORT_SYMBOL(__iounmap_at); + +void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size, + pgprot_t prot, void *caller) +{ + phys_addr_t paligned; + void __iomem *ret; + + /* + * Choose an address to map it to. Once the vmalloc system is running, + * we use it. Before that, we map using addresses going up from + * ioremap_bot. vmalloc will use the addresses from IOREMAP_BASE + * through ioremap_bot. + */ + paligned = addr & PAGE_MASK; + size = PAGE_ALIGN(addr + size) - paligned; + + if (size == 0 || paligned == 0) + return NULL; + + if (slab_is_available()) { + struct vm_struct *area; + + area = __get_vm_area_caller(size, VM_IOREMAP, ioremap_bot, + IOREMAP_END, caller); + if (area == NULL) + return NULL; + + area->phys_addr = paligned; + ret = __ioremap_at(paligned, area->addr, size, prot); + } else { + ret = __ioremap_at(paligned, (void *)ioremap_bot, size, prot); + if (ret) + ioremap_bot += size; + } + + if (ret) + ret += addr & ~PAGE_MASK; + return ret; +} + +/* + * Unmap an IO region and remove it from vmalloc'd list. + * Access to IO memory should be serialized by driver. + */ +void iounmap(volatile void __iomem *token) +{ + void *addr; + + if (!slab_is_available()) + return; + + addr = (void *)((unsigned long __force)PCI_FIX_ADDR(token) & PAGE_MASK); + + if ((unsigned long)addr < ioremap_bot) { + pr_warn("Attempt to iounmap early bolted mapping at 0x%p\n", addr); + return; + } + vunmap(addr); +} +EXPORT_SYMBOL(iounmap); diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 102901a19f3c..8ec5dfb65b2e 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -35,104 +34,6 @@ extern char etext[], _stext[], _sinittext[], _einittext[]; -void __iomem * -ioremap_wt(phys_addr_t addr, unsigned long size) -{ - pgprot_t prot = pgprot_cached_wthru(PAGE_KERNEL); - - return __ioremap_caller(addr, size, prot, __builtin_return_address(0)); -} -EXPORT_SYMBOL(ioremap_wt); - -void __iomem * -__ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller) -{ - unsigned long v, i; - phys_addr_t p; - int err; - - /* - * Choose an address to map it to. - * Once the vmalloc system is running, we use it. - * Before then, we use space going down from IOREMAP_TOP - * (ioremap_bot records where we're up to). - */ - p = addr & PAGE_MASK; - size = PAGE_ALIGN(addr + size) - p; - - /* - * If the address lies within the first 16 MB, assume it's in ISA - * memory space - */ - if (p < 16*1024*1024) - p += _ISA_MEM_BASE; - -#ifndef CONFIG_CRASH_DUMP - /* - * Don't allow anybody to remap normal RAM that we're using. - * mem_init() sets high_memory so only do the check after that. - */ - if (slab_is_available() && p <= virt_to_phys(high_memory - 1) && - page_is_ram(__phys_to_pfn(p))) { - pr_warn("%s(): phys addr 0x%llx is RAM lr %ps\n", __func__, - (unsigned long long)p, __builtin_return_address(0)); - return NULL; - } -#endif - - if (size == 0) - return NULL; - - /* - * Is it already mapped? Perhaps overlapped by a previous - * mapping. - */ - v = p_block_mapped(p); - if (v) - goto out; - - if (slab_is_available()) { - struct vm_struct *area; - area = get_vm_area_caller(size, VM_IOREMAP, caller); - if (area == 0) - return NULL; - area->phys_addr = p; - v = (unsigned long) area->addr; - } else { - v = (ioremap_bot -= size); - } - - /* - * Should check if it is a candidate for a BAT mapping - */ - - err = 0; - for (i = 0; i < size && err == 0; i += PAGE_SIZE) - err = map_kernel_page(v + i, p + i, prot); - if (err) { - if (slab_is_available()) - vunmap((void *)v); - return NULL; - } - -out: - return (void __iomem *) (v + ((unsigned long)addr & ~PAGE_MASK)); -} - -void iounmap(volatile void __iomem *addr) -{ - /* - * If mapped by BATs then there is nothing to do. - * Calling vfree() generates a benign warning. - */ - if (v_block_mapped((unsigned long)addr)) - return; - - if (addr > high_memory && (unsigned long) addr < ioremap_bot) - vunmap((void *) (PAGE_MASK & (unsigned long)addr)); -} -EXPORT_SYMBOL(iounmap); - static void __init *early_alloc_pgtable(unsigned long size) { void *ptr = memblock_alloc(size, size); diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index d865e053052d..e78832dce7bb 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * This file contains ioremap and related functions for 64-bit machines. + * This file contains pgtable related functions for 64-bit machines. * * Derived from arch/ppc64/mm/init.c * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include @@ -100,131 +99,6 @@ unsigned long __pte_frag_size_shift; EXPORT_SYMBOL(__pte_frag_size_shift); #endif -int __weak ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot, int nid) -{ - unsigned long i; - - for (i = 0; i < size; i += PAGE_SIZE) { - int err = map_kernel_page(ea + i, pa + i, prot); - if (err) { - if (slab_is_available()) - unmap_kernel_range(ea, size); - else - WARN_ON_ONCE(1); /* Should clean up */ - return err; - } - } - - return 0; -} - -/** - * __ioremap_at - Low level function to establish the page tables - * for an IO mapping - */ -void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot) -{ - /* We don't support the 4K PFN hack with ioremap */ - if (pgprot_val(prot) & H_PAGE_4K_PFN) - return NULL; - - if ((ea + size) >= (void *)IOREMAP_END) { - pr_warn("Outside the supported range\n"); - return NULL; - } - - WARN_ON(pa & ~PAGE_MASK); - WARN_ON(((unsigned long)ea) & ~PAGE_MASK); - WARN_ON(size & ~PAGE_MASK); - - if (ioremap_range((unsigned long)ea, pa, size, prot, NUMA_NO_NODE)) - return NULL; - - return (void __iomem *)ea; -} - -/** - * __iounmap_from - Low level function to tear down the page tables - * for an IO mapping. This is used for mappings that - * are manipulated manually, like partial unmapping of - * PCI IOs or ISA space. - */ -void __iounmap_at(void *ea, unsigned long size) -{ - WARN_ON(((unsigned long)ea) & ~PAGE_MASK); - WARN_ON(size & ~PAGE_MASK); - - unmap_kernel_range((unsigned long)ea, size); -} - -void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size, - pgprot_t prot, void *caller) -{ - phys_addr_t paligned; - void __iomem *ret; - - /* - * Choose an address to map it to. - * Once the imalloc system is running, we use it. - * Before that, we map using addresses going - * up from ioremap_bot. imalloc will use - * the addresses from ioremap_bot through - * IMALLOC_END - * - */ - paligned = addr & PAGE_MASK; - size = PAGE_ALIGN(addr + size) - paligned; - - if ((size == 0) || (paligned == 0)) - return NULL; - - if (slab_is_available()) { - struct vm_struct *area; - - area = __get_vm_area_caller(size, VM_IOREMAP, - ioremap_bot, IOREMAP_END, - caller); - if (area == NULL) - return NULL; - - area->phys_addr = paligned; - ret = __ioremap_at(paligned, area->addr, size, prot); - } else { - ret = __ioremap_at(paligned, (void *)ioremap_bot, size, prot); - if (ret) - ioremap_bot += size; - } - - if (ret) - ret += addr & ~PAGE_MASK; - return ret; -} - -/* - * Unmap an IO region and remove it from imalloc'd list. - * Access to IO memory should be serialized by driver. - */ -void iounmap(volatile void __iomem *token) -{ - void *addr; - - if (!slab_is_available()) - return; - - addr = (void *) ((unsigned long __force) - PCI_FIX_ADDR(token) & PAGE_MASK); - if ((unsigned long)addr < ioremap_bot) { - printk(KERN_WARNING "Attempt to iounmap early bolted mapping" - " at 0x%p\n", addr); - return; - } - vunmap(addr); -} - -EXPORT_SYMBOL(__ioremap_at); -EXPORT_SYMBOL(iounmap); -EXPORT_SYMBOL(__iounmap_at); - #ifndef __PAGETABLE_PUD_FOLDED /* 4 level page table */ struct page *pgd_page(pgd_t pgd) -- cgit v1.2.3 From 191e42063a7241e5c3a1d1f36896a20b147517e9 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 20 Aug 2019 14:07:18 +0000 Subject: powerpc/mm: refactor ioremap_range() and use ioremap_page_range() book3s64's ioremap_range() is almost same as fallback ioremap_range(), except that it calls radix__ioremap_range() when radix is enabled. radix__ioremap_range() is also very similar to the other ones, expect that it calls ioremap_page_range when slab is available. PPC32 __ioremap_caller() have a loop doing the same thing as ioremap_range() so use it on PPC32 as well. Lets keep only one version of ioremap_range() which calls ioremap_page_range() on all platforms when slab is available. At the same time, drop the nid parameter which is not used. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/4b1dca7096b01823b101be7338983578641547f1.1566309263.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/book3s/64/radix.h | 3 --- arch/powerpc/include/asm/io.h | 2 ++ arch/powerpc/mm/book3s64/pgtable.c | 21 --------------------- arch/powerpc/mm/book3s64/radix_pgtable.c | 20 -------------------- arch/powerpc/mm/ioremap.c | 24 ++++++++++++++++++++++++ arch/powerpc/mm/ioremap_32.c | 6 ++---- arch/powerpc/mm/ioremap_64.c | 21 +-------------------- 7 files changed, 29 insertions(+), 68 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index e04a839cb5b9..574eca33f893 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -266,9 +266,6 @@ extern void radix__vmemmap_remove_mapping(unsigned long start, extern int radix__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t flags, unsigned int psz); -extern int radix__ioremap_range(unsigned long ea, phys_addr_t pa, - unsigned long size, pgprot_t prot, int nid); - static inline unsigned long radix__get_tree_size(void) { unsigned long rts_field; diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 8e65ba59f06a..8e00d95f9600 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -722,6 +722,8 @@ void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size); extern void iounmap(volatile void __iomem *addr); +int ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot); + extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size, pgprot_t prot, void *caller); diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 7d0e0d0d22c4..4c8bed856533 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -446,24 +446,3 @@ int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, return true; } - -int ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot, int nid) -{ - unsigned long i; - - if (radix_enabled()) - return radix__ioremap_range(ea, pa, size, prot, nid); - - for (i = 0; i < size; i += PAGE_SIZE) { - int err = map_kernel_page(ea + i, pa + i, prot); - if (err) { - if (slab_is_available()) - unmap_kernel_range(ea, size); - else - WARN_ON_ONCE(1); /* Should clean up */ - return err; - } - } - - return 0; -} diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 2204d8eeb784..5bab67fe96e0 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -1218,26 +1218,6 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) return 1; } -int radix__ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, - pgprot_t prot, int nid) -{ - if (likely(slab_is_available())) { - int err = ioremap_page_range(ea, ea + size, pa, prot); - if (err) - unmap_kernel_range(ea, size); - return err; - } else { - unsigned long i; - - for (i = 0; i < size; i += PAGE_SIZE) { - int err = map_kernel_page(ea + i, pa + i, prot); - if (WARN_ON_ONCE(err)) /* Should clean up */ - return err; - } - return 0; - } -} - int __init arch_ioremap_p4d_supported(void) { return 0; diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c index eaf5f8a4a63f..50ee6544d0b7 100644 --- a/arch/powerpc/mm/ioremap.c +++ b/arch/powerpc/mm/ioremap.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include +#include +#include #include unsigned long ioremap_bot; @@ -56,3 +58,25 @@ void __iomem *ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long f return __ioremap_caller(addr, size, pte_pgprot(pte), caller); } EXPORT_SYMBOL(ioremap_prot); + +int ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot) +{ + unsigned long i; + + if (slab_is_available()) { + int err = ioremap_page_range(ea, ea + size, pa, prot); + + if (err) + unmap_kernel_range(ea, size); + return err; + } + + for (i = 0; i < size; i += PAGE_SIZE) { + int err = map_kernel_page(ea + i, pa + i, prot); + + if (WARN_ON_ONCE(err)) /* Should clean up */ + return err; + } + + return 0; +} diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c index fb43ba71aa54..85b90a62e084 100644 --- a/arch/powerpc/mm/ioremap_32.c +++ b/arch/powerpc/mm/ioremap_32.c @@ -17,7 +17,7 @@ EXPORT_SYMBOL(ioremap_wt); void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller) { - unsigned long v, i; + unsigned long v; phys_addr_t p; int err; @@ -76,9 +76,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call * Should check if it is a candidate for a BAT mapping */ - err = 0; - for (i = 0; i < size && err == 0; i += PAGE_SIZE) - err = map_kernel_page(v + i, p + i, prot); + err = ioremap_range((unsigned long)v, p, size, prot); if (err) { if (slab_is_available()) vunmap((void *)v); diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c index 57f3b096143c..d132ce1e538d 100644 --- a/arch/powerpc/mm/ioremap_64.c +++ b/arch/powerpc/mm/ioremap_64.c @@ -4,25 +4,6 @@ #include #include -int __weak ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, - pgprot_t prot, int nid) -{ - unsigned long i; - - for (i = 0; i < size; i += PAGE_SIZE) { - int err = map_kernel_page(ea + i, pa + i, prot); - if (err) { - if (slab_is_available()) - unmap_kernel_range(ea, size); - else - WARN_ON_ONCE(1); /* Should clean up */ - return err; - } - } - - return 0; -} - /** * Low level function to establish the page tables for an IO mapping */ @@ -41,7 +22,7 @@ void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_ WARN_ON(((unsigned long)ea) & ~PAGE_MASK); WARN_ON(size & ~PAGE_MASK); - if (ioremap_range((unsigned long)ea, pa, size, prot, NUMA_NO_NODE)) + if (ioremap_range((unsigned long)ea, pa, size, prot)) return NULL; return (void __iomem *)ea; -- cgit v1.2.3 From 4a45b7460cf458012a6930f675e141256b81dcf4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 20 Aug 2019 14:07:19 +0000 Subject: powerpc/mm: refactor ioremap vm area setup. PPC32 and PPC64 are doing the same once SLAB is available. Create a do_ioremap() function that calls get_vm_area and do the mapping. For PPC64, we add the 4K PFN hack sanity check to __ioremap_caller() in order to avoid using __ioremap_at(). Other checks in __ioremap_at() are irrelevant for __ioremap_caller(). On PPC64, VM area is allocated in the range [ioremap_bot ; IOREMAP_END] On PPC32, VM area is allocated in the range [VMALLOC_START ; VMALLOC_END] Lets define IOREMAP_START is ioremap_bot for PPC64, and alias IOREMAP_START/END to VMALLOC_START/END on PPC32 Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/42e7e36ad32e0fdf76692426cc642799c9f689b8.1566309263.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/book3s/32/pgtable.h | 4 ++++ arch/powerpc/include/asm/book3s/64/pgtable.h | 1 + arch/powerpc/include/asm/io.h | 2 ++ arch/powerpc/include/asm/nohash/32/pgtable.h | 4 ++++ arch/powerpc/include/asm/nohash/64/pgtable.h | 1 + arch/powerpc/mm/ioremap.c | 20 ++++++++++++++++++++ arch/powerpc/mm/ioremap_32.c | 15 ++++----------- arch/powerpc/mm/ioremap_64.c | 17 +++++++---------- 8 files changed, 43 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index aa1bc5f8da90..331a29a501a1 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -165,6 +165,10 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); #define IOREMAP_TOP KVIRT_TOP #endif +/* PPC32 shares vmalloc area with ioremap */ +#define IOREMAP_START VMALLOC_START +#define IOREMAP_END VMALLOC_END + /* * Just any arbitrary offset to the start of the vmalloc VM area: the * current 16MB value just means that there will be a 64MB "hole" after the diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 03c9a14dd902..b01624e5c467 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -316,6 +316,7 @@ extern unsigned long pci_io_base; #define PHB_IO_BASE (ISA_IO_END) #define PHB_IO_END (KERN_IO_START + FULL_IO_SIZE) #define IOREMAP_BASE (PHB_IO_END) +#define IOREMAP_START (ioremap_bot) #define IOREMAP_END (KERN_IO_END) /* Advertise special mapping type for AGP */ diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 8e00d95f9600..dc529ea0fffa 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -723,6 +723,8 @@ void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size); extern void iounmap(volatile void __iomem *addr); int ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot); +void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, + pgprot_t prot, void *caller); extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size, pgprot_t prot, void *caller); diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 7ce2a7c9fade..3e1a4c1e40f0 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -93,6 +93,10 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); #define IOREMAP_TOP KVIRT_TOP #endif +/* PPC32 shares vmalloc area with ioremap */ +#define IOREMAP_START VMALLOC_START +#define IOREMAP_END VMALLOC_END + /* * Just any arbitrary offset to the start of the vmalloc VM area: the * current 16MB value just means that there will be a 64MB "hole" after the diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index b9f66cf15c31..9a33b8bd842d 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -53,6 +53,7 @@ #define PHB_IO_BASE (ISA_IO_END) #define PHB_IO_END (KERN_IO_START + FULL_IO_SIZE) #define IOREMAP_BASE (PHB_IO_END) +#define IOREMAP_START (ioremap_bot) #define IOREMAP_END (KERN_VIRT_START + KERN_VIRT_SIZE) diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c index 50ee6544d0b7..57630325846c 100644 --- a/arch/powerpc/mm/ioremap.c +++ b/arch/powerpc/mm/ioremap.c @@ -80,3 +80,23 @@ int ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t return 0; } + +void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, + pgprot_t prot, void *caller) +{ + struct vm_struct *area; + int ret; + + area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START, IOREMAP_END, caller); + if (area == NULL) + return NULL; + + area->phys_addr = pa; + ret = ioremap_range((unsigned long)area->addr, pa, size, prot); + if (!ret) + return (void __iomem *)area->addr + offset; + + free_vm_area(area); + + return NULL; +} diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c index 85b90a62e084..fcf343dbf2bf 100644 --- a/arch/powerpc/mm/ioremap_32.c +++ b/arch/powerpc/mm/ioremap_32.c @@ -18,7 +18,7 @@ void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller) { unsigned long v; - phys_addr_t p; + phys_addr_t p, offset; int err; /* @@ -28,6 +28,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call * (ioremap_bot records where we're up to). */ p = addr & PAGE_MASK; + offset = addr & ~PAGE_MASK; size = PAGE_ALIGN(addr + size) - p; /* @@ -62,12 +63,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call goto out; if (slab_is_available()) { - struct vm_struct *area; - area = get_vm_area_caller(size, VM_IOREMAP, caller); - if (area == 0) - return NULL; - area->phys_addr = p; - v = (unsigned long)area->addr; + return do_ioremap(p, offset, size, prot, caller); } else { v = (ioremap_bot -= size); } @@ -77,11 +73,8 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call */ err = ioremap_range((unsigned long)v, p, size, prot); - if (err) { - if (slab_is_available()) - vunmap((void *)v); + if (err) return NULL; - } out: return (void __iomem *)(v + ((unsigned long)addr & ~PAGE_MASK)); diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c index d132ce1e538d..e37b68b7f0e8 100644 --- a/arch/powerpc/mm/ioremap_64.c +++ b/arch/powerpc/mm/ioremap_64.c @@ -46,9 +46,13 @@ EXPORT_SYMBOL(__iounmap_at); void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller) { - phys_addr_t paligned; + phys_addr_t paligned, offset; void __iomem *ret; + /* We don't support the 4K PFN hack with ioremap */ + if (pgprot_val(prot) & H_PAGE_4K_PFN) + return NULL; + /* * Choose an address to map it to. Once the vmalloc system is running, * we use it. Before that, we map using addresses going up from @@ -56,21 +60,14 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size, * through ioremap_bot. */ paligned = addr & PAGE_MASK; + offset = addr & ~PAGE_MASK; size = PAGE_ALIGN(addr + size) - paligned; if (size == 0 || paligned == 0) return NULL; if (slab_is_available()) { - struct vm_struct *area; - - area = __get_vm_area_caller(size, VM_IOREMAP, ioremap_bot, - IOREMAP_END, caller); - if (area == NULL) - return NULL; - - area->phys_addr = paligned; - ret = __ioremap_at(paligned, area->addr, size, prot); + return do_ioremap(paligned, offset, size, prot, caller); } else { ret = __ioremap_at(paligned, (void *)ioremap_bot, size, prot); if (ret) -- cgit v1.2.3 From 163918fc5741d755cf9d477ebfcb761f09b82982 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 20 Aug 2019 14:07:20 +0000 Subject: powerpc/mm: split out early ioremap path. ioremap does things differently depending on whether SLAB is available or not at different levels. Try to separate the early path from the beginning. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/3acd2dbe04b04f111475e7a59f2b6f2ab9b95ab6.1566309263.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/io.h | 3 ++- arch/powerpc/mm/ioremap.c | 17 +++++++---------- arch/powerpc/mm/ioremap_32.c | 13 +++++-------- arch/powerpc/mm/ioremap_64.c | 30 +++++++++++++++++++++--------- 4 files changed, 35 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index dc529ea0fffa..a63ec938636d 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -722,7 +722,8 @@ void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size); extern void iounmap(volatile void __iomem *addr); -int ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot); +int early_ioremap_range(unsigned long ea, phys_addr_t pa, + unsigned long size, pgprot_t prot); void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, pgprot_t prot, void *caller); diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c index 57630325846c..fc669643ce6a 100644 --- a/arch/powerpc/mm/ioremap.c +++ b/arch/powerpc/mm/ioremap.c @@ -59,18 +59,11 @@ void __iomem *ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long f } EXPORT_SYMBOL(ioremap_prot); -int ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot) +int early_ioremap_range(unsigned long ea, phys_addr_t pa, + unsigned long size, pgprot_t prot) { unsigned long i; - if (slab_is_available()) { - int err = ioremap_page_range(ea, ea + size, pa, prot); - - if (err) - unmap_kernel_range(ea, size); - return err; - } - for (i = 0; i < size; i += PAGE_SIZE) { int err = map_kernel_page(ea + i, pa + i, prot); @@ -86,16 +79,20 @@ void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, { struct vm_struct *area; int ret; + unsigned long va; area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START, IOREMAP_END, caller); if (area == NULL) return NULL; area->phys_addr = pa; - ret = ioremap_range((unsigned long)area->addr, pa, size, prot); + va = (unsigned long)area->addr; + + ret = ioremap_page_range(va, va + size, pa, prot); if (!ret) return (void __iomem *)area->addr + offset; + unmap_kernel_range(va, size); free_vm_area(area); return NULL; diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c index fcf343dbf2bf..f36121f25243 100644 --- a/arch/powerpc/mm/ioremap_32.c +++ b/arch/powerpc/mm/ioremap_32.c @@ -60,24 +60,21 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call */ v = p_block_mapped(p); if (v) - goto out; + return (void __iomem *)v + offset; - if (slab_is_available()) { + if (slab_is_available()) return do_ioremap(p, offset, size, prot, caller); - } else { - v = (ioremap_bot -= size); - } /* * Should check if it is a candidate for a BAT mapping */ - err = ioremap_range((unsigned long)v, p, size, prot); + err = early_ioremap_range(ioremap_bot - size, p, size, prot); if (err) return NULL; + ioremap_bot -= size; -out: - return (void __iomem *)(v + ((unsigned long)addr & ~PAGE_MASK)); + return (void __iomem *)ioremap_bot + offset; } void iounmap(volatile void __iomem *addr) diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c index e37b68b7f0e8..fd29e51700cd 100644 --- a/arch/powerpc/mm/ioremap_64.c +++ b/arch/powerpc/mm/ioremap_64.c @@ -9,6 +9,9 @@ */ void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot) { + int ret; + unsigned long va = (unsigned long)ea; + /* We don't support the 4K PFN hack with ioremap */ if (pgprot_val(prot) & H_PAGE_4K_PFN) return NULL; @@ -22,7 +25,15 @@ void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_ WARN_ON(((unsigned long)ea) & ~PAGE_MASK); WARN_ON(size & ~PAGE_MASK); - if (ioremap_range((unsigned long)ea, pa, size, prot)) + if (slab_is_available()) { + ret = ioremap_page_range(va, va + size, pa, prot); + if (ret) + unmap_kernel_range(va, size); + } else { + ret = early_ioremap_range(va, pa, size, prot); + } + + if (ret) return NULL; return (void __iomem *)ea; @@ -48,6 +59,7 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size, { phys_addr_t paligned, offset; void __iomem *ret; + int err; /* We don't support the 4K PFN hack with ioremap */ if (pgprot_val(prot) & H_PAGE_4K_PFN) @@ -66,16 +78,16 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size, if (size == 0 || paligned == 0) return NULL; - if (slab_is_available()) { + if (slab_is_available()) return do_ioremap(paligned, offset, size, prot, caller); - } else { - ret = __ioremap_at(paligned, (void *)ioremap_bot, size, prot); - if (ret) - ioremap_bot += size; - } - if (ret) - ret += addr & ~PAGE_MASK; + err = early_ioremap_range(ioremap_bot, paligned, size, prot); + if (err) + return NULL; + + ret = (void __iomem *)ioremap_bot + offset; + ioremap_bot += size; + return ret; } -- cgit v1.2.3 From c691b4b83b6a348f7b9d13c36916e73c2e1d85e4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 20 Aug 2019 14:34:12 +0000 Subject: powerpc: rewrite LOAD_REG_IMMEDIATE() as an intelligent macro Today LOAD_REG_IMMEDIATE() is a basic #define which loads all parts on a value into a register, including the parts that are NUL. This means always 2 instructions on PPC32 and always 5 instructions on PPC64. And those instructions cannot run in parallele as they are updating the same register. Ex: LOAD_REG_IMMEDIATE(r1,THREAD_SIZE) in head_64.S results in: 3c 20 00 00 lis r1,0 60 21 00 00 ori r1,r1,0 78 21 07 c6 rldicr r1,r1,32,31 64 21 00 00 oris r1,r1,0 60 21 40 00 ori r1,r1,16384 Rewrite LOAD_REG_IMMEDIATE() with GAS macro in order to skip the parts that are NUL. Rename existing LOAD_REG_IMMEDIATE() as LOAD_REG_IMMEDIATE_SYM() and use that one for loading value of symbols which are not known at compile time. Now LOAD_REG_IMMEDIATE(r1,THREAD_SIZE) in head_64.S results in: 38 20 40 00 li r1,16384 Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/d60ce8dd3a383c7adbfc322bf1d53d81724a6000.1566311636.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/ppc_asm.h | 42 +++++++++++++++++++++++++++++++----- arch/powerpc/kernel/exceptions-64e.S | 10 ++++----- arch/powerpc/kernel/head_64.S | 2 +- 3 files changed, 43 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index e0637730a8e7..20a00209c965 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -311,13 +311,43 @@ n: addis reg,reg,(name - 0b)@ha; \ addi reg,reg,(name - 0b)@l; -#ifdef __powerpc64__ -#ifdef HAVE_AS_ATHIGH +#if defined(__powerpc64__) && defined(HAVE_AS_ATHIGH) #define __AS_ATHIGH high #else #define __AS_ATHIGH h #endif -#define LOAD_REG_IMMEDIATE(reg,expr) \ + +.macro __LOAD_REG_IMMEDIATE_32 r, x + .if (\x) >= 0x8000 || (\x) < -0x8000 + lis \r, (\x)@__AS_ATHIGH + .if (\x) & 0xffff != 0 + ori \r, \r, (\x)@l + .endif + .else + li \r, (\x)@l + .endif +.endm + +.macro __LOAD_REG_IMMEDIATE r, x + .if (\x) >= 0x80000000 || (\x) < -0x80000000 + __LOAD_REG_IMMEDIATE_32 \r, (\x) >> 32 + sldi \r, \r, 32 + .if (\x) & 0xffff0000 != 0 + oris \r, \r, (\x)@__AS_ATHIGH + .endif + .if (\x) & 0xffff != 0 + ori \r, \r, (\x)@l + .endif + .else + __LOAD_REG_IMMEDIATE_32 \r, \x + .endif +.endm + +#ifdef __powerpc64__ + +#define LOAD_REG_IMMEDIATE(reg, expr) __LOAD_REG_IMMEDIATE reg, expr + +#define LOAD_REG_IMMEDIATE_SYM(reg,expr) \ lis reg,(expr)@highest; \ ori reg,reg,(expr)@higher; \ rldicr reg,reg,32,31; \ @@ -335,11 +365,13 @@ n: #else /* 32-bit */ -#define LOAD_REG_IMMEDIATE(reg,expr) \ +#define LOAD_REG_IMMEDIATE(reg, expr) __LOAD_REG_IMMEDIATE_32 reg, expr + +#define LOAD_REG_IMMEDIATE_SYM(reg,expr) \ lis reg,(expr)@ha; \ addi reg,reg,(expr)@l; -#define LOAD_REG_ADDR(reg,name) LOAD_REG_IMMEDIATE(reg, name) +#define LOAD_REG_ADDR(reg,name) LOAD_REG_IMMEDIATE_SYM(reg, name) #define LOAD_REG_ADDRBASE(reg, name) lis reg,name@ha #define ADDROFF(name) name@l diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 1cfb3da4a84a..898aae6da167 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -751,8 +751,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) ld r14,interrupt_base_book3e@got(r15) ld r15,__end_interrupts@got(r15) #else - LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e) - LOAD_REG_IMMEDIATE(r15,__end_interrupts) + LOAD_REG_IMMEDIATE_SYM(r14,interrupt_base_book3e) + LOAD_REG_IMMEDIATE_SYM(r15,__end_interrupts) #endif cmpld cr0,r10,r14 cmpld cr1,r10,r15 @@ -821,8 +821,8 @@ kernel_dbg_exc: ld r14,interrupt_base_book3e@got(r15) ld r15,__end_interrupts@got(r15) #else - LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e) - LOAD_REG_IMMEDIATE(r15,__end_interrupts) + LOAD_REG_IMMEDIATE_SYM(r14,interrupt_base_book3e) + LOAD_REG_IMMEDIATE_SYM(r15,__end_interrupts) #endif cmpld cr0,r10,r14 cmpld cr1,r10,r15 @@ -1449,7 +1449,7 @@ a2_tlbinit_code_start: a2_tlbinit_after_linear_map: /* Now we branch the new virtual address mapped by this entry */ - LOAD_REG_IMMEDIATE(r3,1f) + LOAD_REG_IMMEDIATE_SYM(r3,1f) mtctr r3 bctr diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 91d297e696dd..1fd44761e997 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -635,7 +635,7 @@ __after_prom_start: sub r5,r5,r11 #else /* just copy interrupts */ - LOAD_REG_IMMEDIATE(r5, FIXED_SYMBOL_ABS_ADDR(__end_interrupts)) + LOAD_REG_IMMEDIATE_SYM(r5, FIXED_SYMBOL_ABS_ADDR(__end_interrupts)) #endif b 5f 3: -- cgit v1.2.3 From ba18025fb03306ccdf3557a1e7b8a5b39b474872 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 20 Aug 2019 14:34:13 +0000 Subject: powerpc/32: replace LOAD_MSR_KERNEL() by LOAD_REG_IMMEDIATE() LOAD_MSR_KERNEL() and LOAD_REG_IMMEDIATE() are doing the same thing in the same way. Drop LOAD_MSR_KERNEL() Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/8f04a6df0bc8949517fd8236d50c15008ccf9231.1566311636.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/entry_32.S | 18 +++++++++--------- arch/powerpc/kernel/head_32.h | 21 ++++----------------- 2 files changed, 13 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 54fab22c9a43..972b05504a0a 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -230,7 +230,7 @@ transfer_to_handler_cont: */ lis r12,reenable_mmu@h ori r12,r12,reenable_mmu@l - LOAD_MSR_KERNEL(r0, MSR_KERNEL) + LOAD_REG_IMMEDIATE(r0, MSR_KERNEL) mtspr SPRN_SRR0,r12 mtspr SPRN_SRR1,r0 SYNC @@ -304,7 +304,7 @@ stack_ovf: addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD lis r9,StackOverflow@ha addi r9,r9,StackOverflow@l - LOAD_MSR_KERNEL(r10,MSR_KERNEL) + LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) #if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) mtspr SPRN_NRI, r0 #endif @@ -324,7 +324,7 @@ trace_syscall_entry_irq_off: bl trace_hardirqs_on /* Now enable for real */ - LOAD_MSR_KERNEL(r10, MSR_KERNEL | MSR_EE) + LOAD_REG_IMMEDIATE(r10, MSR_KERNEL | MSR_EE) mtmsr r10 REST_GPR(0, r1) @@ -394,7 +394,7 @@ ret_from_syscall: #endif mr r6,r3 /* disable interrupts so current_thread_info()->flags can't change */ - LOAD_MSR_KERNEL(r10,MSR_KERNEL) /* doesn't include MSR_EE */ + LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) /* doesn't include MSR_EE */ /* Note: We don't bother telling lockdep about it */ SYNC MTMSRD(r10) @@ -824,7 +824,7 @@ ret_from_except: * can't change between when we test it and when we return * from the interrupt. */ /* Note: We don't bother telling lockdep about it */ - LOAD_MSR_KERNEL(r10,MSR_KERNEL) + LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) SYNC /* Some chip revs have problems here... */ MTMSRD(r10) /* disable interrupts */ @@ -991,7 +991,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) * can restart the exception exit path at the label * exc_exit_restart below. -- paulus */ - LOAD_MSR_KERNEL(r10,MSR_KERNEL & ~MSR_RI) + LOAD_REG_IMMEDIATE(r10,MSR_KERNEL & ~MSR_RI) SYNC MTMSRD(r10) /* clear the RI bit */ .globl exc_exit_restart @@ -1066,7 +1066,7 @@ exc_exit_restart_end: REST_NVGPRS(r1); \ lwz r3,_MSR(r1); \ andi. r3,r3,MSR_PR; \ - LOAD_MSR_KERNEL(r10,MSR_KERNEL); \ + LOAD_REG_IMMEDIATE(r10,MSR_KERNEL); \ bne user_exc_return; \ lwz r0,GPR0(r1); \ lwz r2,GPR2(r1); \ @@ -1236,7 +1236,7 @@ recheck: * neither. Those disable/enable cycles used to peek at * TI_FLAGS aren't advertised. */ - LOAD_MSR_KERNEL(r10,MSR_KERNEL) + LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) SYNC MTMSRD(r10) /* disable interrupts */ lwz r9,TI_FLAGS(r2) @@ -1329,7 +1329,7 @@ _GLOBAL(enter_rtas) lwz r4,RTASBASE(r4) mfmsr r9 stw r9,8(r1) - LOAD_MSR_KERNEL(r0,MSR_KERNEL) + LOAD_REG_IMMEDIATE(r0,MSR_KERNEL) SYNC /* disable interrupts so SRR0/1 */ MTMSRD(r0) /* don't get trashed */ li r9,MSR_KERNEL & ~(MSR_IR|MSR_DR) diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 4a692553651f..8abc7783dbe5 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -4,19 +4,6 @@ #include /* for STACK_FRAME_REGS_MARKER */ -/* - * MSR_KERNEL is > 0x8000 on 4xx/Book-E since it include MSR_CE. - */ -.macro __LOAD_MSR_KERNEL r, x -.if \x >= 0x8000 - lis \r, (\x)@h - ori \r, \r, (\x)@l -.else - li \r, (\x) -.endif -.endm -#define LOAD_MSR_KERNEL(r, x) __LOAD_MSR_KERNEL r, x - /* * Exception entry code. This code runs with address translation * turned off, i.e. using physical addresses. @@ -92,7 +79,7 @@ #ifdef CONFIG_40x rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */ #else - LOAD_MSR_KERNEL(r10, MSR_KERNEL & ~(MSR_IR|MSR_DR)) /* can take exceptions */ + LOAD_REG_IMMEDIATE(r10, MSR_KERNEL & ~(MSR_IR|MSR_DR)) /* can take exceptions */ MTMSRD(r10) /* (except for mach check in rtas) */ #endif lis r10,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ @@ -140,10 +127,10 @@ * otherwise we might risk taking an interrupt before we tell lockdep * they are enabled. */ - LOAD_MSR_KERNEL(r10, MSR_KERNEL) + LOAD_REG_IMMEDIATE(r10, MSR_KERNEL) rlwimi r10, r9, 0, MSR_EE #else - LOAD_MSR_KERNEL(r10, MSR_KERNEL | MSR_EE) + LOAD_REG_IMMEDIATE(r10, MSR_KERNEL | MSR_EE) #endif #if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) mtspr SPRN_NRI, r0 @@ -187,7 +174,7 @@ label: #define EXC_XFER_TEMPLATE(hdlr, trap, msr, tfer, ret) \ li r10,trap; \ stw r10,_TRAP(r11); \ - LOAD_MSR_KERNEL(r10, msr); \ + LOAD_REG_IMMEDIATE(r10, msr); \ bl tfer; \ .long hdlr; \ .long ret -- cgit v1.2.3 From d7fb5b18a540efaf05da2b980fc11d50ba775677 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 20 Aug 2019 14:34:14 +0000 Subject: powerpc/64: optimise LOAD_REG_IMMEDIATE_SYM() Optimise LOAD_REG_IMMEDIATE_SYM() using a temporary register to parallelise operations. It reduces the path from 5 to 3 instructions. Suggested-by: Segher Boessenkool Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/bad41ed02531bb0382420cbab50a0d7153b71767.1566311636.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/ppc_asm.h | 12 ++++++------ arch/powerpc/kernel/exceptions-64e.S | 22 +++++++++++++--------- arch/powerpc/kernel/head_64.S | 2 +- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 20a00209c965..dd3b191bdcea 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -347,12 +347,12 @@ n: #define LOAD_REG_IMMEDIATE(reg, expr) __LOAD_REG_IMMEDIATE reg, expr -#define LOAD_REG_IMMEDIATE_SYM(reg,expr) \ - lis reg,(expr)@highest; \ - ori reg,reg,(expr)@higher; \ - rldicr reg,reg,32,31; \ - oris reg,reg,(expr)@__AS_ATHIGH; \ - ori reg,reg,(expr)@l; +#define LOAD_REG_IMMEDIATE_SYM(reg, tmp, expr) \ + lis tmp, (expr)@highest; \ + lis reg, (expr)@__AS_ATHIGH; \ + ori tmp, tmp, (expr)@higher; \ + ori reg, reg, (expr)@l; \ + rldimi reg, tmp, 32, 0 #define LOAD_REG_ADDR(reg,name) \ ld reg,name@got(r2) diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 898aae6da167..829950b96d29 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -750,12 +750,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) ld r15,PACATOC(r13) ld r14,interrupt_base_book3e@got(r15) ld r15,__end_interrupts@got(r15) -#else - LOAD_REG_IMMEDIATE_SYM(r14,interrupt_base_book3e) - LOAD_REG_IMMEDIATE_SYM(r15,__end_interrupts) -#endif cmpld cr0,r10,r14 cmpld cr1,r10,r15 +#else + LOAD_REG_IMMEDIATE_SYM(r14, r15, interrupt_base_book3e) + cmpld cr0, r10, r14 + LOAD_REG_IMMEDIATE_SYM(r14, r15, __end_interrupts) + cmpld cr1, r10, r14 +#endif blt+ cr0,1f bge+ cr1,1f @@ -820,12 +822,14 @@ kernel_dbg_exc: ld r15,PACATOC(r13) ld r14,interrupt_base_book3e@got(r15) ld r15,__end_interrupts@got(r15) -#else - LOAD_REG_IMMEDIATE_SYM(r14,interrupt_base_book3e) - LOAD_REG_IMMEDIATE_SYM(r15,__end_interrupts) -#endif cmpld cr0,r10,r14 cmpld cr1,r10,r15 +#else + LOAD_REG_IMMEDIATE_SYM(r14, r15, interrupt_base_book3e) + cmpld cr0, r10, r14 + LOAD_REG_IMMEDIATE_SYM(r14, r15,__end_interrupts) + cmpld cr1, r10, r14 +#endif blt+ cr0,1f bge+ cr1,1f @@ -1449,7 +1453,7 @@ a2_tlbinit_code_start: a2_tlbinit_after_linear_map: /* Now we branch the new virtual address mapped by this entry */ - LOAD_REG_IMMEDIATE_SYM(r3,1f) + LOAD_REG_IMMEDIATE_SYM(r3, r5, 1f) mtctr r3 bctr diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 1fd44761e997..0f2d61af47cc 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -635,7 +635,7 @@ __after_prom_start: sub r5,r5,r11 #else /* just copy interrupts */ - LOAD_REG_IMMEDIATE_SYM(r5, FIXED_SYMBOL_ABS_ADDR(__end_interrupts)) + LOAD_REG_IMMEDIATE_SYM(r5, r11, FIXED_SYMBOL_ABS_ADDR(__end_interrupts)) #endif b 5f 3: -- cgit v1.2.3 From d57b78353a99f5d813248954dd7d0527a01751ac Mon Sep 17 00:00:00 2001 From: "Christopher M. Riedl" Date: Mon, 12 Aug 2019 22:13:12 -0500 Subject: powerpc/spinlocks: Refactor SHARED_PROCESSOR Determining if a processor is in shared processor mode is not a constant so don't hide it behind a #define. Signed-off-by: Christopher M. Riedl Reviewed-by: Andrew Donnellan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190813031314.1828-2-cmr@informatik.wtf --- arch/powerpc/include/asm/spinlock.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index a47f827bc5f1..e9c60fbcc8fe 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -101,15 +101,27 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock) #if defined(CONFIG_PPC_SPLPAR) /* We only yield to the hypervisor if we are in shared processor mode */ -#define SHARED_PROCESSOR (lppaca_shared_proc(local_paca->lppaca_ptr)) extern void __spin_yield(arch_spinlock_t *lock); extern void __rw_yield(arch_rwlock_t *lock); #else /* SPLPAR */ #define __spin_yield(x) barrier() #define __rw_yield(x) barrier() -#define SHARED_PROCESSOR 0 #endif +static inline bool is_shared_processor(void) +{ +/* + * LPPACA is only available on Pseries so guard anything LPPACA related to + * allow other platforms (which include this common header) to compile. + */ +#ifdef CONFIG_PPC_PSERIES + return (IS_ENABLED(CONFIG_PPC_SPLPAR) && + lppaca_shared_proc(local_paca->lppaca_ptr)); +#else + return false; +#endif +} + static inline void arch_spin_lock(arch_spinlock_t *lock) { while (1) { @@ -117,7 +129,7 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) break; do { HMT_low(); - if (SHARED_PROCESSOR) + if (is_shared_processor()) __spin_yield(lock); } while (unlikely(lock->slock != 0)); HMT_medium(); @@ -136,7 +148,7 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) local_irq_restore(flags); do { HMT_low(); - if (SHARED_PROCESSOR) + if (is_shared_processor()) __spin_yield(lock); } while (unlikely(lock->slock != 0)); HMT_medium(); @@ -226,7 +238,7 @@ static inline void arch_read_lock(arch_rwlock_t *rw) break; do { HMT_low(); - if (SHARED_PROCESSOR) + if (is_shared_processor()) __rw_yield(rw); } while (unlikely(rw->lock < 0)); HMT_medium(); @@ -240,7 +252,7 @@ static inline void arch_write_lock(arch_rwlock_t *rw) break; do { HMT_low(); - if (SHARED_PROCESSOR) + if (is_shared_processor()) __rw_yield(rw); } while (unlikely(rw->lock != 0)); HMT_medium(); -- cgit v1.2.3 From 31391ff7ea1ef557a804475436501f33ff0ead95 Mon Sep 17 00:00:00 2001 From: "Christopher M. Riedl" Date: Mon, 12 Aug 2019 22:13:13 -0500 Subject: powerpc/spinlocks: Rename SPLPAR-only spinlocks The __rw_yield and __spin_yield locks only pertain to SPLPAR mode. Rename them to make this relationship obvious. Signed-off-by: Christopher M. Riedl Reviewed-by: Andrew Donnellan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190813031314.1828-3-cmr@informatik.wtf --- arch/powerpc/include/asm/spinlock.h | 6 ++++-- arch/powerpc/lib/locks.c | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index e9c60fbcc8fe..0d04d468f660 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -101,8 +101,10 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock) #if defined(CONFIG_PPC_SPLPAR) /* We only yield to the hypervisor if we are in shared processor mode */ -extern void __spin_yield(arch_spinlock_t *lock); -extern void __rw_yield(arch_rwlock_t *lock); +void splpar_spin_yield(arch_spinlock_t *lock); +void splpar_rw_yield(arch_rwlock_t *lock); +#define __spin_yield(x) splpar_spin_yield(x) +#define __rw_yield(x) splpar_rw_yield(x) #else /* SPLPAR */ #define __spin_yield(x) barrier() #define __rw_yield(x) barrier() diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c index 6550b9e5ce5f..6440d5943c00 100644 --- a/arch/powerpc/lib/locks.c +++ b/arch/powerpc/lib/locks.c @@ -18,7 +18,7 @@ #include #include -void __spin_yield(arch_spinlock_t *lock) +void splpar_spin_yield(arch_spinlock_t *lock) { unsigned int lock_value, holder_cpu, yield_count; @@ -36,14 +36,14 @@ void __spin_yield(arch_spinlock_t *lock) plpar_hcall_norets(H_CONFER, get_hard_smp_processor_id(holder_cpu), yield_count); } -EXPORT_SYMBOL_GPL(__spin_yield); +EXPORT_SYMBOL_GPL(splpar_spin_yield); /* * Waiting for a read lock or a write lock on a rwlock... * This turns out to be the same for read and write locks, since * we only know the holder if it is write-locked. */ -void __rw_yield(arch_rwlock_t *rw) +void splpar_rw_yield(arch_rwlock_t *rw) { int lock_value; unsigned int holder_cpu, yield_count; -- cgit v1.2.3 From 405efc5980f2590a9520dc66cfe295456b8c9818 Mon Sep 17 00:00:00 2001 From: "Christopher M. Riedl" Date: Mon, 12 Aug 2019 22:13:14 -0500 Subject: powerpc/spinlocks: Fix oops in __spin_yield() on bare metal Booting w/ppc64le_defconfig + CONFIG_PREEMPT on bare metal results in the oops below due to calling into __spin_yield() when not running in an SPLPAR, which means lppaca pointers are NULL. We fixed a similar case previously in commit a6201da34ff9 ("powerpc: Fix oops due to bad access of lppaca on bare metal"), by adding SPLPAR checks in lppaca_shared_proc(). However when PREEMPT is enabled we can call __spin_yield() directly from arch_spin_yield(). To fix it add spin_yield() and rw_yield() which check that shared-processor LPAR is enabled before calling the SPLPAR-only implementation of each. BUG: Kernel NULL pointer dereference at 0x00000100 Faulting instruction address: 0xc000000000097f88 Oops: Kernel access of bad area, sig: 7 [#1] LE PAGE_SIZE=64K MMU=Radix MMU=Hash PREEMPT SMP NR_CPUS=2048 NUMA PowerNV Modules linked in: CPU: 0 PID: 2 Comm: kthreadd Not tainted 5.2.0-rc6-00491-g249155c20f9b #28 NIP: c000000000097f88 LR: c000000000c07a88 CTR: c00000000015ca10 REGS: c0000000727079f0 TRAP: 0300 Not tainted (5.2.0-rc6-00491-g249155c20f9b) MSR: 9000000002009033 CR: 84000424 XER: 20040000 CFAR: c000000000c07a84 DAR: 0000000000000100 DSISR: 00080000 IRQMASK: 1 GPR00: c000000000c07a88 c000000072707c80 c000000001546300 c00000007be38a80 GPR04: c0000000726f0c00 0000000000000002 c00000007279c980 0000000000000100 GPR08: c000000001581b78 0000000080000001 0000000000000008 c00000007279c9b0 GPR12: 0000000000000000 c000000001730000 c000000000142558 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR24: c00000007be38a80 c000000000c002f4 0000000000000000 0000000000000000 GPR28: c000000072221a00 c0000000726c2600 c00000007be38a80 c00000007be38a80 NIP [c000000000097f88] __spin_yield+0x48/0xa0 LR [c000000000c07a88] __raw_spin_lock+0xb8/0xc0 Call Trace: [c000000072707c80] [c000000072221a00] 0xc000000072221a00 (unreliable) [c000000072707cb0] [c000000000bffb0c] __schedule+0xbc/0x850 [c000000072707d70] [c000000000c002f4] schedule+0x54/0x130 [c000000072707da0] [c0000000001427dc] kthreadd+0x28c/0x2b0 [c000000072707e20] [c00000000000c1cc] ret_from_kernel_thread+0x5c/0x70 Instruction dump: 4d9e0020 552a043e 210a07ff 79080fe0 0b080000 3d020004 3908b878 794a1f24 e8e80000 7ce7502a e8e70000 38e70100 <7ca03c2c> 70a70001 78a50020 4d820020 ---[ end trace 474d6b2b8fc5cb7e ]--- Fixes: 499dcd41378e ("powerpc/64s: Allocate LPPACAs individually") Signed-off-by: Christopher M. Riedl [mpe: Reword change log a bit] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190813031314.1828-4-cmr@informatik.wtf --- arch/powerpc/include/asm/spinlock.h | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index 0d04d468f660..e9a960e28f3c 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -103,11 +103,9 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock) /* We only yield to the hypervisor if we are in shared processor mode */ void splpar_spin_yield(arch_spinlock_t *lock); void splpar_rw_yield(arch_rwlock_t *lock); -#define __spin_yield(x) splpar_spin_yield(x) -#define __rw_yield(x) splpar_rw_yield(x) #else /* SPLPAR */ -#define __spin_yield(x) barrier() -#define __rw_yield(x) barrier() +static inline void splpar_spin_yield(arch_spinlock_t *lock) {}; +static inline void splpar_rw_yield(arch_rwlock_t *lock) {}; #endif static inline bool is_shared_processor(void) @@ -124,6 +122,22 @@ static inline bool is_shared_processor(void) #endif } +static inline void spin_yield(arch_spinlock_t *lock) +{ + if (is_shared_processor()) + splpar_spin_yield(lock); + else + barrier(); +} + +static inline void rw_yield(arch_rwlock_t *lock) +{ + if (is_shared_processor()) + splpar_rw_yield(lock); + else + barrier(); +} + static inline void arch_spin_lock(arch_spinlock_t *lock) { while (1) { @@ -132,7 +146,7 @@ static inline void arch_spin_lock(arch_spinlock_t *lock) do { HMT_low(); if (is_shared_processor()) - __spin_yield(lock); + splpar_spin_yield(lock); } while (unlikely(lock->slock != 0)); HMT_medium(); } @@ -151,7 +165,7 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) do { HMT_low(); if (is_shared_processor()) - __spin_yield(lock); + splpar_spin_yield(lock); } while (unlikely(lock->slock != 0)); HMT_medium(); local_irq_restore(flags_dis); @@ -241,7 +255,7 @@ static inline void arch_read_lock(arch_rwlock_t *rw) do { HMT_low(); if (is_shared_processor()) - __rw_yield(rw); + splpar_rw_yield(rw); } while (unlikely(rw->lock < 0)); HMT_medium(); } @@ -255,7 +269,7 @@ static inline void arch_write_lock(arch_rwlock_t *rw) do { HMT_low(); if (is_shared_processor()) - __rw_yield(rw); + splpar_rw_yield(rw); } while (unlikely(rw->lock != 0)); HMT_medium(); } @@ -295,9 +309,9 @@ static inline void arch_write_unlock(arch_rwlock_t *rw) rw->lock = 0; } -#define arch_spin_relax(lock) __spin_yield(lock) -#define arch_read_relax(lock) __rw_yield(lock) -#define arch_write_relax(lock) __rw_yield(lock) +#define arch_spin_relax(lock) spin_yield(lock) +#define arch_read_relax(lock) rw_yield(lock) +#define arch_write_relax(lock) rw_yield(lock) /* See include/linux/spinlock.h */ #define smp_mb__after_spinlock() smp_mb() -- cgit v1.2.3 From 63ce271b5e377deaddace4bac6dafb6e79d2bee4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 26 Aug 2019 11:10:23 +0000 Subject: powerpc/prom: convert PROM_BUG() to standard trap Prior to commit 1bd98d7fbaf5 ("ppc64: Update BUG handling based on ppc32"), BUG() family was using BUG_ILLEGAL_INSTRUCTION which was an invalid instruction opcode to trap into program check exception. That commit converted them to using standard trap instructions, but prom/prom_init and their PROM_BUG() macro were left over. head_64.S and exception-64s.S were left aside as well. Convert them to using the standard BUG infrastructure. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/cdaf4bbbb64c288a077845846f04b12683f8875a.1566817807.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/bug.h | 8 -------- arch/powerpc/kernel/exceptions-64s.S | 3 ++- arch/powerpc/kernel/head_64.S | 6 ++++-- arch/powerpc/kernel/prom_init.c | 2 +- 4 files changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index fed7e6241349..f47e6ff6554d 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -5,14 +5,6 @@ #include -/* - * Define an illegal instr to trap on the bug. - * We don't use 0 because that marks the end of a function - * in the ELF ABI. That's "Boo Boo" in case you wonder... - */ -#define BUG_OPCODE .long 0x00b00b00 /* For asm */ -#define BUG_ILLEGAL_INSTR "0x00b00b00" /* For BUG macro */ - #ifdef CONFIG_BUG #ifdef __ASSEMBLY__ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 6ba3cc2ef8ab..dded4672579d 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1467,7 +1467,8 @@ EXC_COMMON_BEGIN(fp_unavailable_common) RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD bl kernel_fp_unavailable_exception - BUG_OPCODE +0: trap + EMIT_BUG_ENTRY 0b, __FILE__, __LINE__, 0 1: #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 0f2d61af47cc..ad79fddb974d 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -182,7 +182,8 @@ __secondary_hold: isync bctr #else - BUG_OPCODE +0: trap + EMIT_BUG_ENTRY 0b, __FILE__, __LINE__, 0 #endif CLOSE_FIXED_SECTION(first_256B) @@ -998,7 +999,8 @@ start_here_common: bl start_kernel /* Not reached */ - BUG_OPCODE + trap + EMIT_BUG_ENTRY 0b, __FILE__, __LINE__, 0 /* * We put a few things here that have to be page-aligned. diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 514707ef6779..f2b63b4e1943 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -94,7 +94,7 @@ static int of_workarounds __prombss; #define PROM_BUG() do { \ prom_printf("kernel BUG at %s line 0x%x!\n", \ __FILE__, __LINE__); \ - __asm__ __volatile__(".long " BUG_ILLEGAL_INSTR); \ + __builtin_trap(); \ } while (0) #ifdef DEBUG_PROM -- cgit v1.2.3 From a04565741284f695db4cfe5a3e61940d2259cb8f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 21 Aug 2019 20:00:34 +0000 Subject: powerpc/8xx: drop unused self-modifying code alternative to FixupDAR. The code which fixups the DAR on TLB errors for dbcX instructions has a self-modifying code alternative that has never been used. Drop it. Signed-off-by: Christophe Leroy Reviewed-by: Joakim Tjernlund Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b095e12c82fcba1ac4c09fc3b85d969f36614746.1566417610.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/head_8xx.S | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 5ab9178c2347..5db461db63cc 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -574,8 +574,6 @@ InstructionBreakpoint: * by decoding the registers used by the dcbx instruction and adding them. * DAR is set to the calculated address. */ - /* define if you don't want to use self modifying code */ -#define NO_SELF_MODIFYING_CODE FixupDAR:/* Entry point for dcbx workaround. */ mtspr SPRN_M_TW, r10 /* fetch instruction from memory. */ @@ -639,27 +637,6 @@ FixupDAR:/* Entry point for dcbx workaround. */ rlwinm r10, r10,0,7,5 /* Clear store bit for buggy dcbst insn */ mtspr SPRN_DSISR, r10 142: /* continue, it was a dcbx, dcbi instruction. */ -#ifndef NO_SELF_MODIFYING_CODE - andis. r10,r11,0x1f /* test if reg RA is r0 */ - li r10,modified_instr@l - dcbtst r0,r10 /* touch for store */ - rlwinm r11,r11,0,0,20 /* Zero lower 10 bits */ - oris r11,r11,640 /* Transform instr. to a "add r10,RA,RB" */ - ori r11,r11,532 - stw r11,0(r10) /* store add/and instruction */ - dcbf 0,r10 /* flush new instr. to memory. */ - icbi 0,r10 /* invalidate instr. cache line */ - mfspr r11, SPRN_SPRG_SCRATCH1 /* restore r11 */ - mfspr r10, SPRN_SPRG_SCRATCH0 /* restore r10 */ - isync /* Wait until new instr is loaded from memory */ -modified_instr: - .space 4 /* this is where the add instr. is stored */ - bne+ 143f - subf r10,r0,r10 /* r10=r10-r0, only if reg RA is r0 */ -143: mtdar r10 /* store faulting EA in DAR */ - mfspr r10,SPRN_M_TW - b DARFixed /* Go back to normal TLB handling */ -#else mfctr r10 mtdar r10 /* save ctr reg in DAR */ rlwinm r10, r11, 24, 24, 28 /* offset into jump table for reg RB */ @@ -723,7 +700,6 @@ modified_instr: add r10, r10, r11 /* add it */ mfctr r11 /* restore r11 */ b 151b -#endif /* * This is where the main kernel code starts. -- cgit v1.2.3 From 3bbd2343734e44de92238eea1a5cd3ad32a6baf0 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 21 Aug 2019 10:20:51 +0000 Subject: powerpc/8xx: set STACK_END_MAGIC earlier on the init_stack Today, the STACK_END_MAGIC is set on init_stack in start_kernel(). To avoid a false 'Thread overran stack, or stack corrupted' message on early Oopses, setup STACK_END_MAGIC as soon as possible. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/54f67bb7ac486c1350f2fa8905cd279f94b9dfb1.1566382841.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/head_8xx.S | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 5db461db63cc..19f583e18402 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -717,6 +718,9 @@ start_here: /* stack */ lis r1,init_thread_union@ha addi r1,r1,init_thread_union@l + lis r0, STACK_END_MAGIC@h + ori r0, r0, STACK_END_MAGIC@l + stw r0, 0(r1) li r0,0 stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1) -- cgit v1.2.3 From f7a0bf7d904e902e13024986b7b357181ee30849 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 26 Aug 2019 15:52:13 +0000 Subject: powerpc/32s: add an option to exclusively select powerpc 601 Powerpc 601 is rather old powerpc which as some important limitations compared to other book3s/32 powerpcs: - No Timebase. - Common BATs for instruction and data. - No execution protection in segment registers. - No RI bit in MSR - ... It is starting to be difficult and cumbersome to maintain kernels that are compatible both with 601 and other 6xx cores. Create a compiletime option to exclusively select either powerpc 601 or other 6xx. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/d644eaf7dff8cc149260066802af230bdf34fded.1566834712.git.christophe.leroy@c-s.fr --- arch/powerpc/platforms/Kconfig.cputype | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 56a7c814160d..68c5cc075bfc 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -6,6 +6,9 @@ config PPC64 This option selects whether a 32-bit or a 64-bit kernel will be built. +config PPC_BOOK3S_32 + bool + menu "Processor support" choice prompt "Processor Type" @@ -21,13 +24,20 @@ choice If unsure, select 52xx/6xx/7xx/74xx/82xx/83xx/86xx. -config PPC_BOOK3S_32 - bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx" +config PPC_BOOK3S_6xx + bool "512x/52xx/6xx/7xx/74xx/82xx/83xx/86xx except 601" + select PPC_BOOK3S_32 select PPC_FPU select PPC_HAVE_PMU_SUPPORT select PPC_HAVE_KUEP select PPC_HAVE_KUAP +config PPC_BOOK3S_601 + bool "PowerPC 601" + select PPC_BOOK3S_32 + select PPC_FPU + select PPC_HAVE_KUAP + config PPC_85xx bool "Freescale 85xx" select E500 -- cgit v1.2.3 From 12c3f1fd87bf4e55f06d079a45d6f15e2f6f9750 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 26 Aug 2019 15:52:14 +0000 Subject: powerpc/32s: get rid of CPU_FTR_601 feature Now that 601 is exclusive from other 6xx, CPU_FTR_601 and associated fixups are useless. Drop this feature and use #ifdefs instead. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/ecdb7194a17dbfa01865df6a82979533adc2c70b.1566834712.git.christophe.leroy@c-s.fr --- arch/powerpc/configs/pmac32_defconfig | 1 - arch/powerpc/include/asm/cputable.h | 14 ++++++++------ arch/powerpc/include/asm/ppc_asm.h | 28 +++++++--------------------- arch/powerpc/include/asm/ptrace.h | 6 +++++- arch/powerpc/include/asm/timex.h | 34 +++------------------------------- arch/powerpc/kernel/cputable.c | 6 ++++-- arch/powerpc/kernel/entry_32.S | 22 ++++++++++++++++------ arch/powerpc/mm/book3s32/mmu.c | 15 ++++++--------- arch/powerpc/mm/ptdump/bats.c | 2 +- arch/powerpc/platforms/Kconfig | 3 ++- 10 files changed, 52 insertions(+), 79 deletions(-) diff --git a/arch/powerpc/configs/pmac32_defconfig b/arch/powerpc/configs/pmac32_defconfig index 7e6654848531..4e6e95f92646 100644 --- a/arch/powerpc/configs/pmac32_defconfig +++ b/arch/powerpc/configs/pmac32_defconfig @@ -20,7 +20,6 @@ CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_GOV_POWERSAVE=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_PMAC=y -CONFIG_PPC601_SYNC_FIX=y CONFIG_GEN_RTC=y CONFIG_HIGHMEM=y CONFIG_BINFMT_MISC=m diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index d05f0c28e515..0aad095896d6 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -145,7 +145,6 @@ static inline void cpu_feature_keys_init(void) { } /* Definitions for features that only exist on 32-bit chips */ #ifdef CONFIG_PPC32 -#define CPU_FTR_601 ASM_CONST(0x00001000) #define CPU_FTR_L2CR ASM_CONST(0x00002000) #define CPU_FTR_SPEC7450 ASM_CONST(0x00004000) #define CPU_FTR_TAU ASM_CONST(0x00008000) @@ -167,7 +166,6 @@ static inline void cpu_feature_keys_init(void) { } #else /* CONFIG_PPC32 */ /* Define these to 0 for the sake of tests in common code */ -#define CPU_FTR_601 (0) #define CPU_FTR_PPC_LE (0) #endif @@ -294,7 +292,7 @@ static inline void cpu_feature_keys_init(void) { } #define CPU_FTR_MAYBE_CAN_NAP 0 #endif -#define CPU_FTRS_PPC601 (CPU_FTR_COMMON | CPU_FTR_601 | \ +#define CPU_FTRS_PPC601 (CPU_FTR_COMMON | \ CPU_FTR_COHERENT_ICACHE | CPU_FTR_UNIFIED_ID_CACHE | CPU_FTR_USE_RTC) #define CPU_FTRS_603 (CPU_FTR_COMMON | CPU_FTR_MAYBE_CAN_DOZE | \ CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_PPC_LE | CPU_FTR_NOEXECUTE) @@ -498,7 +496,9 @@ static inline void cpu_feature_keys_init(void) { } #else enum { CPU_FTRS_POSSIBLE = -#ifdef CONFIG_PPC_BOOK3S_32 +#ifdef CONFIG_PPC_BOOK3S_601 + CPU_FTRS_PPC601 | +#elif defined(CONFIG_PPC_BOOK3S_32) CPU_FTRS_PPC601 | CPU_FTRS_603 | CPU_FTRS_604 | CPU_FTRS_740_NOTAU | CPU_FTRS_740 | CPU_FTRS_750 | CPU_FTRS_750FX1 | CPU_FTRS_750FX2 | CPU_FTRS_750FX | CPU_FTRS_750GX | @@ -574,8 +574,10 @@ enum { #else enum { CPU_FTRS_ALWAYS = -#ifdef CONFIG_PPC_BOOK3S_32 - CPU_FTRS_PPC601 & CPU_FTRS_603 & CPU_FTRS_604 & CPU_FTRS_740_NOTAU & +#ifdef CONFIG_PPC_BOOK3S_601 + CPU_FTRS_PPC601 & +#elif defined(CONFIG_PPC_BOOK3S_32) + CPU_FTRS_603 & CPU_FTRS_604 & CPU_FTRS_740_NOTAU & CPU_FTRS_740 & CPU_FTRS_750 & CPU_FTRS_750FX1 & CPU_FTRS_750FX2 & CPU_FTRS_750FX & CPU_FTRS_750GX & CPU_FTRS_7400_NOTAU & CPU_FTRS_7400 & CPU_FTRS_7450_20 & diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index dd3b191bdcea..6b03dff61a05 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -383,19 +383,9 @@ n: /* various errata or part fixups */ #ifdef CONFIG_PPC601_SYNC_FIX -#define SYNC \ -BEGIN_FTR_SECTION \ - sync; \ - isync; \ -END_FTR_SECTION_IFSET(CPU_FTR_601) -#define SYNC_601 \ -BEGIN_FTR_SECTION \ - sync; \ -END_FTR_SECTION_IFSET(CPU_FTR_601) -#define ISYNC_601 \ -BEGIN_FTR_SECTION \ - isync; \ -END_FTR_SECTION_IFSET(CPU_FTR_601) +#define SYNC sync; isync +#define SYNC_601 sync +#define ISYNC_601 isync #else #define SYNC #define SYNC_601 @@ -421,15 +411,11 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, CPU_FTR_CELL_TB_BUG, 96) #define MFTBU(dest) mfspr dest, SPRN_TBRU #endif -#ifndef CONFIG_SMP -#define TLBSYNC -#else /* CONFIG_SMP */ /* tlbsync is not implemented on 601 */ -#define TLBSYNC \ -BEGIN_FTR_SECTION \ - tlbsync; \ - sync; \ -END_FTR_SECTION_IFCLR(CPU_FTR_601) +#if !defined(CONFIG_SMP) || defined(CONFIG_PPC_BOOK3S_601) +#define TLBSYNC +#else +#define TLBSYNC tlbsync; sync #endif #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index feee1b21bbd5..ee3ada66deb5 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -203,7 +203,11 @@ do { \ #endif /* __powerpc64__ */ #define arch_has_single_step() (1) -#define arch_has_block_step() (!cpu_has_feature(CPU_FTR_601)) +#ifndef CONFIG_BOOK3S_601 +#define arch_has_block_step() (true) +#else +#define arch_has_block_step() (false) +#endif #define ARCH_HAS_USER_SINGLE_STEP_REPORT /* diff --git a/arch/powerpc/include/asm/timex.h b/arch/powerpc/include/asm/timex.h index 926b9f91a3ef..d2d2c4bd8435 100644 --- a/arch/powerpc/include/asm/timex.h +++ b/arch/powerpc/include/asm/timex.h @@ -17,38 +17,10 @@ typedef unsigned long cycles_t; static inline cycles_t get_cycles(void) { -#ifdef __powerpc64__ + if (IS_ENABLED(CONFIG_BOOK3S_601)) + return 0; + return mftb(); -#else - cycles_t ret; - - /* - * For the "cycle" counter we use the timebase lower half. - * Currently only used on SMP. - */ - - ret = 0; - - __asm__ __volatile__( -#ifdef CONFIG_PPC_8xx - "97: mftb %0\n" -#else - "97: mfspr %0, %2\n" -#endif - "99:\n" - ".section __ftr_fixup,\"a\"\n" - ".align 2\n" - "98:\n" - " .long %1\n" - " .long 0\n" - " .long 97b-98b\n" - " .long 99b-98b\n" - " .long 0\n" - " .long 0\n" - ".previous" - : "=r" (ret) : "i" (CPU_FTR_601), "i" (SPRN_TBRL)); - return ret; -#endif } #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index bfe5f4a2886b..e745abc5457a 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -569,7 +569,7 @@ static struct cpu_spec __initdata cpu_specs[] = { #endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC32 -#ifdef CONFIG_PPC_BOOK3S_32 +#ifdef CONFIG_PPC_BOOK3S_601 { /* 601 */ .pvr_mask = 0xffff0000, .pvr_value = 0x00010000, @@ -583,6 +583,8 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check = machine_check_generic, .platform = "ppc601", }, +#endif /* CONFIG_PPC_BOOK3S_601 */ +#ifdef CONFIG_PPC_BOOK3S_6xx { /* 603 */ .pvr_mask = 0xffff0000, .pvr_value = 0x00030000, @@ -1212,7 +1214,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check = machine_check_generic, .platform = "ppc603", }, -#endif /* CONFIG_PPC_BOOK3S_32 */ +#endif /* CONFIG_PPC_BOOK3S_6xx */ #ifdef CONFIG_PPC_8xx { /* 8xx */ .pvr_mask = 0xffff0000, diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 972b05504a0a..d60908ea37fb 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -777,11 +777,19 @@ fast_exception_return: 1: lis r3,exc_exit_restart_end@ha addi r3,r3,exc_exit_restart_end@l cmplw r12,r3 +#if CONFIG_PPC_BOOK3S_601 + bge 2b +#else bge 3f +#endif lis r4,exc_exit_restart@ha addi r4,r4,exc_exit_restart@l cmplw r12,r4 +#if CONFIG_PPC_BOOK3S_601 + blt 2b +#else blt 3f +#endif lis r3,fee_restarts@ha tophys(r3,r3) lwz r5,fee_restarts@l(r3) @@ -800,9 +808,6 @@ fee_restarts: /* aargh, we don't know which trap this is */ /* but the 601 doesn't implement the RI bit, so assume it's OK */ 3: -BEGIN_FTR_SECTION - b 2b -END_FTR_SECTION_IFSET(CPU_FTR_601) li r10,-1 stw r10,_TRAP(r11) addi r3,r1,STACK_FRAME_OVERHEAD @@ -1270,11 +1275,19 @@ nonrecoverable: lis r10,exc_exit_restart_end@ha addi r10,r10,exc_exit_restart_end@l cmplw r12,r10 +#ifdef CONFIG_PPC_BOOK3S_601 + bgelr +#else bge 3f +#endif lis r11,exc_exit_restart@ha addi r11,r11,exc_exit_restart@l cmplw r12,r11 +#ifdef CONFIG_PPC_BOOK3S_601 + bltlr +#else blt 3f +#endif lis r10,ee_restarts@ha lwz r12,ee_restarts@l(r10) addi r12,r12,1 @@ -1283,9 +1296,6 @@ nonrecoverable: blr 3: /* OK, we can't recover, kill this process */ /* but the 601 doesn't implement the RI bit, so assume it's OK */ -BEGIN_FTR_SECTION - blr -END_FTR_SECTION_IFSET(CPU_FTR_601) lwz r3,_TRAP(r1) andi. r0,r3,1 beq 5f diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 50a1991d257f..84d5fab94f8f 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -74,7 +74,7 @@ static int find_free_bat(void) { int b; - if (cpu_has_feature(CPU_FTR_601)) { + if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) { for (b = 0; b < 4; b++) { struct ppc_bat *bat = BATS[b]; @@ -106,7 +106,7 @@ static int find_free_bat(void) */ static unsigned int block_size(unsigned long base, unsigned long top) { - unsigned int max_size = (cpu_has_feature(CPU_FTR_601) ? 8 : 256) << 20; + unsigned int max_size = IS_ENABLED(CONFIG_PPC_BOOK3S_601) ? SZ_8M : SZ_256M; unsigned int base_shift = (ffs(base) - 1) & 31; unsigned int block_shift = (fls(top - base) - 1) & 31; @@ -189,7 +189,7 @@ void mmu_mark_initmem_nx(void) unsigned long top = (unsigned long)_etext - PAGE_OFFSET; unsigned long size; - if (cpu_has_feature(CPU_FTR_601)) + if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) return; for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) { @@ -227,7 +227,7 @@ void mmu_mark_rodata_ro(void) int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; int i; - if (cpu_has_feature(CPU_FTR_601)) + if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) return; for (i = 0; i < nb; i++) { @@ -259,7 +259,7 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys, flags &= ~_PAGE_COHERENT; bl = (size >> 17) - 1; - if (PVR_VER(mfspr(SPRN_PVR)) != 1) { + if (!IS_ENABLED(CONFIG_PPC_BOOK3S_601)) { /* 603, 604, etc. */ /* Do DBAT first */ wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE @@ -441,7 +441,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, BUG_ON(first_memblock_base != 0); /* 601 can only access 16MB at the moment */ - if (PVR_VER(mfspr(SPRN_PVR)) == 1) + if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01000000)); else /* Anything else has 256M mapped */ memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000)); @@ -459,9 +459,6 @@ void __init setup_kuep(bool disabled) { pr_info("Activating Kernel Userspace Execution Prevention\n"); - if (cpu_has_feature(CPU_FTR_601)) - pr_warn("KUEP is not working on powerpc 601 (No NX bit in Seg Regs)\n"); - if (disabled) pr_warn("KUEP cannot be disabled yet on 6xx when compiled in\n"); } diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c index a0d23e96e841..4154feac1da3 100644 --- a/arch/powerpc/mm/ptdump/bats.c +++ b/arch/powerpc/mm/ptdump/bats.c @@ -149,7 +149,7 @@ static int bats_show_603(struct seq_file *m, void *v) static int bats_open(struct inode *inode, struct file *file) { - if (cpu_has_feature(CPU_FTR_601)) + if (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) return single_open(file, bats_show_601, NULL); return single_open(file, bats_show_603, NULL); diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index f3fb79fccc72..d82e3664ffdf 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -197,7 +197,8 @@ endmenu config PPC601_SYNC_FIX bool "Workarounds for PPC601 bugs" - depends on PPC_BOOK3S_32 && PPC_PMAC + depends on PPC_BOOK3S_601 && PPC_PMAC + default y help Some versions of the PPC601 (the first PowerPC chip) have bugs which mean that extra synchronization instructions are required near -- cgit v1.2.3 From 88fb309409ab454b497a6abb0f931ce3b6d9969c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 26 Aug 2019 15:52:16 +0000 Subject: powerpc/32s: drop CPU_FTR_USE_RTC feature CPU_FTR_USE_RTC feature only applies to powerpc601. Drop this feature and replace it with tests on CONFIG_PPC_BOOK3S_601. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/170411e2360861f4a95c21faad43519a08bc4040.1566834712.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/cputable.h | 3 +-- arch/powerpc/include/asm/time.h | 6 +----- arch/powerpc/kernel/vdso.c | 22 ---------------------- arch/powerpc/kernel/vdso32/datapage.S | 2 ++ arch/powerpc/kernel/vdso32/vdso32.lds.S | 4 +++- 5 files changed, 7 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index 0aad095896d6..fac1b81bbbd9 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -149,7 +149,6 @@ static inline void cpu_feature_keys_init(void) { } #define CPU_FTR_SPEC7450 ASM_CONST(0x00004000) #define CPU_FTR_TAU ASM_CONST(0x00008000) #define CPU_FTR_CAN_DOZE ASM_CONST(0x00010000) -#define CPU_FTR_USE_RTC ASM_CONST(0x00020000) #define CPU_FTR_L3CR ASM_CONST(0x00040000) #define CPU_FTR_L3_DISABLE_NAP ASM_CONST(0x00080000) #define CPU_FTR_NAP_DISABLE_L2_PR ASM_CONST(0x00100000) @@ -293,7 +292,7 @@ static inline void cpu_feature_keys_init(void) { } #endif #define CPU_FTRS_PPC601 (CPU_FTR_COMMON | \ - CPU_FTR_COHERENT_ICACHE | CPU_FTR_UNIFIED_ID_CACHE | CPU_FTR_USE_RTC) + CPU_FTR_COHERENT_ICACHE | CPU_FTR_UNIFIED_ID_CACHE) #define CPU_FTRS_603 (CPU_FTR_COMMON | CPU_FTR_MAYBE_CAN_DOZE | \ CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_PPC_LE | CPU_FTR_NOEXECUTE) #define CPU_FTRS_604 (CPU_FTR_COMMON | CPU_FTR_PPC_LE) diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 54f4ec1f9fab..08dbe3e6831c 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -41,11 +41,7 @@ struct div_result { /* Accessor functions for the timebase (RTC on 601) registers. */ /* If one day CONFIG_POWER is added just define __USE_RTC as 1 */ -#ifdef CONFIG_PPC_BOOK3S_32 -#define __USE_RTC() (cpu_has_feature(CPU_FTR_USE_RTC)) -#else -#define __USE_RTC() 0 -#endif +#define __USE_RTC() (IS_ENABLED(CONFIG_PPC_BOOK3S_601)) #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index d60598113a9f..eae9ddaecbcf 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -94,28 +94,6 @@ static struct vdso_patch_def vdso_patches[] = { CPU_FTR_COHERENT_ICACHE, CPU_FTR_COHERENT_ICACHE, "__kernel_sync_dicache", "__kernel_sync_dicache_p5" }, -#ifdef CONFIG_PPC32 - { - CPU_FTR_USE_RTC, CPU_FTR_USE_RTC, - "__kernel_gettimeofday", NULL - }, - { - CPU_FTR_USE_RTC, CPU_FTR_USE_RTC, - "__kernel_clock_gettime", NULL - }, - { - CPU_FTR_USE_RTC, CPU_FTR_USE_RTC, - "__kernel_clock_getres", NULL - }, - { - CPU_FTR_USE_RTC, CPU_FTR_USE_RTC, - "__kernel_get_tbfreq", NULL - }, - { - CPU_FTR_USE_RTC, CPU_FTR_USE_RTC, - "__kernel_time", NULL - }, -#endif }; /* diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S index 6984125b9fc0..6c7401bd284e 100644 --- a/arch/powerpc/kernel/vdso32/datapage.S +++ b/arch/powerpc/kernel/vdso32/datapage.S @@ -70,6 +70,7 @@ V_FUNCTION_END(__kernel_get_syscall_map) * * returns the timebase frequency in HZ */ +#ifndef CONFIG_PPC_BOOK3S_601 V_FUNCTION_BEGIN(__kernel_get_tbfreq) .cfi_startproc mflr r12 @@ -82,3 +83,4 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq) blr .cfi_endproc V_FUNCTION_END(__kernel_get_tbfreq) +#endif diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S index 099a6db14e67..00c025ba4a92 100644 --- a/arch/powerpc/kernel/vdso32/vdso32.lds.S +++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S @@ -144,10 +144,13 @@ VERSION __kernel_datapage_offset; __kernel_get_syscall_map; +#ifndef CONFIG_PPC_BOOK3S_601 __kernel_gettimeofday; __kernel_clock_gettime; __kernel_clock_getres; + __kernel_time; __kernel_get_tbfreq; +#endif __kernel_sync_dicache; __kernel_sync_dicache_p5; __kernel_sigtramp32; @@ -155,7 +158,6 @@ VERSION #ifdef CONFIG_PPC64 __kernel_getcpu; #endif - __kernel_time; local: *; }; -- cgit v1.2.3 From 39097b9c6d762d3fcd6f753e05ee3e34ec250ff3 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 26 Aug 2019 15:52:17 +0000 Subject: powerpc/32s: use CONFIG_PPC_BOOK3S_601 instead of reading PVR Use CONFIG_PPC_BOOK3S_601 instead of reading PVR to know if it is a 601 or not. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/909c26db9facd7fe454695b303f952e019dd9eda.1566834712.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/head_32.S | 49 +++++++++++++++++++------------------------ arch/powerpc/kernel/misc_32.S | 6 ++---- 2 files changed, 24 insertions(+), 31 deletions(-) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 9e6f01abb31e..4a24f8f026c7 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -34,7 +34,16 @@ #include "head_32.h" -/* 601 only have IBAT; cr0.eq is set on 601 when using this macro */ +/* 601 only have IBAT */ +#ifdef CONFIG_PPC_BOOK3S_601 +#define LOAD_BAT(n, reg, RA, RB) \ + li RA,0; \ + mtspr SPRN_IBAT##n##U,RA; \ + lwz RA,(n*16)+0(reg); \ + lwz RB,(n*16)+4(reg); \ + mtspr SPRN_IBAT##n##U,RA; \ + mtspr SPRN_IBAT##n##L,RB +#else #define LOAD_BAT(n, reg, RA, RB) \ /* see the comment for clear_bats() -- Cort */ \ li RA,0; \ @@ -44,12 +53,11 @@ lwz RB,(n*16)+4(reg); \ mtspr SPRN_IBAT##n##U,RA; \ mtspr SPRN_IBAT##n##L,RB; \ - beq 1f; \ lwz RA,(n*16)+8(reg); \ lwz RB,(n*16)+12(reg); \ mtspr SPRN_DBAT##n##U,RA; \ - mtspr SPRN_DBAT##n##L,RB; \ -1: + mtspr SPRN_DBAT##n##L,RB +#endif __HEAD .stabs "arch/powerpc/kernel/",N_SO,0,0,0f @@ -820,9 +828,6 @@ load_up_mmu: /* Load the BAT registers with the values set up by MMU_init. MMU_init takes care of whether we're on a 601 or not. */ - mfpvr r3 - srwi r3,r3,16 - cmpwi r3,1 lis r3,BATS@ha addi r3,r3,BATS@l tophys(r3,r3) @@ -998,11 +1003,8 @@ EXPORT_SYMBOL(switch_mmu_context) */ clear_bats: li r10,0 - mfspr r9,SPRN_PVR - rlwinm r9,r9,16,16,31 /* r9 = 1 for 601, 4 for 604 */ - cmpwi r9, 1 - beq 1f +#ifndef CONFIG_PPC_BOOK3S_601 mtspr SPRN_DBAT0U,r10 mtspr SPRN_DBAT0L,r10 mtspr SPRN_DBAT1U,r10 @@ -1011,7 +1013,7 @@ clear_bats: mtspr SPRN_DBAT2L,r10 mtspr SPRN_DBAT3U,r10 mtspr SPRN_DBAT3L,r10 -1: +#endif mtspr SPRN_IBAT0U,r10 mtspr SPRN_IBAT0L,r10 mtspr SPRN_IBAT1U,r10 @@ -1106,10 +1108,7 @@ mmu_off: */ initial_bats: lis r11,PAGE_OFFSET@h - mfspr r9,SPRN_PVR - rlwinm r9,r9,16,16,31 /* r9 = 1 for 601, 4 for 604 */ - cmpwi 0,r9,1 - bne 4f +#ifdef CONFIG_PPC_BOOK3S_601 ori r11,r11,4 /* set up BAT registers for 601 */ li r8,0x7f /* valid, block length = 8MB */ mtspr SPRN_IBAT0U,r11 /* N.B. 601 has valid bit in */ @@ -1122,10 +1121,8 @@ initial_bats: addis r8,r8,0x800000@h mtspr SPRN_IBAT2U,r11 mtspr SPRN_IBAT2L,r8 - isync - blr - -4: tophys(r8,r11) +#else + tophys(r8,r11) #ifdef CONFIG_SMP ori r8,r8,0x12 /* R/W access, M=1 */ #else @@ -1137,10 +1134,10 @@ initial_bats: mtspr SPRN_DBAT0U,r11 /* bit in upper BAT register */ mtspr SPRN_IBAT0L,r8 mtspr SPRN_IBAT0U,r11 +#endif isync blr - #ifdef CONFIG_BOOTX_TEXT setup_disp_bat: /* @@ -1155,15 +1152,13 @@ setup_disp_bat: beqlr lwz r11,0(r8) lwz r8,4(r8) - mfspr r9,SPRN_PVR - rlwinm r9,r9,16,16,31 /* r9 = 1 for 601, 4 for 604 */ - cmpwi 0,r9,1 - beq 1f +#ifndef CONFIG_PPC_BOOK3S_601 mtspr SPRN_DBAT3L,r8 mtspr SPRN_DBAT3U,r11 - blr -1: mtspr SPRN_IBAT3L,r8 +#else + mtspr SPRN_IBAT3L,r8 mtspr SPRN_IBAT3U,r11 +#endif blr #endif /* CONFIG_BOOTX_TEXT */ diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 02d90e1ebf65..b917641cdaa6 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -303,11 +303,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_UNIFIED_ID_CACHE) mfspr r3,SPRN_L1CSR1 ori r3,r3,L1CSR1_ICFI|L1CSR1_ICLFR mtspr SPRN_L1CSR1,r3 +#elif defined(CONFIG_PPC_BOOK3S_601) + blr /* for 601, do nothing */ #else - mfspr r3,SPRN_PVR - rlwinm r3,r3,16,16,31 - cmpwi 0,r3,1 - beqlr /* for 601, do nothing */ /* 603/604 processor - use invalidate-all bit in HID0 */ mfspr r3,SPRN_HID0 ori r3,r3,HID0_ICFI -- cgit v1.2.3 From e0291f1decd6e8d447067f7d2cf01b1091b7cb3f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 26 Aug 2019 15:52:18 +0000 Subject: powerpc/32: drop CPU_FTR_UNIFIED_ID_CACHE Only 601 and e200 have unified I/D cache. Drop the feature and use CONFIG_PPC_BOOK3S_601 and CONFIG_E200. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b5902144266d2f4eed1ffea53915bd0245841e02.1566834712.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/cputable.h | 5 ++--- arch/powerpc/kernel/misc_32.S | 4 ++-- arch/powerpc/kernel/setup_32.c | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index fac1b81bbbd9..a1ebcbc3931f 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -158,7 +158,6 @@ static inline void cpu_feature_keys_init(void) { } #define CPU_FTR_NEED_COHERENT ASM_CONST(0x01000000) #define CPU_FTR_NO_BTIC ASM_CONST(0x02000000) #define CPU_FTR_PPC_LE ASM_CONST(0x04000000) -#define CPU_FTR_UNIFIED_ID_CACHE ASM_CONST(0x08000000) #define CPU_FTR_SPE ASM_CONST(0x10000000) #define CPU_FTR_NEED_PAIRED_STWCX ASM_CONST(0x20000000) #define CPU_FTR_INDEXED_DCR ASM_CONST(0x40000000) @@ -292,7 +291,7 @@ static inline void cpu_feature_keys_init(void) { } #endif #define CPU_FTRS_PPC601 (CPU_FTR_COMMON | \ - CPU_FTR_COHERENT_ICACHE | CPU_FTR_UNIFIED_ID_CACHE) + CPU_FTR_COHERENT_ICACHE) #define CPU_FTRS_603 (CPU_FTR_COMMON | CPU_FTR_MAYBE_CAN_DOZE | \ CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_PPC_LE | CPU_FTR_NOEXECUTE) #define CPU_FTRS_604 (CPU_FTR_COMMON | CPU_FTR_PPC_LE) @@ -383,7 +382,7 @@ static inline void cpu_feature_keys_init(void) { } #define CPU_FTRS_47X (CPU_FTRS_440x6) #define CPU_FTRS_E200 (CPU_FTR_SPE_COMP | \ CPU_FTR_NODSISRALIGN | CPU_FTR_COHERENT_ICACHE | \ - CPU_FTR_UNIFIED_ID_CACHE | CPU_FTR_NOEXECUTE | \ + CPU_FTR_NOEXECUTE | \ CPU_FTR_DEBUG_LVL_EXC) #define CPU_FTRS_E500 (CPU_FTR_MAYBE_CAN_DOZE | \ CPU_FTR_SPE_COMP | CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_NODSISRALIGN | \ diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index b917641cdaa6..3d21fb110797 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -292,14 +292,14 @@ _GLOBAL(flush_instruction_cache) iccci 0,r3 #endif #elif defined(CONFIG_FSL_BOOKE) -BEGIN_FTR_SECTION +#ifdef CONFIG_E200 mfspr r3,SPRN_L1CSR0 ori r3,r3,L1CSR0_CFI|L1CSR0_CLFC /* msync; isync recommended here */ mtspr SPRN_L1CSR0,r3 isync blr -END_FTR_SECTION_IFSET(CPU_FTR_UNIFIED_ID_CACHE) +#endif mfspr r3,SPRN_L1CSR1 ori r3,r3,L1CSR1_ICFI|L1CSR1_ICLFR mtspr SPRN_L1CSR1,r3 diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 94517e4a2723..a7541edf0cdb 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -206,6 +206,6 @@ __init void initialize_cache_info(void) dcache_bsize = cur_cpu_spec->dcache_bsize; icache_bsize = cur_cpu_spec->icache_bsize; ucache_bsize = 0; - if (cpu_has_feature(CPU_FTR_UNIFIED_ID_CACHE)) + if (IS_ENABLED(CONFIG_PPC_BOOK3S_601) || IS_ENABLED(CONFIG_E200)) ucache_bsize = icache_bsize = dcache_bsize; } -- cgit v1.2.3 From c7bf1252d5b3891b4ab7072b240a8422fb9da793 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 26 Aug 2019 15:52:19 +0000 Subject: powerpc/32: don't use CPU_FTR_COHERENT_ICACHE Only 601 and E200 have CPU_FTR_COHERENT_ICACHE. Just use #ifdefs instead of feature fixup. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/5f3e92ccd64d06477b27626f6007a9da3b8da157.1566834712.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/misc_32.S | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 3d21fb110797..82df4b09e79f 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -324,10 +324,10 @@ EXPORT_SYMBOL(flush_instruction_cache) * flush_icache_range(unsigned long start, unsigned long stop) */ _GLOBAL(flush_icache_range) -BEGIN_FTR_SECTION +#if defined(CONFIG_PPC_BOOK3S_601) || defined(CONFIG_E200) PURGE_PREFETCHED_INS - blr /* for 601, do nothing */ -END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) + blr /* for 601 and e200, do nothing */ +#else rlwinm r3,r3,0,0,31 - L1_CACHE_SHIFT subf r4,r3,r4 addi r4,r4,L1_CACHE_BYTES - 1 @@ -353,6 +353,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) sync /* additional sync needed on g4 */ isync blr +#endif _ASM_NOKPROBE_SYMBOL(flush_icache_range) EXPORT_SYMBOL(flush_icache_range) @@ -360,15 +361,15 @@ EXPORT_SYMBOL(flush_icache_range) * Flush a particular page from the data cache to RAM. * Note: this is necessary because the instruction cache does *not* * snoop from the data cache. - * This is a no-op on the 601 which has a unified cache. + * This is a no-op on the 601 and e200 which have a unified cache. * * void __flush_dcache_icache(void *page) */ _GLOBAL(__flush_dcache_icache) -BEGIN_FTR_SECTION +#if defined(CONFIG_PPC_BOOK3S_601) || defined(CONFIG_E200) PURGE_PREFETCHED_INS blr -END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) +#else rlwinm r3,r3,0,0,31-PAGE_SHIFT /* Get page base address */ li r4,PAGE_SIZE/L1_CACHE_BYTES /* Number of lines in a page */ mtctr r4 @@ -396,6 +397,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_44x) sync isync blr +#endif #ifndef CONFIG_BOOKE /* @@ -407,10 +409,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_44x) * void __flush_dcache_icache_phys(unsigned long physaddr) */ _GLOBAL(__flush_dcache_icache_phys) -BEGIN_FTR_SECTION +#if defined(CONFIG_PPC_BOOK3S_601) || defined(CONFIG_E200) PURGE_PREFETCHED_INS - blr /* for 601, do nothing */ -END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) + blr /* for 601 and e200, do nothing */ +#else mfmsr r10 rlwinm r0,r10,0,28,26 /* clear DR */ mtmsr r0 @@ -431,6 +433,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) mtmsr r10 /* restore DR */ isync blr +#endif #endif /* CONFIG_BOOKE */ /* -- cgit v1.2.3 From facd04a904ff6cdc6ee85d6e85d500f478a1bec4 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 27 Aug 2019 13:30:06 +1000 Subject: powerpc: convert to copy_thread_tls Commit 3033f14ab78c3 ("clone: support passing tls argument via C rather than pt_regs magic") introduced the HAVE_COPY_THREAD_TLS option. Use it to avoid a subtle assumption about the argument ordering of clone type syscalls. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190827033010.28090-2-npiggin@gmail.com --- arch/powerpc/Kconfig | 1 + arch/powerpc/kernel/process.c | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3d65084f4066..cfb21856559b 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -184,6 +184,7 @@ config PPC select HAVE_STACKPROTECTOR if PPC64 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r13) select HAVE_STACKPROTECTOR if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2) select HAVE_CONTEXT_TRACKING if PPC64 + select HAVE_COPY_THREAD_TLS select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW select HAVE_DYNAMIC_FTRACE diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 8fc4de0d22b4..24621e7e5033 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1600,8 +1600,9 @@ static void setup_ksp_vsid(struct task_struct *p, unsigned long sp) /* * Copy architecture-specific thread state */ -int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long kthread_arg, struct task_struct *p) +int copy_thread_tls(unsigned long clone_flags, unsigned long usp, + unsigned long kthread_arg, struct task_struct *p, + unsigned long tls) { struct pt_regs *childregs, *kregs; extern void ret_from_fork(void); @@ -1642,10 +1643,10 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, if (clone_flags & CLONE_SETTLS) { #ifdef CONFIG_PPC64 if (!is_32bit_task()) - childregs->gpr[13] = childregs->gpr[6]; + childregs->gpr[13] = tls; else #endif - childregs->gpr[2] = childregs->gpr[6]; + childregs->gpr[2] = tls; } f = ret_from_fork; -- cgit v1.2.3 From 555e28179d37e431e97d78d106fc917bec2c6f93 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 27 Aug 2019 13:30:07 +1000 Subject: powerpc/64: remove support for kernel-mode syscalls There is support for the kernel to execute the 'sc 0' instruction and make a system call to itself. This is a relic that is unused in the tree, therefore untested. It's also highly questionable for modules to be doing this. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190827033010.28090-3-npiggin@gmail.com --- arch/powerpc/kernel/entry_64.S | 21 ++++++--------------- arch/powerpc/kernel/exceptions-64s.S | 2 -- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 0a0b5310f54a..6467bdab8d40 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -69,24 +69,20 @@ BEGIN_FTR_SECTION bne .Ltabort_syscall END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif - andi. r10,r12,MSR_PR mr r10,r1 - addi r1,r1,-INT_FRAME_SIZE - beq- 1f ld r1,PACAKSAVE(r13) -1: std r10,0(r1) + std r10,0(r1) std r11,_NIP(r1) std r12,_MSR(r1) std r0,GPR0(r1) std r10,GPR1(r1) - beq 2f /* if from kernel mode */ #ifdef CONFIG_PPC_FSL_BOOK3E START_BTB_FLUSH_SECTION BTB_FLUSH(r10) END_BTB_FLUSH_SECTION #endif ACCOUNT_CPU_USER_ENTRY(r13, r10, r11) -2: std r2,GPR2(r1) + std r2,GPR2(r1) std r3,GPR3(r1) mfcr r2 std r4,GPR4(r1) @@ -122,14 +118,13 @@ END_BTB_FLUSH_SECTION #if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC_SPLPAR) BEGIN_FW_FTR_SECTION - beq 33f - /* if from user, see if there are any DTL entries to process */ + /* see if there are any DTL entries to process */ ld r10,PACALPPACAPTR(r13) /* get ptr to VPA */ ld r11,PACA_DTL_RIDX(r13) /* get log read index */ addi r10,r10,LPPACA_DTLIDX LDX_BE r10,0,r10 /* get log write index */ - cmpd cr1,r11,r10 - beq+ cr1,33f + cmpd r11,r10 + beq+ 33f bl accumulate_stolen_time REST_GPR(0,r1) REST_4GPRS(3,r1) @@ -203,6 +198,7 @@ system_call: /* label this so stack traces look sane */ mtctr r12 bctrl /* Call handler */ + /* syscall_exit can exit to kernel mode, via ret_from_kernel_thread */ .Lsyscall_exit: std r3,RESULT(r1) @@ -216,11 +212,6 @@ system_call: /* label this so stack traces look sane */ ld r12, PACA_THREAD_INFO(r13) ld r8,_MSR(r1) -#ifdef CONFIG_PPC_BOOK3S - /* No MSR:RI on BookE */ - andi. r10,r8,MSR_RI - beq- .Lunrecov_restore -#endif /* * This is a few instructions into the actual syscall exit path (which actually diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index dded4672579d..520804351601 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1522,8 +1522,6 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) * system call / hypercall (0xc00, 0x4c00) * * The system call exception is invoked with "sc 0" and does not alter HV bit. - * There is support for kernel code to invoke system calls but there are no - * in-tree users. * * The hypercall is invoked with "sc 1" and sets HV=1. * -- cgit v1.2.3 From f2902a2fb40c589b886d21518ef8a1ee87f76b0c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Aug 2019 15:22:30 +0200 Subject: powerpc: use the generic dma coherent remap allocator This switches to using common code for the DMA allocations, including potential use of the CMA allocator if configured. Switching to the generic code enables DMA allocations from atomic context, which is required by the DMA API documentation, and also adds various other minor features drivers start relying upon. It also makes sure we have on tested code base for all architectures that require uncached pte bits for coherent DMA allocations. Another advantage is that consistent memory allocations now share the general vmalloc pool instead of needing an explicit careout from it. Signed-off-by: Christoph Hellwig Tested-by: Christophe Leroy # tested on 8xx Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190814132230.31874-2-hch@lst.de --- arch/powerpc/Kconfig | 12 - arch/powerpc/include/asm/book3s/32/pgtable.h | 12 +- arch/powerpc/include/asm/nohash/32/pgtable.h | 12 +- arch/powerpc/mm/dma-noncoherent.c | 318 +-------------------------- arch/powerpc/mm/mem.c | 4 - arch/powerpc/mm/ptdump/ptdump.c | 9 - arch/powerpc/platforms/Kconfig.cputype | 2 + 7 files changed, 17 insertions(+), 352 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index cfb21856559b..edc19363a729 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -1141,18 +1141,6 @@ config TASK_SIZE default "0x80000000" if PPC_8xx default "0xc0000000" -config CONSISTENT_SIZE_BOOL - bool "Set custom consistent memory pool size" - depends on ADVANCED_OPTIONS && NOT_COHERENT_CACHE - help - This option allows you to set the size of the - consistent memory pool. This pool of virtual memory - is used to make consistent memory allocations. - -config CONSISTENT_SIZE - hex "Size of consistent memory pool" if CONSISTENT_SIZE_BOOL - default "0x00200000" if NOT_COHERENT_CACHE - config PIN_TLB bool "Pinned Kernel TLBs (860 ONLY)" depends on ADVANCED_OPTIONS && PPC_8xx && \ diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 331a29a501a1..0796533d37dd 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -148,21 +148,15 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); */ #include -#ifdef CONFIG_HIGHMEM -#define KVIRT_TOP PKMAP_BASE -#else -#define KVIRT_TOP FIXADDR_START -#endif - /* * ioremap_bot starts at that address. Early ioremaps move down from there, * until mem_init() at which point this becomes the top of the vmalloc * and ioremap space */ -#ifdef CONFIG_NOT_COHERENT_CACHE -#define IOREMAP_TOP ((KVIRT_TOP - CONFIG_CONSISTENT_SIZE) & PAGE_MASK) +#ifdef CONFIG_HIGHMEM +#define IOREMAP_TOP PKMAP_BASE #else -#define IOREMAP_TOP KVIRT_TOP +#define IOREMAP_TOP FIXADDR_START #endif /* PPC32 shares vmalloc area with ioremap */ diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 3e1a4c1e40f0..552b96eef0c8 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -76,21 +76,15 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); */ #include -#ifdef CONFIG_HIGHMEM -#define KVIRT_TOP PKMAP_BASE -#else -#define KVIRT_TOP FIXADDR_START -#endif - /* * ioremap_bot starts at that address. Early ioremaps move down from there, * until mem_init() at which point this becomes the top of the vmalloc * and ioremap space */ -#ifdef CONFIG_NOT_COHERENT_CACHE -#define IOREMAP_TOP ((KVIRT_TOP - CONFIG_CONSISTENT_SIZE) & PAGE_MASK) +#ifdef CONFIG_HIGHMEM +#define IOREMAP_TOP PKMAP_BASE #else -#define IOREMAP_TOP KVIRT_TOP +#define IOREMAP_TOP FIXADDR_START #endif /* PPC32 shares vmalloc area with ioremap */ diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index c617282d5b2a..4272ca5e8159 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -4,310 +4,18 @@ * Copyright (C) 2001 Dan Malek (dmalek@jlc.net) * * Copyright (C) 2000 Russell King - * - * Consistent memory allocators. Used for DMA devices that want to - * share uncached memory with the processor core. The function return - * is the virtual address and 'dma_handle' is the physical address. - * Mostly stolen from the ARM port, with some changes for PowerPC. - * -- Dan - * - * Reorganized to get rid of the arch-specific consistent_* functions - * and provide non-coherent implementations for the DMA API. -Matt - * - * Added in_interrupt() safe dma_alloc_coherent()/dma_free_coherent() - * implementation. This is pulled straight from ARM and barely - * modified. -Matt */ -#include -#include #include #include -#include #include #include #include #include -#include #include #include -#include - -/* - * This address range defaults to a value that is safe for all - * platforms which currently set CONFIG_NOT_COHERENT_CACHE. It - * can be further configured for specific applications under - * the "Advanced Setup" menu. -Matt - */ -#define CONSISTENT_BASE (IOREMAP_TOP) -#define CONSISTENT_END (CONSISTENT_BASE + CONFIG_CONSISTENT_SIZE) -#define CONSISTENT_OFFSET(x) (((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT) - -/* - * This is the page table (2MB) covering uncached, DMA consistent allocations - */ -static DEFINE_SPINLOCK(consistent_lock); - -/* - * VM region handling support. - * - * This should become something generic, handling VM region allocations for - * vmalloc and similar (ioremap, module space, etc). - * - * I envisage vmalloc()'s supporting vm_struct becoming: - * - * struct vm_struct { - * struct vm_region region; - * unsigned long flags; - * struct page **pages; - * unsigned int nr_pages; - * unsigned long phys_addr; - * }; - * - * get_vm_area() would then call vm_region_alloc with an appropriate - * struct vm_region head (eg): - * - * struct vm_region vmalloc_head = { - * .vm_list = LIST_HEAD_INIT(vmalloc_head.vm_list), - * .vm_start = VMALLOC_START, - * .vm_end = VMALLOC_END, - * }; - * - * However, vmalloc_head.vm_start is variable (typically, it is dependent on - * the amount of RAM found at boot time.) I would imagine that get_vm_area() - * would have to initialise this each time prior to calling vm_region_alloc(). - */ -struct ppc_vm_region { - struct list_head vm_list; - unsigned long vm_start; - unsigned long vm_end; -}; - -static struct ppc_vm_region consistent_head = { - .vm_list = LIST_HEAD_INIT(consistent_head.vm_list), - .vm_start = CONSISTENT_BASE, - .vm_end = CONSISTENT_END, -}; - -static struct ppc_vm_region * -ppc_vm_region_alloc(struct ppc_vm_region *head, size_t size, gfp_t gfp) -{ - unsigned long addr = head->vm_start, end = head->vm_end - size; - unsigned long flags; - struct ppc_vm_region *c, *new; - - new = kmalloc(sizeof(struct ppc_vm_region), gfp); - if (!new) - goto out; - - spin_lock_irqsave(&consistent_lock, flags); - - list_for_each_entry(c, &head->vm_list, vm_list) { - if ((addr + size) < addr) - goto nospc; - if ((addr + size) <= c->vm_start) - goto found; - addr = c->vm_end; - if (addr > end) - goto nospc; - } - - found: - /* - * Insert this entry _before_ the one we found. - */ - list_add_tail(&new->vm_list, &c->vm_list); - new->vm_start = addr; - new->vm_end = addr + size; - - spin_unlock_irqrestore(&consistent_lock, flags); - return new; - - nospc: - spin_unlock_irqrestore(&consistent_lock, flags); - kfree(new); - out: - return NULL; -} - -static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsigned long addr) -{ - struct ppc_vm_region *c; - - list_for_each_entry(c, &head->vm_list, vm_list) { - if (c->vm_start == addr) - goto out; - } - c = NULL; - out: - return c; -} - -/* - * Allocate DMA-coherent memory space and return both the kernel remapped - * virtual and bus address for that space. - */ -void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, unsigned long attrs) -{ - struct page *page; - struct ppc_vm_region *c; - unsigned long order; - u64 mask = ISA_DMA_THRESHOLD, limit; - - if (dev) { - mask = dev->coherent_dma_mask; - - /* - * Sanity check the DMA mask - it must be non-zero, and - * must be able to be satisfied by a DMA allocation. - */ - if (mask == 0) { - dev_warn(dev, "coherent DMA mask is unset\n"); - goto no_page; - } - - if ((~mask) & ISA_DMA_THRESHOLD) { - dev_warn(dev, "coherent DMA mask %#llx is smaller " - "than system GFP_DMA mask %#llx\n", - mask, (unsigned long long)ISA_DMA_THRESHOLD); - goto no_page; - } - } - - - size = PAGE_ALIGN(size); - limit = (mask + 1) & ~mask; - if ((limit && size >= limit) || - size >= (CONSISTENT_END - CONSISTENT_BASE)) { - printk(KERN_WARNING "coherent allocation too big (requested %#x mask %#Lx)\n", - size, mask); - return NULL; - } - - order = get_order(size); - - /* Might be useful if we ever have a real legacy DMA zone... */ - if (mask != 0xffffffff) - gfp |= GFP_DMA; - - page = alloc_pages(gfp, order); - if (!page) - goto no_page; - - /* - * Invalidate any data that might be lurking in the - * kernel direct-mapped region for device DMA. - */ - { - unsigned long kaddr = (unsigned long)page_address(page); - memset(page_address(page), 0, size); - flush_dcache_range(kaddr, kaddr + size); - } - - /* - * Allocate a virtual address in the consistent mapping region. - */ - c = ppc_vm_region_alloc(&consistent_head, size, - gfp & ~(__GFP_DMA | __GFP_HIGHMEM)); - if (c) { - unsigned long vaddr = c->vm_start; - struct page *end = page + (1 << order); - - split_page(page, order); - - /* - * Set the "dma handle" - */ - *dma_handle = phys_to_dma(dev, page_to_phys(page)); - - do { - SetPageReserved(page); - map_kernel_page(vaddr, page_to_phys(page), - pgprot_noncached(PAGE_KERNEL)); - page++; - vaddr += PAGE_SIZE; - } while (size -= PAGE_SIZE); - - /* - * Free the otherwise unused pages. - */ - while (page < end) { - __free_page(page); - page++; - } - - return (void *)c->vm_start; - } - - if (page) - __free_pages(page, order); - no_page: - return NULL; -} - -/* - * free a page as defined by the above mapping. - */ -void arch_dma_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle, unsigned long attrs) -{ - struct ppc_vm_region *c; - unsigned long flags, addr; - - size = PAGE_ALIGN(size); - - spin_lock_irqsave(&consistent_lock, flags); - - c = ppc_vm_region_find(&consistent_head, (unsigned long)vaddr); - if (!c) - goto no_area; - - if ((c->vm_end - c->vm_start) != size) { - printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n", - __func__, c->vm_end - c->vm_start, size); - dump_stack(); - size = c->vm_end - c->vm_start; - } - - addr = c->vm_start; - do { - pte_t *ptep; - unsigned long pfn; - - ptep = pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(addr), - addr), - addr), - addr); - if (!pte_none(*ptep) && pte_present(*ptep)) { - pfn = pte_pfn(*ptep); - pte_clear(&init_mm, addr, ptep); - if (pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); - __free_reserved_page(page); - } - } - addr += PAGE_SIZE; - } while (size -= PAGE_SIZE); - - flush_tlb_kernel_range(c->vm_start, c->vm_end); - - list_del(&c->vm_list); - - spin_unlock_irqrestore(&consistent_lock, flags); - - kfree(c); - return; - - no_area: - spin_unlock_irqrestore(&consistent_lock, flags); - printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n", - __func__, vaddr); - dump_stack(); -} - /* * make an area consistent. */ @@ -408,23 +116,15 @@ void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, __dma_sync_page(paddr, size, dir); } -/* - * Return the PFN for a given cpu virtual address returned by arch_dma_alloc. - */ -long arch_dma_coherent_to_pfn(struct device *dev, void *vaddr, - dma_addr_t dma_addr) +void arch_dma_prep_coherent(struct page *page, size_t size) { - /* This should always be populated, so we don't test every - * level. If that fails, we'll have a nice crash which - * will be as good as a BUG_ON() - */ - unsigned long cpu_addr = (unsigned long)vaddr; - pgd_t *pgd = pgd_offset_k(cpu_addr); - pud_t *pud = pud_offset(pgd, cpu_addr); - pmd_t *pmd = pmd_offset(pud, cpu_addr); - pte_t *ptep = pte_offset_kernel(pmd, cpu_addr); + unsigned long kaddr = (unsigned long)page_address(page); - if (pte_none(*ptep) || !pte_present(*ptep)) - return 0; - return pte_pfn(*ptep); + flush_dcache_range(kaddr, kaddr + size); +} + +static int __init atomic_pool_init(void) +{ + return dma_atomic_pool_init(GFP_KERNEL, pgprot_noncached(PAGE_KERNEL)); } +postcore_initcall(atomic_pool_init); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 69f99128a8d6..be941d382c8d 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -302,10 +302,6 @@ void __init mem_init(void) pr_info(" * 0x%08lx..0x%08lx : highmem PTEs\n", PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP)); #endif /* CONFIG_HIGHMEM */ -#ifdef CONFIG_NOT_COHERENT_CACHE - pr_info(" * 0x%08lx..0x%08lx : consistent mem\n", - IOREMAP_TOP, IOREMAP_TOP + CONFIG_CONSISTENT_SIZE); -#endif /* CONFIG_NOT_COHERENT_CACHE */ if (ioremap_bot != IOREMAP_TOP) pr_info(" * 0x%08lx..0x%08lx : early ioremap\n", ioremap_bot, IOREMAP_TOP); diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index ab6a572202b4..2f9ddc29c535 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -84,10 +84,6 @@ static struct addr_marker address_markers[] = { #else { 0, "Early I/O remap start" }, { 0, "Early I/O remap end" }, -#ifdef CONFIG_NOT_COHERENT_CACHE - { 0, "Consistent mem start" }, - { 0, "Consistent mem end" }, -#endif #ifdef CONFIG_HIGHMEM { 0, "Highmem PTEs start" }, { 0, "Highmem PTEs end" }, @@ -335,11 +331,6 @@ static void populate_markers(void) #else /* !CONFIG_PPC64 */ address_markers[i++].start_address = ioremap_bot; address_markers[i++].start_address = IOREMAP_TOP; -#ifdef CONFIG_NOT_COHERENT_CACHE - address_markers[i++].start_address = IOREMAP_TOP; - address_markers[i++].start_address = IOREMAP_TOP + - CONFIG_CONSISTENT_SIZE; -#endif #ifdef CONFIG_HIGHMEM address_markers[i++].start_address = PKMAP_BASE; address_markers[i++].start_address = PKMAP_ADDR(LAST_PKMAP); diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 68c5cc075bfc..12543e53fa96 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -460,8 +460,10 @@ config NOT_COHERENT_CACHE depends on 4xx || PPC_8xx || E200 || PPC_MPC512x || \ GAMECUBE_COMMON || AMIGAONE select ARCH_HAS_DMA_COHERENT_TO_PFN + select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_HAS_SYNC_DMA_FOR_CPU + select DMA_DIRECT_REMAP default n if PPC_47x default y -- cgit v1.2.3 From bc605cd79edb68131d3be5b00b949aa312277d39 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 29 Aug 2019 18:44:17 +1000 Subject: powerpc/of/pci: Rewrite pci_parse_of_flags The existing code uses bunch of hardcoded values from the PCI Bus Binding to IEEE Std 1275 spec; and it does so in quite non-obvious way. This defines fields from the cell#0 of the "reg" property of a PCI device and uses them for parsing. This should cause no behavioral change. Signed-off-by: Alexey Kardashevskiy [mpe: Unsplit some 80/81 char lines, space the code with some newlines] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190829084417.71873-1-aik@ozlabs.ru --- arch/powerpc/kernel/pci_of_scan.c | 66 ++++++++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c index 409c6c1beabf..f91d7e94872e 100644 --- a/arch/powerpc/kernel/pci_of_scan.c +++ b/arch/powerpc/kernel/pci_of_scan.c @@ -34,31 +34,75 @@ static u32 get_int_prop(struct device_node *np, const char *name, u32 def) * pci_parse_of_flags - Parse the flags cell of a device tree PCI address * @addr0: value of 1st cell of a device tree PCI address. * @bridge: Set this flag if the address is from a bridge 'ranges' property + * + * PCI Bus Binding to IEEE Std 1275-1994 + * + * Bit# 33222222 22221111 11111100 00000000 + * 10987654 32109876 54321098 76543210 + * phys.hi cell: npt000ss bbbbbbbb dddddfff rrrrrrrr + * phys.mid cell: hhhhhhhh hhhhhhhh hhhhhhhh hhhhhhhh + * phys.lo cell: llllllll llllllll llllllll llllllll + * + * where: + * n is 0 if the address is relocatable, 1 otherwise + * p is 1 if the addressable region is "prefetchable", 0 otherwise + * t is 1 if the address is aliased (for non-relocatable I/O), + * below 1 MB (for Memory),or below 64 KB (for relocatable I/O). + * ss is the space code, denoting the address space: + * 00 denotes Configuration Space + * 01 denotes I/O Space + * 10 denotes 32-bit-address Memory Space + * 11 denotes 64-bit-address Memory Space + * bbbbbbbb is the 8-bit Bus Number + * ddddd is the 5-bit Device Number + * fff is the 3-bit Function Number + * rrrrrrrr is the 8-bit Register Number */ +#define OF_PCI_ADDR0_SPACE(ss) (((ss)&3)<<24) +#define OF_PCI_ADDR0_SPACE_CFG OF_PCI_ADDR0_SPACE(0) +#define OF_PCI_ADDR0_SPACE_IO OF_PCI_ADDR0_SPACE(1) +#define OF_PCI_ADDR0_SPACE_MMIO32 OF_PCI_ADDR0_SPACE(2) +#define OF_PCI_ADDR0_SPACE_MMIO64 OF_PCI_ADDR0_SPACE(3) +#define OF_PCI_ADDR0_SPACE_MASK OF_PCI_ADDR0_SPACE(3) +#define OF_PCI_ADDR0_RELOC (1UL<<31) +#define OF_PCI_ADDR0_PREFETCH (1UL<<30) +#define OF_PCI_ADDR0_ALIAS (1UL<<29) +#define OF_PCI_ADDR0_BUS 0x00FF0000UL +#define OF_PCI_ADDR0_DEV 0x0000F800UL +#define OF_PCI_ADDR0_FN 0x00000700UL +#define OF_PCI_ADDR0_BARREG 0x000000FFUL + unsigned int pci_parse_of_flags(u32 addr0, int bridge) { - unsigned int flags = 0; + unsigned int flags = 0, as = addr0 & OF_PCI_ADDR0_SPACE_MASK; - if (addr0 & 0x02000000) { + if (as == OF_PCI_ADDR0_SPACE_MMIO32 || as == OF_PCI_ADDR0_SPACE_MMIO64) { flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY; - flags |= (addr0 >> 22) & PCI_BASE_ADDRESS_MEM_TYPE_64; - if (flags & PCI_BASE_ADDRESS_MEM_TYPE_64) - flags |= IORESOURCE_MEM_64; - flags |= (addr0 >> 28) & PCI_BASE_ADDRESS_MEM_TYPE_1M; - if (addr0 & 0x40000000) - flags |= IORESOURCE_PREFETCH - | PCI_BASE_ADDRESS_MEM_PREFETCH; + + if (as == OF_PCI_ADDR0_SPACE_MMIO64) + flags |= PCI_BASE_ADDRESS_MEM_TYPE_64 | IORESOURCE_MEM_64; + + if (addr0 & OF_PCI_ADDR0_ALIAS) + flags |= PCI_BASE_ADDRESS_MEM_TYPE_1M; + + if (addr0 & OF_PCI_ADDR0_PREFETCH) + flags |= IORESOURCE_PREFETCH | + PCI_BASE_ADDRESS_MEM_PREFETCH; + /* Note: We don't know whether the ROM has been left enabled * by the firmware or not. We mark it as disabled (ie, we do * not set the IORESOURCE_ROM_ENABLE flag) for now rather than * do a config space read, it will be force-enabled if needed */ - if (!bridge && (addr0 & 0xff) == 0x30) + if (!bridge && (addr0 & OF_PCI_ADDR0_BARREG) == PCI_ROM_ADDRESS) flags |= IORESOURCE_READONLY; - } else if (addr0 & 0x01000000) + + } else if (as == OF_PCI_ADDR0_SPACE_IO) flags = IORESOURCE_IO | PCI_BASE_ADDRESS_SPACE_IO; + if (flags) flags |= IORESOURCE_SIZEALIGN; + return flags; } -- cgit v1.2.3 From 4f916593be9da38c5cf0d3a5c386b57beb70f422 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 26 Aug 2019 14:55:20 +1000 Subject: KVM: PPC: Book3S: Fix incorrect guest-to-user-translation error handling H_PUT_TCE_INDIRECT handlers receive a page with up to 512 TCEs from a guest. Although we verify correctness of TCEs before we do anything with the existing tables, there is a small window when a check in kvmppc_tce_validate might pass and right after that the guest alters the page with TCEs which can cause early exit from the handler and leave srcu_read_lock(&vcpu->kvm->srcu) (virtual mode) or lock_rmap(rmap) (real mode) locked. This fixes the bug by jumping to the common exit code with an appropriate unlock. Fixes: 121f80ba68f1 ("KVM: PPC: VFIO: Add in-kernel acceleration for VFIO") Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190826045520.92153-1-aik@ozlabs.ru --- arch/powerpc/kvm/book3s_64_vio.c | 6 ++++-- arch/powerpc/kvm/book3s_64_vio_hv.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index e99a14798ab0..c4b606fe73eb 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -660,8 +660,10 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, } tce = be64_to_cpu(tce); - if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) - return H_PARAMETER; + if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) { + ret = H_PARAMETER; + goto unlock_exit; + } list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index f50bbeedfc66..b4f20f13b860 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -556,8 +556,10 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, unsigned long tce = be64_to_cpu(((u64 *)tces)[i]); ua = 0; - if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) - return H_PARAMETER; + if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) { + ret = H_PARAMETER; + goto unlock_exit; + } list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt, -- cgit v1.2.3 From 35872480da47ec714fd9c4f2f3d2d83daf304851 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 29 Aug 2019 18:52:48 +1000 Subject: powerpc/powernv/ioda: Split out TCE invalidation from TCE updates At the moment updates in a TCE table are made by iommu_table_ops::exchange which update one TCE and invalidates an entry in the PHB/NPU TCE cache via set of registers called "TCE Kill" (hence the naming). Writing a TCE is a simple xchg() but invalidating the TCE cache is a relatively expensive OPAL call. Mapping a 100GB guest with PCI+NPU passed through devices takes about 20s. Thankfully we can do better. Since such big mappings happen at the boot time and when memory is plugged/onlined (i.e. not often), these requests come in 512 pages so we call call OPAL 512 times less which brings 20s from the above to less than 10s. Also, since TCE caches can be flushed entirely, calling OPAL for 512 TCEs helps skiboot [1] to decide whether to flush the entire cache or not. This implements 2 new iommu_table_ops callbacks: - xchg_no_kill() to update a single TCE with no TCE invalidation; - tce_kill() to invalidate multiple TCEs. This uses the same xchg_no_kill() callback for IODA1/2. This implements 2 new wrappers on top of the new callbacks similar to the existing iommu_tce_xchg(). This does not use the new callbacks yet, the next patches will; so this should not cause any behavioral change. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190829085252.72370-2-aik@ozlabs.ru --- arch/powerpc/include/asm/iommu.h | 17 +++++++++++++++++ arch/powerpc/kernel/iommu.c | 27 +++++++++++++++++++++++++++ arch/powerpc/platforms/powernv/pci-ioda.c | 12 ++++++++++++ 3 files changed, 56 insertions(+) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 18d342b815e4..babddede9245 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -58,6 +58,17 @@ struct iommu_table_ops { unsigned long *hpa, enum dma_data_direction *direction); + int (*xchg_no_kill)(struct iommu_table *tbl, + long index, + unsigned long *hpa, + enum dma_data_direction *direction, + bool realmode); + + void (*tce_kill)(struct iommu_table *tbl, + unsigned long index, + unsigned long pages, + bool realmode); + __be64 *(*useraddrptr)(struct iommu_table *tbl, long index, bool alloc); #endif void (*clear)(struct iommu_table *tbl, @@ -206,6 +217,12 @@ extern void iommu_del_device(struct device *dev); extern long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl, unsigned long entry, unsigned long *hpa, enum dma_data_direction *direction); +extern long iommu_tce_xchg_no_kill(struct mm_struct *mm, + struct iommu_table *tbl, + unsigned long entry, unsigned long *hpa, + enum dma_data_direction *direction); +extern void iommu_tce_kill(struct iommu_table *tbl, + unsigned long entry, unsigned long pages); #else static inline void iommu_register_group(struct iommu_table_group *table_group, int pci_domain_number, diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 0a67ce9f827e..145f29cf7e4c 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1005,6 +1005,33 @@ long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl, } EXPORT_SYMBOL_GPL(iommu_tce_xchg); +extern long iommu_tce_xchg_no_kill(struct mm_struct *mm, + struct iommu_table *tbl, + unsigned long entry, unsigned long *hpa, + enum dma_data_direction *direction) +{ + long ret; + unsigned long size = 0; + + ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction, false); + if (!ret && ((*direction == DMA_FROM_DEVICE) || + (*direction == DMA_BIDIRECTIONAL)) && + !mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift, + &size)) + SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT)); + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_tce_xchg_no_kill); + +void iommu_tce_kill(struct iommu_table *tbl, + unsigned long entry, unsigned long pages) +{ + if (tbl->it_ops->tce_kill) + tbl->it_ops->tce_kill(tbl, entry, pages, false); +} +EXPORT_SYMBOL_GPL(iommu_tce_kill); + int iommu_take_ownership(struct iommu_table *tbl) { unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index d8080558d020..911f96abed89 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1939,6 +1939,14 @@ static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index, } #ifdef CONFIG_IOMMU_API +/* Common for IODA1 and IODA2 */ +static int pnv_ioda_tce_xchg_no_kill(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction, + bool realmode) +{ + return pnv_tce_xchg(tbl, index, hpa, direction, !realmode); +} + static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index, unsigned long *hpa, enum dma_data_direction *direction) { @@ -1975,6 +1983,8 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = { #ifdef CONFIG_IOMMU_API .exchange = pnv_ioda1_tce_xchg, .exchange_rm = pnv_ioda1_tce_xchg_rm, + .xchg_no_kill = pnv_ioda_tce_xchg_no_kill, + .tce_kill = pnv_pci_p7ioc_tce_invalidate, .useraddrptr = pnv_tce_useraddrptr, #endif .clear = pnv_ioda1_tce_free, @@ -2140,6 +2150,8 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = { #ifdef CONFIG_IOMMU_API .exchange = pnv_ioda2_tce_xchg, .exchange_rm = pnv_ioda2_tce_xchg_rm, + .xchg_no_kill = pnv_ioda_tce_xchg_no_kill, + .tce_kill = pnv_pci_ioda2_tce_invalidate, .useraddrptr = pnv_tce_useraddrptr, #endif .clear = pnv_ioda2_tce_free, -- cgit v1.2.3 From 01b7d128b5a7f0d09626c090093ff44155f347c9 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 29 Aug 2019 18:52:49 +1000 Subject: KVM: PPC: Book3S: Invalidate multiple TCEs at once Invalidating a TCE cache entry for each updated TCE is quite expensive. This makes use of the new iommu_table_ops::xchg_no_kill()/tce_kill() callbacks to bring down the time spent in mapping a huge guest DMA window; roughly 20s to 10s for each guest's 100GB of DMA space. Signed-off-by: Alexey Kardashevskiy Acked-by: Paul Mackerras Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190829085252.72370-3-aik@ozlabs.ru --- arch/powerpc/kvm/book3s_64_vio.c | 29 +++++++++++++++++++--------- arch/powerpc/kvm/book3s_64_vio_hv.c | 38 +++++++++++++++++++++++++++---------- 2 files changed, 48 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index c4b606fe73eb..5834db0a54c6 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -416,7 +416,7 @@ static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl, unsigned long hpa = 0; enum dma_data_direction dir = DMA_NONE; - iommu_tce_xchg(mm, tbl, entry, &hpa, &dir); + iommu_tce_xchg_no_kill(mm, tbl, entry, &hpa, &dir); } static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm, @@ -447,7 +447,8 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm, unsigned long hpa = 0; long ret; - if (WARN_ON_ONCE(iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir))) + if (WARN_ON_ONCE(iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa, + &dir))) return H_TOO_HARD; if (dir == DMA_NONE) @@ -455,7 +456,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm, ret = kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry); if (ret != H_SUCCESS) - iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir); + iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa, &dir); return ret; } @@ -501,7 +502,7 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, if (mm_iommu_mapped_inc(mem)) return H_TOO_HARD; - ret = iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir); + ret = iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa, &dir); if (WARN_ON_ONCE(ret)) { mm_iommu_mapped_dec(mem); return H_TOO_HARD; @@ -579,6 +580,8 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl, entry, ua, dir); + iommu_tce_kill(stit->tbl, entry, 1); + if (ret != H_SUCCESS) { kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry); goto unlock_exit; @@ -656,13 +659,13 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, */ if (get_user(tce, tces + i)) { ret = H_TOO_HARD; - goto unlock_exit; + goto invalidate_exit; } tce = be64_to_cpu(tce); if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) { ret = H_PARAMETER; - goto unlock_exit; + goto invalidate_exit; } list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { @@ -673,13 +676,17 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (ret != H_SUCCESS) { kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry); - goto unlock_exit; + goto invalidate_exit; } } kvmppc_tce_put(stt, entry + i, tce); } +invalidate_exit: + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) + iommu_tce_kill(stit->tbl, entry, npages); + unlock_exit: srcu_read_unlock(&vcpu->kvm->srcu, idx); @@ -718,7 +725,7 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, continue; if (ret == H_TOO_HARD) - return ret; + goto invalidate_exit; WARN_ON_ONCE(1); kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry); @@ -728,6 +735,10 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); - return H_SUCCESS; +invalidate_exit: + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) + iommu_tce_kill(stit->tbl, ioba >> stt->page_shift, npages); + + return ret; } EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index b4f20f13b860..ab6eeb8e753e 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -218,13 +218,14 @@ static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt, return H_SUCCESS; } -static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl, +static long iommu_tce_xchg_no_kill_rm(struct mm_struct *mm, + struct iommu_table *tbl, unsigned long entry, unsigned long *hpa, enum dma_data_direction *direction) { long ret; - ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction); + ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction, true); if (!ret && ((*direction == DMA_FROM_DEVICE) || (*direction == DMA_BIDIRECTIONAL))) { @@ -240,13 +241,20 @@ static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl, return ret; } +extern void iommu_tce_kill_rm(struct iommu_table *tbl, + unsigned long entry, unsigned long pages) +{ + if (tbl->it_ops->tce_kill) + tbl->it_ops->tce_kill(tbl, entry, pages, true); +} + static void kvmppc_rm_clear_tce(struct kvm *kvm, struct iommu_table *tbl, unsigned long entry) { unsigned long hpa = 0; enum dma_data_direction dir = DMA_NONE; - iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); + iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir); } static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, @@ -278,7 +286,7 @@ static long kvmppc_rm_tce_iommu_do_unmap(struct kvm *kvm, unsigned long hpa = 0; long ret; - if (iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir)) + if (iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir)) /* * real mode xchg can fail if struct page crosses * a page boundary @@ -290,7 +298,7 @@ static long kvmppc_rm_tce_iommu_do_unmap(struct kvm *kvm, ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); if (ret) - iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); + iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir); return ret; } @@ -336,7 +344,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) return H_TOO_HARD; - ret = iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); + ret = iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir); if (ret) { mm_iommu_mapped_dec(mem); /* @@ -417,6 +425,8 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt, stit->tbl, entry, ua, dir); + iommu_tce_kill_rm(stit->tbl, entry, 1); + if (ret != H_SUCCESS) { kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); return ret; @@ -558,7 +568,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, ua = 0; if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) { ret = H_PARAMETER; - goto unlock_exit; + goto invalidate_exit; } list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { @@ -569,13 +579,17 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (ret != H_SUCCESS) { kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); - goto unlock_exit; + goto invalidate_exit; } } kvmppc_rm_tce_put(stt, entry + i, tce); } +invalidate_exit: + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) + iommu_tce_kill_rm(stit->tbl, entry, npages); + unlock_exit: if (rmap) unlock_rmap(rmap); @@ -618,7 +632,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, continue; if (ret == H_TOO_HARD) - return ret; + goto invalidate_exit; WARN_ON_ONCE_RM(1); kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); @@ -628,7 +642,11 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_rm_tce_put(stt, ioba >> stt->page_shift, tce_value); - return H_SUCCESS; +invalidate_exit: + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) + iommu_tce_kill_rm(stit->tbl, ioba >> stt->page_shift, npages); + + return ret; } /* This can be called in either virtual mode or real mode */ -- cgit v1.2.3 From 650ab1e370cdb61ba5c7495006f4376e02374da0 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 29 Aug 2019 18:52:50 +1000 Subject: vfio/spapr_tce: Invalidate multiple TCEs at once Invalidating a TCE cache entry for each updated TCE is quite expensive. This makes use of the new iommu_table_ops::xchg_no_kill()/tce_kill() callbacks to bring down the time spent in mapping a huge guest DMA window. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190829085252.72370-4-aik@ozlabs.ru --- drivers/vfio/vfio_iommu_spapr_tce.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 8ce9ad21129f..9809369e0ed3 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -435,7 +435,7 @@ static int tce_iommu_clear(struct tce_container *container, unsigned long oldhpa; long ret; enum dma_data_direction direction; - unsigned long lastentry = entry + pages; + unsigned long lastentry = entry + pages, firstentry = entry; for ( ; entry < lastentry; ++entry) { if (tbl->it_indirect_levels && tbl->it_userspace) { @@ -460,7 +460,7 @@ static int tce_iommu_clear(struct tce_container *container, direction = DMA_NONE; oldhpa = 0; - ret = iommu_tce_xchg(container->mm, tbl, entry, &oldhpa, + ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry, &oldhpa, &direction); if (ret) continue; @@ -476,6 +476,8 @@ static int tce_iommu_clear(struct tce_container *container, tce_iommu_unuse_page(container, oldhpa); } + iommu_tce_kill(tbl, firstentry, pages); + return 0; } @@ -518,8 +520,8 @@ static long tce_iommu_build(struct tce_container *container, hpa |= offset; dirtmp = direction; - ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa, - &dirtmp); + ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i, + &hpa, &dirtmp); if (ret) { tce_iommu_unuse_page(container, hpa); pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n", @@ -536,6 +538,8 @@ static long tce_iommu_build(struct tce_container *container, if (ret) tce_iommu_clear(container, tbl, entry, i); + else + iommu_tce_kill(tbl, entry, pages); return ret; } @@ -572,8 +576,8 @@ static long tce_iommu_build_v2(struct tce_container *container, if (mm_iommu_mapped_inc(mem)) break; - ret = iommu_tce_xchg(container->mm, tbl, entry + i, &hpa, - &dirtmp); + ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i, + &hpa, &dirtmp); if (ret) { /* dirtmp cannot be DMA_NONE here */ tce_iommu_unuse_page_v2(container, tbl, entry + i); @@ -593,6 +597,8 @@ static long tce_iommu_build_v2(struct tce_container *container, if (ret) tce_iommu_clear(container, tbl, entry, i); + else + iommu_tce_kill(tbl, entry, pages); return ret; } -- cgit v1.2.3 From 021b7868113cdca56297f2356d0f6fe89985c285 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 29 Aug 2019 18:52:51 +1000 Subject: powerpc/pseries/iommu: Switch to xchg_no_kill This is the last implementation of iommu_table_ops::exchange() which we are about to remove. This implements xchg_no_kill() for pseries. Since it is paravirtual platform, the hypervisor does TCE invalidations and we do not have to deal with it here, hence no tce_kill() hook. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190829085252.72370-5-aik@ozlabs.ru --- arch/powerpc/platforms/pseries/iommu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 889dc2e44b89..a22ef9c7de2e 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -621,7 +621,8 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) #ifdef CONFIG_IOMMU_API static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned - long *tce, enum dma_data_direction *direction) + long *tce, enum dma_data_direction *direction, + bool realmode) { long rc; unsigned long ioba = (unsigned long) index << tbl->it_page_shift; @@ -649,7 +650,7 @@ static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned struct iommu_table_ops iommu_table_lpar_multi_ops = { .set = tce_buildmulti_pSeriesLP, #ifdef CONFIG_IOMMU_API - .exchange = tce_exchange_pseries, + .xchg_no_kill = tce_exchange_pseries, #endif .clear = tce_freemulti_pSeriesLP, .get = tce_get_pSeriesLP -- cgit v1.2.3 From a102f139aac54689eeb05883952742ae780159f3 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 29 Aug 2019 18:52:52 +1000 Subject: powerpc/powernv/ioda: Remove obsolete iommu_table_ops::exchange callbacks As now we have xchg_no_kill/tce_kill, these are not used anymore so remove them. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190829085252.72370-6-aik@ozlabs.ru --- arch/powerpc/include/asm/iommu.h | 10 ------- arch/powerpc/kernel/iommu.c | 26 +--------------- arch/powerpc/platforms/powernv/pci-ioda.c | 50 ------------------------------- 3 files changed, 1 insertion(+), 85 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index babddede9245..9ed0206cd98c 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -48,16 +48,6 @@ struct iommu_table_ops { * returns old TCE and DMA direction mask. * @tce is a physical address. */ - int (*exchange)(struct iommu_table *tbl, - long index, - unsigned long *hpa, - enum dma_data_direction *direction); - /* Real mode */ - int (*exchange_rm)(struct iommu_table *tbl, - long index, - unsigned long *hpa, - enum dma_data_direction *direction); - int (*xchg_no_kill)(struct iommu_table *tbl, long index, unsigned long *hpa, diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 145f29cf7e4c..bf803000e4b3 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -981,30 +981,6 @@ int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa) } EXPORT_SYMBOL_GPL(iommu_tce_check_gpa); -long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl, - unsigned long entry, unsigned long *hpa, - enum dma_data_direction *direction) -{ - long ret; - unsigned long size = 0; - - ret = tbl->it_ops->exchange(tbl, entry, hpa, direction); - - if (!ret && ((*direction == DMA_FROM_DEVICE) || - (*direction == DMA_BIDIRECTIONAL)) && - !mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift, - &size)) - SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT)); - - /* if (unlikely(ret)) - pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n", - __func__, hwaddr, entry << tbl->it_page_shift, - hwaddr, ret); */ - - return ret; -} -EXPORT_SYMBOL_GPL(iommu_tce_xchg); - extern long iommu_tce_xchg_no_kill(struct mm_struct *mm, struct iommu_table *tbl, unsigned long entry, unsigned long *hpa, @@ -1044,7 +1020,7 @@ int iommu_take_ownership(struct iommu_table *tbl) * requires exchange() callback defined so if it is not * implemented, we disallow taking ownership over the table. */ - if (!tbl->it_ops->exchange) + if (!tbl->it_ops->xchg_no_kill) return -EINVAL; spin_lock_irqsave(&tbl->large_pool.lock, flags); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 911f96abed89..22a51aa75796 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1946,28 +1946,6 @@ static int pnv_ioda_tce_xchg_no_kill(struct iommu_table *tbl, long index, { return pnv_tce_xchg(tbl, index, hpa, direction, !realmode); } - -static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction) -{ - long ret = pnv_tce_xchg(tbl, index, hpa, direction, true); - - if (!ret) - pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false); - - return ret; -} - -static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction) -{ - long ret = pnv_tce_xchg(tbl, index, hpa, direction, false); - - if (!ret) - pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true); - - return ret; -} #endif static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index, @@ -1981,8 +1959,6 @@ static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index, static struct iommu_table_ops pnv_ioda1_iommu_ops = { .set = pnv_ioda1_tce_build, #ifdef CONFIG_IOMMU_API - .exchange = pnv_ioda1_tce_xchg, - .exchange_rm = pnv_ioda1_tce_xchg_rm, .xchg_no_kill = pnv_ioda_tce_xchg_no_kill, .tce_kill = pnv_pci_p7ioc_tce_invalidate, .useraddrptr = pnv_tce_useraddrptr, @@ -2113,30 +2089,6 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, return ret; } -#ifdef CONFIG_IOMMU_API -static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction) -{ - long ret = pnv_tce_xchg(tbl, index, hpa, direction, true); - - if (!ret) - pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false); - - return ret; -} - -static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction) -{ - long ret = pnv_tce_xchg(tbl, index, hpa, direction, false); - - if (!ret) - pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true); - - return ret; -} -#endif - static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, long npages) { @@ -2148,8 +2100,6 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, static struct iommu_table_ops pnv_ioda2_iommu_ops = { .set = pnv_ioda2_tce_build, #ifdef CONFIG_IOMMU_API - .exchange = pnv_ioda2_tce_xchg, - .exchange_rm = pnv_ioda2_tce_xchg_rm, .xchg_no_kill = pnv_ioda_tce_xchg_no_kill, .tce_kill = pnv_pci_ioda2_tce_invalidate, .useraddrptr = pnv_tce_useraddrptr, -- cgit v1.2.3 From 70ed86f4de5bd74dd2d884dcd2f3275c4cfe665f Mon Sep 17 00:00:00 2001 From: Claudio Carvalho Date: Thu, 29 Aug 2019 12:50:20 -0300 Subject: powerpc: Add PowerPC Capabilities ELF note Add the PowerPC name and the PPC_ELFNOTE_CAPABILITIES type in the kernel binary ELF note. This type is a bitmap that can be used to advertise kernel capabilities to userland. This patch also defines PPCCAP_ULTRAVISOR_BIT as being the bit zero. Suggested-by: Paul Mackerras Signed-off-by: Claudio Carvalho [ maxiwell: Define the 'PowerPC' type in the elfnote.h ] Signed-off-by: Maxiwell S. Garcia Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190829155021.2915-2-maxiwell@linux.ibm.com --- arch/powerpc/include/asm/elfnote.h | 24 +++++++++++++++++++++++ arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/note.S | 40 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/include/asm/elfnote.h create mode 100644 arch/powerpc/kernel/note.S diff --git a/arch/powerpc/include/asm/elfnote.h b/arch/powerpc/include/asm/elfnote.h new file mode 100644 index 000000000000..a201b6e9ae44 --- /dev/null +++ b/arch/powerpc/include/asm/elfnote.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * PowerPC ELF notes. + * + * Copyright 2019, IBM Corporation + */ + +#ifndef __ASM_POWERPC_ELFNOTE_H__ +#define __ASM_POWERPC_ELFNOTE_H__ + +/* + * These note types should live in a SHT_NOTE segment and have + * "PowerPC" in the name field. + */ + +/* + * The capabilities supported/required by this kernel (bitmap). + * + * This type uses a bitmap as "desc" field. Each bit is described + * in arch/powerpc/kernel/note.S + */ +#define PPC_ELFNOTE_CAPABILITIES 1 + +#endif /* __ASM_POWERPC_ELFNOTE_H__ */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index ea0c69236789..d4eb50de13b1 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -53,7 +53,7 @@ obj-y := cputable.o ptrace.o syscalls.o \ dma-common.o obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ - paca.o nvram_64.o firmware.o + paca.o nvram_64.o firmware.o note.o obj-$(CONFIG_VDSO32) += vdso32/ obj-$(CONFIG_PPC_WATCHDOG) += watchdog.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o diff --git a/arch/powerpc/kernel/note.S b/arch/powerpc/kernel/note.S new file mode 100644 index 000000000000..bcdad15395dd --- /dev/null +++ b/arch/powerpc/kernel/note.S @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * PowerPC ELF notes. + * + * Copyright 2019, IBM Corporation + */ + +#include +#include + +/* + * Ultravisor-capable bit (PowerNV only). + * + * Bit 0 indicates that the powerpc kernel binary knows how to run in an + * ultravisor-enabled system. + * + * In an ultravisor-enabled system, some machine resources are now controlled + * by the ultravisor. If the kernel is not ultravisor-capable, but it ends up + * being run on a machine with ultravisor, the kernel will probably crash + * trying to access ultravisor resources. For instance, it may crash in early + * boot trying to set the partition table entry 0. + * + * In an ultravisor-enabled system, a bootloader could warn the user or prevent + * the kernel from being run if the PowerPC ultravisor capability doesn't exist + * or the Ultravisor-capable bit is not set. + */ +#ifdef CONFIG_PPC_POWERNV +#define PPCCAP_ULTRAVISOR_BIT (1 << 0) +#else +#define PPCCAP_ULTRAVISOR_BIT 0 +#endif + +/* + * Add the PowerPC Capabilities in the binary ELF note. It is a bitmap that + * can be used to advertise kernel capabilities to userland. + */ +#define PPC_CAPABILITIES_BITMAP (PPCCAP_ULTRAVISOR_BIT) + +ELFNOTE(PowerPC, PPC_ELFNOTE_CAPABILITIES, + .long PPC_CAPABILITIES_BITMAP) -- cgit v1.2.3 From 134cb3ab71c0736da0ec66d2cfd7f6ebba96c8cf Mon Sep 17 00:00:00 2001 From: "Maxiwell S. Garcia" Date: Thu, 29 Aug 2019 12:50:21 -0300 Subject: Documentation/powerpc: Add ELF note documentation The ELF note documentation describes the types and descriptors to be used with the PowerPC namespace. Signed-off-by: Maxiwell S. Garcia Signed-off-by: Claudio Carvalho Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190829155021.2915-3-maxiwell@linux.ibm.com --- Documentation/powerpc/elfnote.rst | 42 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 Documentation/powerpc/elfnote.rst diff --git a/Documentation/powerpc/elfnote.rst b/Documentation/powerpc/elfnote.rst new file mode 100644 index 000000000000..2a5c4beeb809 --- /dev/null +++ b/Documentation/powerpc/elfnote.rst @@ -0,0 +1,42 @@ +========================== +ELF Note PowerPC Namespace +========================== + +The PowerPC namespace in an ELF Note of the kernel binary is used to store +capabilities and information which can be used by a bootloader or userland. + +Types and Descriptors +--------------------- + +The types to be used with the "PowerPC" namesapce are defined in the +include/uapi/asm/elfnote.h + + 1) PPC_ELFNOTE_CAPABILITIES + +Define the capabilities supported/required by the kernel. This type uses a +bitmap as "descriptor" field. Each bit is described below: + +- Ultravisor-capable bit (PowerNV only). + + #define PPCCAP_ULTRAVISOR_BIT (1 << 0) + + Indicate that the powerpc kernel binary knows how to run in an + ultravisor-enabled system. + + In an ultravisor-enabled system, some machine resources are now controlled + by the ultravisor. If the kernel is not ultravisor-capable, but it ends up + being run on a machine with ultravisor, the kernel will probably crash + trying to access ultravisor resources. For instance, it may crash in early + boot trying to set the partition table entry 0. + + In an ultravisor-enabled system, a bootloader could warn the user or prevent + the kernel from being run if the PowerPC ultravisor capability doesn't exist + or the Ultravisor-capable bit is not set. + +References +---------- + +arch/powerpc/include/asm/elfnote.h +arch/powerpc/kernel/note.S + + -- cgit v1.2.3 From 250c6c31228d49f3b96855ec387cf37bbe7cb6a7 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 22 Aug 2019 00:48:32 -0300 Subject: Documentation/powerpc: Ultravisor API Protected Execution Facility (PEF) is an architectural change for POWER 9 that enables Secure Virtual Machines (SVMs). When enabled, PEF adds a new higher privileged mode, called Ultravisor mode, to POWER architecture. Along with the new mode there is new firmware called the Protected Execution Ultravisor (or Ultravisor for short). POWER 9 DD2.3 chips (PVR=0x004e1203) or greater will be PEF-capable. Attached documentation provides an overview of PEF and defines the API for various interfaces that must be implemented in the Ultravisor firmware as well as in the KVM Hypervisor. Based on input from Mike Anderson, Thiago Bauermann, Claudio Carvalho, Ben Herrenschmidt, Guerney Hunt, Paul Mackerras. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Ram Pai Signed-off-by: Guerney Hunt Reviewed-by: Claudio Carvalho Reviewed-by: Michael Anderson Reviewed-by: Thiago Bauermann Signed-off-by: Claudio Carvalho Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190822034838.27876-2-cclaudio@linux.ibm.com --- Documentation/powerpc/ultravisor.rst | 1057 ++++++++++++++++++++++++++++++++++ 1 file changed, 1057 insertions(+) create mode 100644 Documentation/powerpc/ultravisor.rst diff --git a/Documentation/powerpc/ultravisor.rst b/Documentation/powerpc/ultravisor.rst new file mode 100644 index 000000000000..94a149f34ec3 --- /dev/null +++ b/Documentation/powerpc/ultravisor.rst @@ -0,0 +1,1057 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. _ultravisor: + +============================ +Protected Execution Facility +============================ + +.. contents:: + :depth: 3 + +.. sectnum:: + :depth: 3 + +Protected Execution Facility +############################ + + Protected Execution Facility (PEF) is an architectural change for + POWER 9 that enables Secure Virtual Machines (SVMs). DD2.3 chips + (PVR=0x004e1203) or greater will be PEF-capable. A new ISA release + will include the PEF RFC02487 changes. + + When enabled, PEF adds a new higher privileged mode, called Ultravisor + mode, to POWER architecture. Along with the new mode there is new + firmware called the Protected Execution Ultravisor (or Ultravisor + for short). Ultravisor mode is the highest privileged mode in POWER + architecture. + + +------------------+ + | Privilege States | + +==================+ + | Problem | + +------------------+ + | Supervisor | + +------------------+ + | Hypervisor | + +------------------+ + | Ultravisor | + +------------------+ + + PEF protects SVMs from the hypervisor, privileged users, and other + VMs in the system. SVMs are protected while at rest and can only be + executed by an authorized machine. All virtual machines utilize + hypervisor services. The Ultravisor filters calls between the SVMs + and the hypervisor to assure that information does not accidentally + leak. All hypercalls except H_RANDOM are reflected to the hypervisor. + H_RANDOM is not reflected to prevent the hypervisor from influencing + random values in the SVM. + + To support this there is a refactoring of the ownership of resources + in the CPU. Some of the resources which were previously hypervisor + privileged are now ultravisor privileged. + +Hardware +======== + + The hardware changes include the following: + + * There is a new bit in the MSR that determines whether the current + process is running in secure mode, MSR(S) bit 41. MSR(S)=1, process + is in secure mode, MSR(s)=0 process is in normal mode. + + * The MSR(S) bit can only be set by the Ultravisor. + + * HRFID cannot be used to set the MSR(S) bit. If the hypervisor needs + to return to a SVM it must use an ultracall. It can determine if + the VM it is returning to is secure. + + * There is a new Ultravisor privileged register, SMFCTRL, which has an + enable/disable bit SMFCTRL(E). + + * The privilege of a process is now determined by three MSR bits, + MSR(S, HV, PR). In each of the tables below the modes are listed + from least privilege to highest privilege. The higher privilege + modes can access all the resources of the lower privilege modes. + + **Secure Mode MSR Settings** + + +---+---+---+---------------+ + | S | HV| PR|Privilege | + +===+===+===+===============+ + | 1 | 0 | 1 | Problem | + +---+---+---+---------------+ + | 1 | 0 | 0 | Privileged(OS)| + +---+---+---+---------------+ + | 1 | 1 | 0 | Ultravisor | + +---+---+---+---------------+ + | 1 | 1 | 1 | Reserved | + +---+---+---+---------------+ + + **Normal Mode MSR Settings** + + +---+---+---+---------------+ + | S | HV| PR|Privilege | + +===+===+===+===============+ + | 0 | 0 | 1 | Problem | + +---+---+---+---------------+ + | 0 | 0 | 0 | Privileged(OS)| + +---+---+---+---------------+ + | 0 | 1 | 0 | Hypervisor | + +---+---+---+---------------+ + | 0 | 1 | 1 | Problem (Host)| + +---+---+---+---------------+ + + * Memory is partitioned into secure and normal memory. Only processes + that are running in secure mode can access secure memory. + + * The hardware does not allow anything that is not running secure to + access secure memory. This means that the Hypervisor cannot access + the memory of the SVM without using an ultracall (asking the + Ultravisor). The Ultravisor will only allow the hypervisor to see + the SVM memory encrypted. + + * I/O systems are not allowed to directly address secure memory. This + limits the SVMs to virtual I/O only. + + * The architecture allows the SVM to share pages of memory with the + hypervisor that are not protected with encryption. However, this + sharing must be initiated by the SVM. + + * When a process is running in secure mode all hypercalls + (syscall lev=1) go to the Ultravisor. + + * When a process is in secure mode all interrupts go to the + Ultravisor. + + * The following resources have become Ultravisor privileged and + require an Ultravisor interface to manipulate: + + * Processor configurations registers (SCOMs). + + * Stop state information. + + * The debug registers CIABR, DAWR, and DAWRX when SMFCTRL(D) is set. + If SMFCTRL(D) is not set they do not work in secure mode. When set, + reading and writing requires an Ultravisor call, otherwise that + will cause a Hypervisor Emulation Assistance interrupt. + + * PTCR and partition table entries (partition table is in secure + memory). An attempt to write to PTCR will cause a Hypervisor + Emulation Assitance interrupt. + + * LDBAR (LD Base Address Register) and IMC (In-Memory Collection) + non-architected registers. An attempt to write to them will cause a + Hypervisor Emulation Assistance interrupt. + + * Paging for an SVM, sharing of memory with Hypervisor for an SVM. + (Including Virtual Processor Area (VPA) and virtual I/O). + + +Software/Microcode +================== + + The software changes include: + + * SVMs are created from normal VM using (open source) tooling supplied + by IBM. + + * All SVMs start as normal VMs and utilize an ultracall, UV_ESM + (Enter Secure Mode), to make the transition. + + * When the UV_ESM ultracall is made the Ultravisor copies the VM into + secure memory, decrypts the verification information, and checks the + integrity of the SVM. If the integrity check passes the Ultravisor + passes control in secure mode. + + * The verification information includes the pass phrase for the + encrypted disk associated with the SVM. This pass phrase is given + to the SVM when requested. + + * The Ultravisor is not involved in protecting the encrypted disk of + the SVM while at rest. + + * For external interrupts the Ultravisor saves the state of the SVM, + and reflects the interrupt to the hypervisor for processing. + For hypercalls, the Ultravisor inserts neutral state into all + registers not needed for the hypercall then reflects the call to + the hypervisor for processing. The H_RANDOM hypercall is performed + by the Ultravisor and not reflected. + + * For virtual I/O to work bounce buffering must be done. + + * The Ultravisor uses AES (IAPM) for protection of SVM memory. IAPM + is a mode of AES that provides integrity and secrecy concurrently. + + * The movement of data between normal and secure pages is coordinated + with the Ultravisor by a new HMM plug-in in the Hypervisor. + + The Ultravisor offers new services to the hypervisor and SVMs. These + are accessed through ultracalls. + +Terminology +=========== + + * Hypercalls: special system calls used to request services from + Hypervisor. + + * Normal memory: Memory that is accessible to Hypervisor. + + * Normal page: Page backed by normal memory and available to + Hypervisor. + + * Shared page: A page backed by normal memory and available to both + the Hypervisor/QEMU and the SVM (i.e page has mappings in SVM and + Hypervisor/QEMU). + + * Secure memory: Memory that is accessible only to Ultravisor and + SVMs. + + * Secure page: Page backed by secure memory and only available to + Ultravisor and SVM. + + * SVM: Secure Virtual Machine. + + * Ultracalls: special system calls used to request services from + Ultravisor. + + +Ultravisor calls API +#################### + + This section describes Ultravisor calls (ultracalls) needed to + support Secure Virtual Machines (SVM)s and Paravirtualized KVM. The + ultracalls allow the SVMs and Hypervisor to request services from the + Ultravisor such as accessing a register or memory region that can only + be accessed when running in Ultravisor-privileged mode. + + The specific service needed from an ultracall is specified in register + R3 (the first parameter to the ultracall). Other parameters to the + ultracall, if any, are specified in registers R4 through R12. + + Return value of all ultracalls is in register R3. Other output values + from the ultracall, if any, are returned in registers R4 through R12. + The only exception to this register usage is the ``UV_RETURN`` + ultracall described below. + + Each ultracall returns specific error codes, applicable in the context + of the ultracall. However, like with the PowerPC Architecture Platform + Reference (PAPR), if no specific error code is defined for a + particular situation, then the ultracall will fallback to an erroneous + parameter-position based code. i.e U_PARAMETER, U_P2, U_P3 etc + depending on the ultracall parameter that may have caused the error. + + Some ultracalls involve transferring a page of data between Ultravisor + and Hypervisor. Secure pages that are transferred from secure memory + to normal memory may be encrypted using dynamically generated keys. + When the secure pages are transferred back to secure memory, they may + be decrypted using the same dynamically generated keys. Generation and + management of these keys will be covered in a separate document. + + For now this only covers ultracalls currently implemented and being + used by Hypervisor and SVMs but others can be added here when it + makes sense. + + The full specification for all hypercalls/ultracalls will eventually + be made available in the public/OpenPower version of the PAPR + specification. + + **Note** + + If PEF is not enabled, the ultracalls will be redirected to the + Hypervisor which must handle/fail the calls. + +Ultracalls used by Hypervisor +============================= + + This section describes the virtual memory management ultracalls used + by the Hypervisor to manage SVMs. + +UV_PAGE_OUT +----------- + + Encrypt and move the contents of a page from secure memory to normal + memory. + +Syntax +~~~~~~ + +.. code-block:: c + + uint64_t ultracall(const uint64_t UV_PAGE_OUT, + uint16_t lpid, /* LPAR ID */ + uint64_t dest_ra, /* real address of destination page */ + uint64_t src_gpa, /* source guest-physical-address */ + uint8_t flags, /* flags */ + uint64_t order) /* page size order */ + +Return values +~~~~~~~~~~~~~ + + One of the following values: + + * U_SUCCESS on success. + * U_PARAMETER if ``lpid`` is invalid. + * U_P2 if ``dest_ra`` is invalid. + * U_P3 if the ``src_gpa`` address is invalid. + * U_P4 if any bit in the ``flags`` is unrecognized + * U_P5 if the ``order`` parameter is unsupported. + * U_FUNCTION if functionality is not supported. + * U_BUSY if page cannot be currently paged-out. + +Description +~~~~~~~~~~~ + + Encrypt the contents of a secure-page and make it available to + Hypervisor in a normal page. + + By default, the source page is unmapped from the SVM's partition- + scoped page table. But the Hypervisor can provide a hint to the + Ultravisor to retain the page mapping by setting the ``UV_SNAPSHOT`` + flag in ``flags`` parameter. + + If the source page is already a shared page the call returns + U_SUCCESS, without doing anything. + +Use cases +~~~~~~~~~ + + #. QEMU attempts to access an address belonging to the SVM but the + page frame for that address is not mapped into QEMU's address + space. In this case, the Hypervisor will allocate a page frame, + map it into QEMU's address space and issue the ``UV_PAGE_OUT`` + call to retrieve the encrypted contents of the page. + + #. When Ultravisor runs low on secure memory and it needs to page-out + an LRU page. In this case, Ultravisor will issue the + ``H_SVM_PAGE_OUT`` hypercall to the Hypervisor. The Hypervisor will + then allocate a normal page and issue the ``UV_PAGE_OUT`` ultracall + and the Ultravisor will encrypt and move the contents of the secure + page into the normal page. + + #. When Hypervisor accesses SVM data, the Hypervisor requests the + Ultravisor to transfer the corresponding page into a insecure page, + which the Hypervisor can access. The data in the normal page will + be encrypted though. + +UV_PAGE_IN +---------- + + Move the contents of a page from normal memory to secure memory. + +Syntax +~~~~~~ + +.. code-block:: c + + uint64_t ultracall(const uint64_t UV_PAGE_IN, + uint16_t lpid, /* the LPAR ID */ + uint64_t src_ra, /* source real address of page */ + uint64_t dest_gpa, /* destination guest physical address */ + uint64_t flags, /* flags */ + uint64_t order) /* page size order */ + +Return values +~~~~~~~~~~~~~ + + One of the following values: + + * U_SUCCESS on success. + * U_BUSY if page cannot be currently paged-in. + * U_FUNCTION if functionality is not supported + * U_PARAMETER if ``lpid`` is invalid. + * U_P2 if ``src_ra`` is invalid. + * U_P3 if the ``dest_gpa`` address is invalid. + * U_P4 if any bit in the ``flags`` is unrecognized + * U_P5 if the ``order`` parameter is unsupported. + +Description +~~~~~~~~~~~ + + Move the contents of the page identified by ``src_ra`` from normal + memory to secure memory and map it to the guest physical address + ``dest_gpa``. + + If `dest_gpa` refers to a shared address, map the page into the + partition-scoped page-table of the SVM. If `dest_gpa` is not shared, + copy the contents of the page into the corresponding secure page. + Depending on the context, decrypt the page before being copied. + + The caller provides the attributes of the page through the ``flags`` + parameter. Valid values for ``flags`` are: + + * CACHE_INHIBITED + * CACHE_ENABLED + * WRITE_PROTECTION + + The Hypervisor must pin the page in memory before making + ``UV_PAGE_IN`` ultracall. + +Use cases +~~~~~~~~~ + + #. When a normal VM switches to secure mode, all its pages residing + in normal memory, are moved into secure memory. + + #. When an SVM requests to share a page with Hypervisor the Hypervisor + allocates a page and informs the Ultravisor. + + #. When an SVM accesses a secure page that has been paged-out, + Ultravisor invokes the Hypervisor to locate the page. After + locating the page, the Hypervisor uses UV_PAGE_IN to make the + page available to Ultravisor. + +UV_PAGE_INVAL +------------- + + Invalidate the Ultravisor mapping of a page. + +Syntax +~~~~~~ + +.. code-block:: c + + uint64_t ultracall(const uint64_t UV_PAGE_INVAL, + uint16_t lpid, /* the LPAR ID */ + uint64_t guest_pa, /* destination guest-physical-address */ + uint64_t order) /* page size order */ + +Return values +~~~~~~~~~~~~~ + + One of the following values: + + * U_SUCCESS on success. + * U_PARAMETER if ``lpid`` is invalid. + * U_P2 if ``guest_pa`` is invalid (or corresponds to a secure + page mapping). + * U_P3 if the ``order`` is invalid. + * U_FUNCTION if functionality is not supported. + * U_BUSY if page cannot be currently invalidated. + +Description +~~~~~~~~~~~ + + This ultracall informs Ultravisor that the page mapping in Hypervisor + corresponding to the given guest physical address has been invalidated + and that the Ultravisor should not access the page. If the specified + ``guest_pa`` corresponds to a secure page, Ultravisor will ignore the + attempt to invalidate the page and return U_P2. + +Use cases +~~~~~~~~~ + + #. When a shared page is unmapped from the QEMU's page table, possibly + because it is paged-out to disk, Ultravisor needs to know that the + page should not be accessed from its side too. + + +UV_WRITE_PATE +------------- + + Validate and write the partition table entry (PATE) for a given + partition. + +Syntax +~~~~~~ + +.. code-block:: c + + uint64_t ultracall(const uint64_t UV_WRITE_PATE, + uint32_t lpid, /* the LPAR ID */ + uint64_t dw0 /* the first double word to write */ + uint64_t dw1) /* the second double word to write */ + +Return values +~~~~~~~~~~~~~ + + One of the following values: + + * U_SUCCESS on success. + * U_BUSY if PATE cannot be currently written to. + * U_FUNCTION if functionality is not supported. + * U_PARAMETER if ``lpid`` is invalid. + * U_P2 if ``dw0`` is invalid. + * U_P3 if the ``dw1`` address is invalid. + * U_PERMISSION if the Hypervisor is attempting to change the PATE + of a secure virtual machine or if called from a + context other than Hypervisor. + +Description +~~~~~~~~~~~ + + Validate and write a LPID and its partition-table-entry for the given + LPID. If the LPID is already allocated and initialized, this call + results in changing the partition table entry. + +Use cases +~~~~~~~~~ + + #. The Partition table resides in Secure memory and its entries, + called PATE (Partition Table Entries), point to the partition- + scoped page tables for the Hypervisor as well as each of the + virtual machines (both secure and normal). The Hypervisor + operates in partition 0 and its partition-scoped page tables + reside in normal memory. + + #. This ultracall allows the Hypervisor to register the partition- + scoped and process-scoped page table entries for the Hypervisor + and other partitions (virtual machines) with the Ultravisor. + + #. If the value of the PATE for an existing partition (VM) changes, + the TLB cache for the partition is flushed. + + #. The Hypervisor is responsible for allocating LPID. The LPID and + its PATE entry are registered together. The Hypervisor manages + the PATE entries for a normal VM and can change the PATE entry + anytime. Ultravisor manages the PATE entries for an SVM and + Hypervisor is not allowed to modify them. + +UV_RETURN +--------- + + Return control from the Hypervisor back to the Ultravisor after + processing an hypercall or interrupt that was forwarded (aka + *reflected*) to the Hypervisor. + +Syntax +~~~~~~ + +.. code-block:: c + + uint64_t ultracall(const uint64_t UV_RETURN) + +Return values +~~~~~~~~~~~~~ + + This call never returns to Hypervisor on success. It returns + U_INVALID if ultracall is not made from a Hypervisor context. + +Description +~~~~~~~~~~~ + + When an SVM makes an hypercall or incurs some other exception, the + Ultravisor usually forwards (aka *reflects*) the exceptions to the + Hypervisor. After processing the exception, Hypervisor uses the + ``UV_RETURN`` ultracall to return control back to the SVM. + + The expected register state on entry to this ultracall is: + + * Non-volatile registers are restored to their original values. + * If returning from an hypercall, register R0 contains the return + value (**unlike other ultracalls**) and, registers R4 through R12 + contain any output values of the hypercall. + * R3 contains the ultracall number, i.e UV_RETURN. + * If returning with a synthesized interrupt, R2 contains the + synthesized interrupt number. + +Use cases +~~~~~~~~~ + + #. Ultravisor relies on the Hypervisor to provide several services to + the SVM such as processing hypercall and other exceptions. After + processing the exception, Hypervisor uses UV_RETURN to return + control back to the Ultravisor. + + #. Hypervisor has to use this ultracall to return control to the SVM. + + +UV_REGISTER_MEM_SLOT +-------------------- + + Register an SVM address-range with specified properties. + +Syntax +~~~~~~ + +.. code-block:: c + + uint64_t ultracall(const uint64_t UV_REGISTER_MEM_SLOT, + uint64_t lpid, /* LPAR ID of the SVM */ + uint64_t start_gpa, /* start guest physical address */ + uint64_t size, /* size of address range in bytes */ + uint64_t flags /* reserved for future expansion */ + uint16_t slotid) /* slot identifier */ + +Return values +~~~~~~~~~~~~~ + + One of the following values: + + * U_SUCCESS on success. + * U_PARAMETER if ``lpid`` is invalid. + * U_P2 if ``start_gpa`` is invalid. + * U_P3 if ``size`` is invalid. + * U_P4 if any bit in the ``flags`` is unrecognized. + * U_P5 if the ``slotid`` parameter is unsupported. + * U_PERMISSION if called from context other than Hypervisor. + * U_FUNCTION if functionality is not supported. + + +Description +~~~~~~~~~~~ + + Register a memory range for an SVM. The memory range starts at the + guest physical address ``start_gpa`` and is ``size`` bytes long. + +Use cases +~~~~~~~~~ + + + #. When a virtual machine goes secure, all the memory slots managed by + the Hypervisor move into secure memory. The Hypervisor iterates + through each of memory slots, and registers the slot with + Ultravisor. Hypervisor may discard some slots such as those used + for firmware (SLOF). + + #. When new memory is hot-plugged, a new memory slot gets registered. + + +UV_UNREGISTER_MEM_SLOT +---------------------- + + Unregister an SVM address-range that was previously registered using + UV_REGISTER_MEM_SLOT. + +Syntax +~~~~~~ + +.. code-block:: c + + uint64_t ultracall(const uint64_t UV_UNREGISTER_MEM_SLOT, + uint64_t lpid, /* LPAR ID of the SVM */ + uint64_t slotid) /* reservation slotid */ + +Return values +~~~~~~~~~~~~~ + + One of the following values: + + * U_SUCCESS on success. + * U_FUNCTION if functionality is not supported. + * U_PARAMETER if ``lpid`` is invalid. + * U_P2 if ``slotid`` is invalid. + * U_PERMISSION if called from context other than Hypervisor. + +Description +~~~~~~~~~~~ + + Release the memory slot identified by ``slotid`` and free any + resources allocated towards the reservation. + +Use cases +~~~~~~~~~ + + #. Memory hot-remove. + + +UV_SVM_TERMINATE +---------------- + + Terminate an SVM and release its resources. + +Syntax +~~~~~~ + +.. code-block:: c + + uint64_t ultracall(const uint64_t UV_SVM_TERMINATE, + uint64_t lpid, /* LPAR ID of the SVM */) + +Return values +~~~~~~~~~~~~~ + + One of the following values: + + * U_SUCCESS on success. + * U_FUNCTION if functionality is not supported. + * U_PARAMETER if ``lpid`` is invalid. + * U_INVALID if VM is not secure. + * U_PERMISSION if not called from a Hypervisor context. + +Description +~~~~~~~~~~~ + + Terminate an SVM and release all its resources. + +Use cases +~~~~~~~~~ + + #. Called by Hypervisor when terminating an SVM. + + +Ultracalls used by SVM +====================== + +UV_SHARE_PAGE +------------- + + Share a set of guest physical pages with the Hypervisor. + +Syntax +~~~~~~ + +.. code-block:: c + + uint64_t ultracall(const uint64_t UV_SHARE_PAGE, + uint64_t gfn, /* guest page frame number */ + uint64_t num) /* number of pages of size PAGE_SIZE */ + +Return values +~~~~~~~~~~~~~ + + One of the following values: + + * U_SUCCESS on success. + * U_FUNCTION if functionality is not supported. + * U_INVALID if the VM is not secure. + * U_PARAMETER if ``gfn`` is invalid. + * U_P2 if ``num`` is invalid. + +Description +~~~~~~~~~~~ + + Share the ``num`` pages starting at guest physical frame number ``gfn`` + with the Hypervisor. Assume page size is PAGE_SIZE bytes. Zero the + pages before returning. + + If the address is already backed by a secure page, unmap the page and + back it with an insecure page, with the help of the Hypervisor. If it + is not backed by any page yet, mark the PTE as insecure and back it + with an insecure page when the address is accessed. If it is already + backed by an insecure page, zero the page and return. + +Use cases +~~~~~~~~~ + + #. The Hypervisor cannot access the SVM pages since they are backed by + secure pages. Hence an SVM must explicitly request Ultravisor for + pages it can share with Hypervisor. + + #. Shared pages are needed to support virtio and Virtual Processor Area + (VPA) in SVMs. + + +UV_UNSHARE_PAGE +--------------- + + Restore a shared SVM page to its initial state. + +Syntax +~~~~~~ + +.. code-block:: c + + uint64_t ultracall(const uint64_t UV_UNSHARE_PAGE, + uint64_t gfn, /* guest page frame number */ + uint73 num) /* number of pages of size PAGE_SIZE*/ + +Return values +~~~~~~~~~~~~~ + + One of the following values: + + * U_SUCCESS on success. + * U_FUNCTION if functionality is not supported. + * U_INVALID if VM is not secure. + * U_PARAMETER if ``gfn`` is invalid. + * U_P2 if ``num`` is invalid. + +Description +~~~~~~~~~~~ + + Stop sharing ``num`` pages starting at ``gfn`` with the Hypervisor. + Assume that the page size is PAGE_SIZE. Zero the pages before + returning. + + If the address is already backed by an insecure page, unmap the page + and back it with a secure page. Inform the Hypervisor to release + reference to its shared page. If the address is not backed by a page + yet, mark the PTE as secure and back it with a secure page when that + address is accessed. If it is already backed by an secure page zero + the page and return. + +Use cases +~~~~~~~~~ + + #. The SVM may decide to unshare a page from the Hypervisor. + + +UV_UNSHARE_ALL_PAGES +-------------------- + + Unshare all pages the SVM has shared with Hypervisor. + +Syntax +~~~~~~ + +.. code-block:: c + + uint64_t ultracall(const uint64_t UV_UNSHARE_ALL_PAGES) + +Return values +~~~~~~~~~~~~~ + + One of the following values: + + * U_SUCCESS on success. + * U_FUNCTION if functionality is not supported. + * U_INVAL if VM is not secure. + +Description +~~~~~~~~~~~ + + Unshare all shared pages from the Hypervisor. All unshared pages are + zeroed on return. Only pages explicitly shared by the SVM with the + Hypervisor (using UV_SHARE_PAGE ultracall) are unshared. Ultravisor + may internally share some pages with the Hypervisor without explicit + request from the SVM. These pages will not be unshared by this + ultracall. + +Use cases +~~~~~~~~~ + + #. This call is needed when ``kexec`` is used to boot a different + kernel. It may also be needed during SVM reset. + +UV_ESM +------ + + Secure the virtual machine (*enter secure mode*). + +Syntax +~~~~~~ + +.. code-block:: c + + uint64_t ultracall(const uint64_t UV_ESM, + uint64_t esm_blob_addr, /* location of the ESM blob */ + unint64_t fdt) /* Flattened device tree */ + +Return values +~~~~~~~~~~~~~ + + One of the following values: + + * U_SUCCESS on success (including if VM is already secure). + * U_FUNCTION if functionality is not supported. + * U_INVALID if VM is not secure. + * U_PARAMETER if ``esm_blob_addr`` is invalid. + * U_P2 if ``fdt`` is invalid. + * U_PERMISSION if any integrity checks fail. + * U_RETRY insufficient memory to create SVM. + * U_NO_KEY symmetric key unavailable. + +Description +~~~~~~~~~~~ + + Secure the virtual machine. On successful completion, return + control to the virtual machine at the address specified in the + ESM blob. + +Use cases +~~~~~~~~~ + + #. A normal virtual machine can choose to switch to a secure mode. + +Hypervisor Calls API +#################### + + This document describes the Hypervisor calls (hypercalls) that are + needed to support the Ultravisor. Hypercalls are services provided by + the Hypervisor to virtual machines and Ultravisor. + + Register usage for these hypercalls is identical to that of the other + hypercalls defined in the Power Architecture Platform Reference (PAPR) + document. i.e on input, register R3 identifies the specific service + that is being requested and registers R4 through R11 contain + additional parameters to the hypercall, if any. On output, register + R3 contains the return value and registers R4 through R9 contain any + other output values from the hypercall. + + This document only covers hypercalls currently implemented/planned + for Ultravisor usage but others can be added here when it makes sense. + + The full specification for all hypercalls/ultracalls will eventually + be made available in the public/OpenPower version of the PAPR + specification. + +Hypervisor calls to support Ultravisor +====================================== + + Following are the set of hypercalls needed to support Ultravisor. + +H_SVM_INIT_START +---------------- + + Begin the process of converting a normal virtual machine into an SVM. + +Syntax +~~~~~~ + +.. code-block:: c + + uint64_t hypercall(const uint64_t H_SVM_INIT_START) + +Return values +~~~~~~~~~~~~~ + + One of the following values: + + * H_SUCCESS on success. + +Description +~~~~~~~~~~~ + + Initiate the process of securing a virtual machine. This involves + coordinating with the Ultravisor, using ultracalls, to allocate + resources in the Ultravisor for the new SVM, transferring the VM's + pages from normal to secure memory etc. When the process is + completed, Ultravisor issues the H_SVM_INIT_DONE hypercall. + +Use cases +~~~~~~~~~ + + #. Ultravisor uses this hypercall to inform Hypervisor that a VM + has initiated the process of switching to secure mode. + + +H_SVM_INIT_DONE +--------------- + + Complete the process of securing an SVM. + +Syntax +~~~~~~ + +.. code-block:: c + + uint64_t hypercall(const uint64_t H_SVM_INIT_DONE) + +Return values +~~~~~~~~~~~~~ + + One of the following values: + + * H_SUCCESS on success. + * H_UNSUPPORTED if called from the wrong context (e.g. + from an SVM or before an H_SVM_INIT_START + hypercall). + +Description +~~~~~~~~~~~ + + Complete the process of securing a virtual machine. This call must + be made after a prior call to ``H_SVM_INIT_START`` hypercall. + +Use cases +~~~~~~~~~ + + On successfully securing a virtual machine, the Ultravisor informs + Hypervisor about it. Hypervisor can use this call to finish setting + up its internal state for this virtual machine. + + +H_SVM_PAGE_IN +------------- + + Move the contents of a page from normal memory to secure memory. + +Syntax +~~~~~~ + +.. code-block:: c + + uint64_t hypercall(const uint64_t H_SVM_PAGE_IN, + uint64_t guest_pa, /* guest-physical-address */ + uint64_t flags, /* flags */ + uint64_t order) /* page size order */ + +Return values +~~~~~~~~~~~~~ + + One of the following values: + + * H_SUCCESS on success. + * H_PARAMETER if ``guest_pa`` is invalid. + * H_P2 if ``flags`` is invalid. + * H_P3 if ``order`` of page is invalid. + +Description +~~~~~~~~~~~ + + Retrieve the content of the page, belonging to the VM at the specified + guest physical address. + + Only valid value(s) in ``flags`` are: + + * H_PAGE_IN_SHARED which indicates that the page is to be shared + with the Ultravisor. + + * H_PAGE_IN_NONSHARED indicates that the UV is not anymore + interested in the page. Applicable if the page is a shared page. + + The ``order`` parameter must correspond to the configured page size. + +Use cases +~~~~~~~~~ + + #. When a normal VM becomes a secure VM (using the UV_ESM ultracall), + the Ultravisor uses this hypercall to move contents of each page of + the VM from normal memory to secure memory. + + #. Ultravisor uses this hypercall to ask Hypervisor to provide a page + in normal memory that can be shared between the SVM and Hypervisor. + + #. Ultravisor uses this hypercall to page-in a paged-out page. This + can happen when the SVM touches a paged-out page. + + #. If SVM wants to disable sharing of pages with Hypervisor, it can + inform Ultravisor to do so. Ultravisor will then use this hypercall + and inform Hypervisor that it has released access to the normal + page. + +H_SVM_PAGE_OUT +--------------- + + Move the contents of the page to normal memory. + +Syntax +~~~~~~ + +.. code-block:: c + + uint64_t hypercall(const uint64_t H_SVM_PAGE_OUT, + uint64_t guest_pa, /* guest-physical-address */ + uint64_t flags, /* flags (currently none) */ + uint64_t order) /* page size order */ + +Return values +~~~~~~~~~~~~~ + + One of the following values: + + * H_SUCCESS on success. + * H_PARAMETER if ``guest_pa`` is invalid. + * H_P2 if ``flags`` is invalid. + * H_P3 if ``order`` is invalid. + +Description +~~~~~~~~~~~ + + Move the contents of the page identified by ``guest_pa`` to normal + memory. + + Currently ``flags`` is unused and must be set to 0. The ``order`` + parameter must correspond to the configured page size. + +Use cases +~~~~~~~~~ + + #. If Ultravisor is running low on secure pages, it can move the + contents of some secure pages, into normal pages using this + hypercall. The content will be encrypted. + +References +########## + +.. [1] `Supporting Protected Computing on IBM Power Architecture `_ -- cgit v1.2.3 From a49dddbdb0cca1d00fc9251e543a0aac09a6a65b Mon Sep 17 00:00:00 2001 From: Claudio Carvalho Date: Thu, 22 Aug 2019 00:48:33 -0300 Subject: powerpc/kernel: Add ucall_norets() ultravisor call handler The ultracalls (ucalls for short) allow the Secure Virtual Machines (SVM)s and hypervisor to request services from the ultravisor such as accessing a register or memory region that can only be accessed when running in ultravisor-privileged mode. This patch adds the ucall_norets() ultravisor call handler. The specific service needed from an ucall is specified in register R3 (the first parameter to the ucall). Other parameters to the ucall, if any, are specified in registers R4 through R12. Return value of all ucalls is in register R3. Other output values from the ucall, if any, are returned in registers R4 through R12. Each ucall returns specific error codes, applicable in the context of the ucall. However, like with the PowerPC Architecture Platform Reference (PAPR), if no specific error code is defined for a particular situation, then the ucall will fallback to an erroneous parameter-position based code. i.e U_PARAMETER, U_P2, U_P3 etc depending on the ucall parameter that may have caused the error. Every host kernel (powernv) needs to be able to do ucalls in case it ends up being run in a machine with ultravisor enabled. Otherwise, the kernel may crash early in boot trying to access ultravisor resources, for instance, trying to set the partition table entry 0. Secure guests also need to be able to do ucalls and its kernel may not have CONFIG_PPC_POWERNV=y. For that reason, the ucall.S file is placed under arch/powerpc/kernel. If ultravisor is not enabled, the ucalls will be redirected to the hypervisor which must handle/fail the call. Thanks to inputs from Ram Pai and Michael Anderson. Signed-off-by: Claudio Carvalho Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190822034838.27876-3-cclaudio@linux.ibm.com --- arch/powerpc/include/asm/asm-prototypes.h | 11 +++++++++++ arch/powerpc/include/asm/ultravisor-api.h | 23 +++++++++++++++++++++++ arch/powerpc/kernel/Makefile | 1 + arch/powerpc/kernel/ucall.S | 14 ++++++++++++++ 4 files changed, 49 insertions(+) create mode 100644 arch/powerpc/include/asm/ultravisor-api.h create mode 100644 arch/powerpc/kernel/ucall.S diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index ec1c97a8e8cb..e698f48cbc6d 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -34,6 +35,16 @@ extern struct static_key hcall_tracepoint_key; void __trace_hcall_entry(unsigned long opcode, unsigned long *args); void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf); +/* Ultravisor */ +#ifdef CONFIG_PPC_POWERNV +long ucall_norets(unsigned long opcode, ...); +#else +static inline long ucall_norets(unsigned long opcode, ...) +{ + return U_NOT_AVAILABLE; +} +#endif + /* OPAL */ int64_t __opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3, int64_t a4, int64_t a5, int64_t a6, int64_t a7, diff --git a/arch/powerpc/include/asm/ultravisor-api.h b/arch/powerpc/include/asm/ultravisor-api.h new file mode 100644 index 000000000000..88ffa78f9d61 --- /dev/null +++ b/arch/powerpc/include/asm/ultravisor-api.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Ultravisor API. + * + * Copyright 2019, IBM Corporation. + * + */ +#ifndef _ASM_POWERPC_ULTRAVISOR_API_H +#define _ASM_POWERPC_ULTRAVISOR_API_H + +#include + +/* Return codes */ +#define U_FUNCTION H_FUNCTION +#define U_NOT_AVAILABLE H_NOT_AVAILABLE +#define U_P2 H_P2 +#define U_P3 H_P3 +#define U_P4 H_P4 +#define U_P5 H_P5 +#define U_PARAMETER H_PARAMETER +#define U_SUCCESS H_SUCCESS + +#endif /* _ASM_POWERPC_ULTRAVISOR_API_H */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index d4eb50de13b1..934e64b28894 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -156,6 +156,7 @@ endif obj-$(CONFIG_EPAPR_PARAVIRT) += epapr_paravirt.o epapr_hcalls.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o +obj-$(CONFIG_PPC_POWERNV) += ucall.o # Disable GCOV, KCOV & sanitizers in odd or sensitive code GCOV_PROFILE_prom_init.o := n diff --git a/arch/powerpc/kernel/ucall.S b/arch/powerpc/kernel/ucall.S new file mode 100644 index 000000000000..07296bc39166 --- /dev/null +++ b/arch/powerpc/kernel/ucall.S @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Generic code to perform an ultravisor call. + * + * Copyright 2019, IBM Corporation. + * + */ +#include +#include + +_GLOBAL(ucall_norets) +EXPORT_SYMBOL_GPL(ucall_norets) + sc 2 /* Invoke the ultravisor */ + blr /* Return r3 = status */ -- cgit v1.2.3 From bb04ffe85eebebd64d5e673a9434d968e80f3aa1 Mon Sep 17 00:00:00 2001 From: Claudio Carvalho Date: Thu, 22 Aug 2019 00:48:34 -0300 Subject: powerpc/powernv: Introduce FW_FEATURE_ULTRAVISOR In PEF enabled systems, some of the resources which were previously hypervisor privileged are now ultravisor privileged and controlled by the ultravisor firmware. This adds FW_FEATURE_ULTRAVISOR to indicate if PEF is enabled. The host kernel can use FW_FEATURE_ULTRAVISOR, for instance, to skip accessing resources (e.g. PTCR and LDBAR) in case PEF is enabled. Signed-off-by: Claudio Carvalho [ andmike: Device node name to "ibm,ultravisor" ] Signed-off-by: Michael Anderson Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190822034838.27876-4-cclaudio@linux.ibm.com --- arch/powerpc/include/asm/firmware.h | 5 +++-- arch/powerpc/include/asm/ultravisor.h | 14 ++++++++++++++ arch/powerpc/kernel/prom.c | 4 ++++ arch/powerpc/platforms/powernv/Makefile | 1 + arch/powerpc/platforms/powernv/ultravisor.c | 24 ++++++++++++++++++++++++ 5 files changed, 46 insertions(+), 2 deletions(-) create mode 100644 arch/powerpc/include/asm/ultravisor.h create mode 100644 arch/powerpc/platforms/powernv/ultravisor.c diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index faeca8b76c8c..b3e214a97f3a 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -50,6 +50,7 @@ #define FW_FEATURE_DRC_INFO ASM_CONST(0x0000000800000000) #define FW_FEATURE_BLOCK_REMOVE ASM_CONST(0x0000001000000000) #define FW_FEATURE_PAPR_SCM ASM_CONST(0x0000002000000000) +#define FW_FEATURE_ULTRAVISOR ASM_CONST(0x0000004000000000) #ifndef __ASSEMBLY__ @@ -68,9 +69,9 @@ enum { FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN | FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 | FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE | - FW_FEATURE_PAPR_SCM, + FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR, FW_FEATURE_PSERIES_ALWAYS = 0, - FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL, + FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_ULTRAVISOR, FW_FEATURE_POWERNV_ALWAYS = 0, FW_FEATURE_PS3_POSSIBLE = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1, FW_FEATURE_PS3_ALWAYS = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1, diff --git a/arch/powerpc/include/asm/ultravisor.h b/arch/powerpc/include/asm/ultravisor.h new file mode 100644 index 000000000000..dc6e1ea198f2 --- /dev/null +++ b/arch/powerpc/include/asm/ultravisor.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Ultravisor definitions + * + * Copyright 2019, IBM Corporation. + * + */ +#ifndef _ASM_POWERPC_ULTRAVISOR_H +#define _ASM_POWERPC_ULTRAVISOR_H + +int early_init_dt_scan_ultravisor(unsigned long node, const char *uname, + int depth, void *data); + +#endif /* _ASM_POWERPC_ULTRAVISOR_H */ diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 7159e791a70d..5828f1c81dc9 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -55,6 +55,7 @@ #include #include #include +#include #include @@ -702,6 +703,9 @@ void __init early_init_devtree(void *params) #ifdef CONFIG_PPC_POWERNV /* Some machines might need OPAL info for debugging, grab it now. */ of_scan_flat_dt(early_init_dt_scan_opal, NULL); + + /* Scan tree for ultravisor feature */ + of_scan_flat_dt(early_init_dt_scan_ultravisor, NULL); #endif #ifdef CONFIG_FA_DUMP diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index da2e99efbd04..2c27c8ac00c8 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -4,6 +4,7 @@ obj-y += idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o +obj-y += ultravisor.o obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o diff --git a/arch/powerpc/platforms/powernv/ultravisor.c b/arch/powerpc/platforms/powernv/ultravisor.c new file mode 100644 index 000000000000..02ac57b4bded --- /dev/null +++ b/arch/powerpc/platforms/powernv/ultravisor.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Ultravisor high level interfaces + * + * Copyright 2019, IBM Corporation. + * + */ +#include +#include +#include + +#include +#include + +int __init early_init_dt_scan_ultravisor(unsigned long node, const char *uname, + int depth, void *data) +{ + if (!of_flat_dt_is_compatible(node, "ibm,ultravisor")) + return 0; + + powerpc_firmware_features |= FW_FEATURE_ULTRAVISOR; + pr_debug("Ultravisor detected!\n"); + return 1; +} -- cgit v1.2.3 From 139a1d2842ec181cf017502a46bb8d947682a960 Mon Sep 17 00:00:00 2001 From: Michael Anderson Date: Thu, 22 Aug 2019 00:48:35 -0300 Subject: powerpc/mm: Use UV_WRITE_PATE ucall to register a PATE When Ultravisor (UV) is enabled, the partition table is stored in secure memory and can only be accessed via the UV. The Hypervisor (HV) however maintains a copy of the partition table in normal memory to allow Nest MMU translations to occur (for normal VMs). The HV copy includes partition table entries (PATE)s for secure VMs which would currently be unused (Nest MMU translations cannot access secure memory) but they would be needed as we add functionality. This patch adds the UV_WRITE_PATE ucall which is used to update the PATE for a VM (both normal and secure) when Ultravisor is enabled. Signed-off-by: Michael Anderson Signed-off-by: Madhavan Srinivasan Signed-off-by: Ram Pai [ cclaudio: Write the PATE in HV's table before doing that in UV's ] Signed-off-by: Claudio Carvalho Reviewed-by: Ryan Grimm Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190822034838.27876-5-cclaudio@linux.ibm.com --- arch/powerpc/include/asm/ultravisor-api.h | 5 ++++ arch/powerpc/include/asm/ultravisor.h | 8 +++++ arch/powerpc/mm/book3s64/pgtable.c | 50 +++++++++++++++++++++++-------- 3 files changed, 50 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/include/asm/ultravisor-api.h b/arch/powerpc/include/asm/ultravisor-api.h index 88ffa78f9d61..8cd49abff4f3 100644 --- a/arch/powerpc/include/asm/ultravisor-api.h +++ b/arch/powerpc/include/asm/ultravisor-api.h @@ -11,6 +11,7 @@ #include /* Return codes */ +#define U_BUSY H_BUSY #define U_FUNCTION H_FUNCTION #define U_NOT_AVAILABLE H_NOT_AVAILABLE #define U_P2 H_P2 @@ -18,6 +19,10 @@ #define U_P4 H_P4 #define U_P5 H_P5 #define U_PARAMETER H_PARAMETER +#define U_PERMISSION H_PERMISSION #define U_SUCCESS H_SUCCESS +/* opcodes */ +#define UV_WRITE_PATE 0xF104 + #endif /* _ASM_POWERPC_ULTRAVISOR_API_H */ diff --git a/arch/powerpc/include/asm/ultravisor.h b/arch/powerpc/include/asm/ultravisor.h index dc6e1ea198f2..6fe1f365dec8 100644 --- a/arch/powerpc/include/asm/ultravisor.h +++ b/arch/powerpc/include/asm/ultravisor.h @@ -8,7 +8,15 @@ #ifndef _ASM_POWERPC_ULTRAVISOR_H #define _ASM_POWERPC_ULTRAVISOR_H +#include +#include + int early_init_dt_scan_ultravisor(unsigned long node, const char *uname, int depth, void *data); +static inline int uv_register_pate(u64 lpid, u64 dw0, u64 dw1) +{ + return ucall_norets(UV_WRITE_PATE, lpid, dw0, dw1); +} + #endif /* _ASM_POWERPC_ULTRAVISOR_H */ diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 7d0e0d0d22c4..4173f6931009 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include @@ -209,21 +211,10 @@ void __init mmu_partition_table_init(void) powernv_set_nmmu_ptcr(ptcr); } -void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, - unsigned long dw1) +static void flush_partition(unsigned int lpid, bool radix) { - unsigned long old = be64_to_cpu(partition_tb[lpid].patb0); - - partition_tb[lpid].patb0 = cpu_to_be64(dw0); - partition_tb[lpid].patb1 = cpu_to_be64(dw1); - - /* - * Global flush of TLBs and partition table caches for this lpid. - * The type of flush (hash or radix) depends on what the previous - * use of this partition ID was, not the new use. - */ asm volatile("ptesync" : : : "memory"); - if (old & PATB_HR) { + if (radix) { asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : @@ -237,6 +228,39 @@ void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, /* do we need fixup here ?*/ asm volatile("eieio; tlbsync; ptesync" : : : "memory"); } + +void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, + unsigned long dw1) +{ + unsigned long old = be64_to_cpu(partition_tb[lpid].patb0); + + /* + * When ultravisor is enabled, the partition table is stored in secure + * memory and can only be accessed doing an ultravisor call. However, we + * maintain a copy of the partition table in normal memory to allow Nest + * MMU translations to occur (for normal VMs). + * + * Therefore, here we always update partition_tb, regardless of whether + * we are running under an ultravisor or not. + */ + partition_tb[lpid].patb0 = cpu_to_be64(dw0); + partition_tb[lpid].patb1 = cpu_to_be64(dw1); + + /* + * If ultravisor is enabled, we do an ultravisor call to register the + * partition table entry (PATE), which also do a global flush of TLBs + * and partition table caches for the lpid. Otherwise, just do the + * flush. The type of flush (hash or radix) depends on what the previous + * use of the partition ID was, not the new use. + */ + if (firmware_has_feature(FW_FEATURE_ULTRAVISOR)) { + uv_register_pate(lpid, dw0, dw1); + pr_info("PATE registered by ultravisor: dw0 = 0x%lx, dw1 = 0x%lx\n", + dw0, dw1); + } else { + flush_partition(lpid, (old & PATB_HR)); + } +} EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry); static pmd_t *get_pmd_from_cache(struct mm_struct *mm) -- cgit v1.2.3 From 5223134029a87db925ecc9449f9501bad391a52e Mon Sep 17 00:00:00 2001 From: Claudio Carvalho Date: Thu, 22 Aug 2019 00:48:36 -0300 Subject: powerpc/mm: Write to PTCR only if ultravisor disabled In ultravisor enabled systems, PTCR becomes ultravisor privileged only for writing and an attempt to write to it will cause a Hypervisor Emulation Assitance interrupt. This patch uses the set_ptcr_when_no_uv() function to restrict PTCR writing to only when ultravisor is disabled. Signed-off-by: Claudio Carvalho Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190822034838.27876-6-cclaudio@linux.ibm.com --- arch/powerpc/include/asm/ultravisor.h | 12 ++++++++++++ arch/powerpc/mm/book3s64/hash_utils.c | 5 +++-- arch/powerpc/mm/book3s64/pgtable.c | 2 +- arch/powerpc/mm/book3s64/radix_pgtable.c | 8 +++++--- 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/ultravisor.h b/arch/powerpc/include/asm/ultravisor.h index 6fe1f365dec8..d7aa97aa7834 100644 --- a/arch/powerpc/include/asm/ultravisor.h +++ b/arch/powerpc/include/asm/ultravisor.h @@ -10,10 +10,22 @@ #include #include +#include int early_init_dt_scan_ultravisor(unsigned long node, const char *uname, int depth, void *data); +/* + * In ultravisor enabled systems, PTCR becomes ultravisor privileged only for + * writing and an attempt to write to it will cause a Hypervisor Emulation + * Assistance interrupt. + */ +static inline void set_ptcr_when_no_uv(u64 val) +{ + if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) + mtspr(SPRN_PTCR, val); +} + static inline int uv_register_pate(u64 lpid, u64 dw0, u64 dw1) { return ucall_norets(UV_WRITE_PATE, lpid, dw0, dw1); diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index b8ad14bb1170..88aab920caca 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -61,6 +61,7 @@ #include #include #include +#include #include @@ -1075,8 +1076,8 @@ void hash__early_init_mmu_secondary(void) if (!cpu_has_feature(CPU_FTR_ARCH_300)) mtspr(SPRN_SDR1, _SDR1); else - mtspr(SPRN_PTCR, - __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + set_ptcr_when_no_uv(__pa(partition_tb) | + (PATB_SIZE_SHIFT - 12)); } /* Initialize SLB */ slb_initialize(); diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 4173f6931009..01a7570c10e0 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -207,7 +207,7 @@ void __init mmu_partition_table_init(void) * 64 K size. */ ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12); - mtspr(SPRN_PTCR, ptcr); + set_ptcr_when_no_uv(ptcr); powernv_set_nmmu_ptcr(ptcr); } diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index b4ca9e95e678..5bc118b4a634 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -650,8 +651,9 @@ void radix__early_init_mmu_secondary(void) lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); - mtspr(SPRN_PTCR, - __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + set_ptcr_when_no_uv(__pa(partition_tb) | + (PATB_SIZE_SHIFT - 12)); + radix_init_amor(); } @@ -667,7 +669,7 @@ void radix__mmu_cleanup_all(void) if (!firmware_has_feature(FW_FEATURE_LPAR)) { lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT); - mtspr(SPRN_PTCR, 0); + set_ptcr_when_no_uv(0); powernv_set_nmmu_ptcr(0); radix__flush_tlb_all(); } -- cgit v1.2.3 From 512a5a6452b6d742b6f713184414d28cb6413080 Mon Sep 17 00:00:00 2001 From: Claudio Carvalho Date: Thu, 22 Aug 2019 00:48:37 -0300 Subject: powerpc/powernv: Access LDBAR only if ultravisor disabled LDBAR is a per-thread SPR populated and used by the thread-imc pmu driver to dump the data counter into memory. It contains memory along with few other configuration bits. LDBAR is populated and enabled only when any of the thread imc pmu events are monitored. In ultravisor enabled systems, LDBAR becomes ultravisor privileged and an attempt to write to it will cause a Hypervisor Emulation Assistance interrupt. In ultravisor enabled systems, the ultravisor is responsible to maintain the LDBAR (e.g. save and restore it). This restricts LDBAR access to only when ultravisor is disabled. Signed-off-by: Claudio Carvalho Reviewed-by: Ram Pai Reviewed-by: Ryan Grimm Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190822034838.27876-7-cclaudio@linux.ibm.com --- arch/powerpc/platforms/powernv/idle.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 09f49eed7fb8..78599bca66c2 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -675,7 +675,8 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on) sprs.ptcr = mfspr(SPRN_PTCR); sprs.rpr = mfspr(SPRN_RPR); sprs.tscr = mfspr(SPRN_TSCR); - sprs.ldbar = mfspr(SPRN_LDBAR); + if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) + sprs.ldbar = mfspr(SPRN_LDBAR); sprs_saved = true; @@ -789,7 +790,8 @@ core_woken: mtspr(SPRN_MMCR0, sprs.mmcr0); mtspr(SPRN_MMCR1, sprs.mmcr1); mtspr(SPRN_MMCR2, sprs.mmcr2); - mtspr(SPRN_LDBAR, sprs.ldbar); + if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) + mtspr(SPRN_LDBAR, sprs.ldbar); mtspr(SPRN_SPRG3, local_paca->sprg_vdso); -- cgit v1.2.3 From 6c85b7bc637b64e681760f62c0eafba2f56745c6 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 22 Aug 2019 00:48:38 -0300 Subject: powerpc/kvm: Use UV_RETURN ucall to return to ultravisor When an SVM makes an hypercall or incurs some other exception, the Ultravisor usually forwards (a.k.a. reflects) the exceptions to the Hypervisor. After processing the exception, Hypervisor uses the UV_RETURN ultracall to return control back to the SVM. The expected register state on entry to this ultracall is: * Non-volatile registers are restored to their original values. * If returning from an hypercall, register R0 contains the return value (unlike other ultracalls) and, registers R4 through R12 contain any output values of the hypercall. * R3 contains the ultracall number, i.e UV_RETURN. * If returning with a synthesized interrupt, R2 contains the synthesized interrupt number. Thanks to input from Paul Mackerras, Ram Pai and Mike Anderson. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Claudio Carvalho Acked-by: Paul Mackerras Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190822034838.27876-8-cclaudio@linux.ibm.com --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/include/asm/ultravisor-api.h | 1 + arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kvm/book3s_hv_rmhandlers.S | 39 ++++++++++++++++++++++++++----- 4 files changed, 36 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index e6e5f59aaa97..4bb552d639b8 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -283,6 +283,7 @@ struct kvm_arch { cpumask_t cpu_in_guest; u8 radix; u8 fwnmi_enabled; + u8 secure_guest; bool threads_indep; bool nested_enable; pgd_t *pgtable; diff --git a/arch/powerpc/include/asm/ultravisor-api.h b/arch/powerpc/include/asm/ultravisor-api.h index 8cd49abff4f3..6a0f9c74f959 100644 --- a/arch/powerpc/include/asm/ultravisor-api.h +++ b/arch/powerpc/include/asm/ultravisor-api.h @@ -24,5 +24,6 @@ /* opcodes */ #define UV_WRITE_PATE 0xF104 +#define UV_RETURN 0xF11C #endif /* _ASM_POWERPC_ULTRAVISOR_API_H */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 4ccb6b3a7fbd..484f54dab247 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -506,6 +506,7 @@ int main(void) OFFSET(KVM_VRMA_SLB_V, kvm, arch.vrma_slb_v); OFFSET(KVM_RADIX, kvm, arch.radix); OFFSET(KVM_FWNMI, kvm, arch.fwnmi_enabled); + OFFSET(KVM_SECURE_GUEST, kvm, arch.secure_guest); OFFSET(VCPU_DSISR, kvm_vcpu, arch.shregs.dsisr); OFFSET(VCPU_DAR, kvm_vcpu, arch.shregs.dar); OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 07181d0dfcb7..9a05b0d932ef 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -29,6 +29,7 @@ #include #include #include +#include /* Sign-extend HDEC if not on POWER9 */ #define EXTEND_HDEC(reg) \ @@ -1085,16 +1086,10 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r5, VCPU_LR(r4) - ld r6, VCPU_CR(r4) mtlr r5 - mtcr r6 ld r1, VCPU_GPR(R1)(r4) - ld r2, VCPU_GPR(R2)(r4) - ld r3, VCPU_GPR(R3)(r4) ld r5, VCPU_GPR(R5)(r4) - ld r6, VCPU_GPR(R6)(r4) - ld r7, VCPU_GPR(R7)(r4) ld r8, VCPU_GPR(R8)(r4) ld r9, VCPU_GPR(R9)(r4) ld r10, VCPU_GPR(R10)(r4) @@ -1112,10 +1107,42 @@ BEGIN_FTR_SECTION mtspr SPRN_HDSISR, r0 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + ld r6, VCPU_KVM(r4) + lbz r7, KVM_SECURE_GUEST(r6) + cmpdi r7, 0 + ld r6, VCPU_GPR(R6)(r4) + ld r7, VCPU_GPR(R7)(r4) + bne ret_to_ultra + + lwz r0, VCPU_CR(r4) + mtcr r0 + ld r0, VCPU_GPR(R0)(r4) + ld r2, VCPU_GPR(R2)(r4) + ld r3, VCPU_GPR(R3)(r4) ld r4, VCPU_GPR(R4)(r4) HRFI_TO_GUEST b . +/* + * Use UV_RETURN ultracall to return control back to the Ultravisor after + * processing an hypercall or interrupt that was forwarded (a.k.a. reflected) + * to the Hypervisor. + * + * All registers have already been loaded, except: + * R0 = hcall result + * R2 = SRR1, so UV can detect a synthesized interrupt (if any) + * R3 = UV_RETURN + */ +ret_to_ultra: + lwz r0, VCPU_CR(r4) + mtcr r0 + + ld r0, VCPU_GPR(R3)(r4) + mfspr r2, SPRN_SRR1 + li r3, 0 + ori r3, r3, UV_RETURN + ld r4, VCPU_GPR(R4)(r4) + sc 2 /* * Enter the guest on a P9 or later system where we have exactly -- cgit v1.2.3 From dea45ea7775240047f70e2631e468f6b65d09493 Mon Sep 17 00:00:00 2001 From: Claudio Carvalho Date: Wed, 28 Aug 2019 23:05:20 +1000 Subject: powerpc/powernv/opal-msglog: Refactor memcons code This patch refactors the code in opal-msglog that operates on the OPAL memory console in order to make it cleaner and also allow the reuse of the new memcons_* functions. Signed-off-by: Claudio Carvalho Signed-off-by: Michael Ellerman Tested-by: Claudio Carvalho Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190828130521.26764-1-mpe@ellerman.id.au --- arch/powerpc/platforms/powernv/opal-msglog.c | 57 +++++++++++++++++++--------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c index dc51d03c6370..d26da19a611f 100644 --- a/arch/powerpc/platforms/powernv/opal-msglog.c +++ b/arch/powerpc/platforms/powernv/opal-msglog.c @@ -29,23 +29,23 @@ struct memcons { static struct memcons *opal_memcons = NULL; -ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count) +ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count) { const char *conbuf; ssize_t ret; size_t first_read = 0; uint32_t out_pos, avail; - if (!opal_memcons) + if (!mc) return -ENODEV; - out_pos = be32_to_cpu(READ_ONCE(opal_memcons->out_pos)); + out_pos = be32_to_cpu(READ_ONCE(mc->out_pos)); /* Now we've read out_pos, put a barrier in before reading the new * data it points to in conbuf. */ smp_rmb(); - conbuf = phys_to_virt(be64_to_cpu(opal_memcons->obuf_phys)); + conbuf = phys_to_virt(be64_to_cpu(mc->obuf_phys)); /* When the buffer has wrapped, read from the out_pos marker to the end * of the buffer, and then read the remaining data as in the un-wrapped @@ -53,7 +53,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count) if (out_pos & MEMCONS_OUT_POS_WRAP) { out_pos &= MEMCONS_OUT_POS_MASK; - avail = be32_to_cpu(opal_memcons->obuf_size) - out_pos; + avail = be32_to_cpu(mc->obuf_size) - out_pos; ret = memory_read_from_buffer(to, count, &pos, conbuf + out_pos, avail); @@ -71,7 +71,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count) } /* Sanity check. The firmware should not do this to us. */ - if (out_pos > be32_to_cpu(opal_memcons->obuf_size)) { + if (out_pos > be32_to_cpu(mc->obuf_size)) { pr_err("OPAL: memory console corruption. Aborting read.\n"); return -EINVAL; } @@ -86,6 +86,11 @@ out: return ret; } +ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count) +{ + return memcons_copy(opal_memcons, to, pos, count); +} + static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj, struct bin_attribute *bin_attr, char *to, loff_t pos, size_t count) @@ -98,32 +103,48 @@ static struct bin_attribute opal_msglog_attr = { .read = opal_msglog_read }; -void __init opal_msglog_init(void) +struct memcons *memcons_init(struct device_node *node, const char *mc_prop_name) { u64 mcaddr; struct memcons *mc; - if (of_property_read_u64(opal_node, "ibm,opal-memcons", &mcaddr)) { - pr_warn("OPAL: Property ibm,opal-memcons not found, no message log\n"); - return; + if (of_property_read_u64(node, mc_prop_name, &mcaddr)) { + pr_warn("%s property not found, no message log\n", + mc_prop_name); + goto out_err; } mc = phys_to_virt(mcaddr); if (!mc) { - pr_warn("OPAL: memory console address is invalid\n"); - return; + pr_warn("memory console address is invalid\n"); + goto out_err; } if (be64_to_cpu(mc->magic) != MEMCONS_MAGIC) { - pr_warn("OPAL: memory console version is invalid\n"); - return; + pr_warn("memory console version is invalid\n"); + goto out_err; } - /* Report maximum size */ - opal_msglog_attr.size = be32_to_cpu(mc->ibuf_size) + - be32_to_cpu(mc->obuf_size); + return mc; + +out_err: + return NULL; +} + +u32 memcons_get_size(struct memcons *mc) +{ + return be32_to_cpu(mc->ibuf_size) + be32_to_cpu(mc->obuf_size); +} + +void __init opal_msglog_init(void) +{ + opal_memcons = memcons_init(opal_node, "ibm,opal-memcons"); + if (!opal_memcons) { + pr_warn("OPAL: memcons failed to load from ibm,opal-memcons\n"); + return; + } - opal_memcons = mc; + opal_msglog_attr.size = memcons_get_size(opal_memcons); } void __init opal_msglog_sysfs_init(void) -- cgit v1.2.3 From 68e0aa8ec5cedec48dd5b11df84afc956c8f85be Mon Sep 17 00:00:00 2001 From: Claudio Carvalho Date: Wed, 28 Aug 2019 23:05:21 +1000 Subject: powerpc/powernv: Add ultravisor message log interface The ultravisor (UV) provides an in-memory console which follows the OPAL in-memory console structure. This patch extends the OPAL msglog code to initialize the UV memory console and provide the "/sys/firmware/ultravisor/msglog" interface for userspace to view the UV message log. Signed-off-by: Claudio Carvalho Signed-off-by: Michael Ellerman Tested-by: Claudio Carvalho Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190828130521.26764-2-mpe@ellerman.id.au --- arch/powerpc/platforms/powernv/powernv.h | 5 ++++ arch/powerpc/platforms/powernv/ultravisor.c | 45 +++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h index fd4a1c5a6369..1aa51c4fa904 100644 --- a/arch/powerpc/platforms/powernv/powernv.h +++ b/arch/powerpc/platforms/powernv/powernv.h @@ -30,4 +30,9 @@ extern void opal_event_shutdown(void); bool cpu_core_split_required(void); +struct memcons; +ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count); +u32 memcons_get_size(struct memcons *mc); +struct memcons *memcons_init(struct device_node *node, const char *mc_prop_name); + #endif /* _POWERNV_H */ diff --git a/arch/powerpc/platforms/powernv/ultravisor.c b/arch/powerpc/platforms/powernv/ultravisor.c index 02ac57b4bded..e4a00ad06f9d 100644 --- a/arch/powerpc/platforms/powernv/ultravisor.c +++ b/arch/powerpc/platforms/powernv/ultravisor.c @@ -8,9 +8,15 @@ #include #include #include +#include #include #include +#include + +#include "powernv.h" + +static struct kobject *ultravisor_kobj; int __init early_init_dt_scan_ultravisor(unsigned long node, const char *uname, int depth, void *data) @@ -22,3 +28,42 @@ int __init early_init_dt_scan_ultravisor(unsigned long node, const char *uname, pr_debug("Ultravisor detected!\n"); return 1; } + +static struct memcons *uv_memcons; + +static ssize_t uv_msglog_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *to, + loff_t pos, size_t count) +{ + return memcons_copy(uv_memcons, to, pos, count); +} + +static struct bin_attribute uv_msglog_attr = { + .attr = {.name = "msglog", .mode = 0400}, + .read = uv_msglog_read +}; + +static int __init uv_init(void) +{ + struct device_node *node; + + if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) + return 0; + + node = of_find_compatible_node(NULL, NULL, "ibm,uv-firmware"); + if (!node) + return -ENODEV; + + uv_memcons = memcons_init(node, "memcons"); + if (!uv_memcons) + return -ENOENT; + + uv_msglog_attr.size = memcons_get_size(uv_memcons); + + ultravisor_kobj = kobject_create_and_add("ultravisor", firmware_kobj); + if (!ultravisor_kobj) + return -ENOMEM; + + return sysfs_create_bin_file(ultravisor_kobj, &uv_msglog_attr); +} +machine_subsys_initcall(powernv, uv_init); -- cgit v1.2.3 From 136bc0397ae21dbf63ca02e5775ad353a479cd2f Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Mon, 19 Aug 2019 23:13:12 -0300 Subject: powerpc/pseries: Introduce option to build secure virtual machines Introduce CONFIG_PPC_SVM to control support for secure guests and include Ultravisor-related helpers when it is selected Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190820021326.6884-3-bauerman@linux.ibm.com --- arch/powerpc/include/asm/asm-prototypes.h | 2 +- arch/powerpc/kernel/Makefile | 4 +++- arch/powerpc/platforms/pseries/Kconfig | 11 +++++++++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index e698f48cbc6d..49196d35e3bb 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -36,7 +36,7 @@ void __trace_hcall_entry(unsigned long opcode, unsigned long *args); void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf); /* Ultravisor */ -#ifdef CONFIG_PPC_POWERNV +#if defined(CONFIG_PPC_POWERNV) || defined(CONFIG_PPC_SVM) long ucall_norets(unsigned long opcode, ...); #else static inline long ucall_norets(unsigned long opcode, ...) diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 934e64b28894..c6ae0e7914bc 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -156,7 +156,9 @@ endif obj-$(CONFIG_EPAPR_PARAVIRT) += epapr_paravirt.o epapr_hcalls.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o -obj-$(CONFIG_PPC_POWERNV) += ucall.o +ifneq ($(CONFIG_PPC_POWERNV)$(CONFIG_PPC_SVM),) +obj-y += ucall.o +endif # Disable GCOV, KCOV & sanitizers in odd or sensitive code GCOV_PROFILE_prom_init.o := n diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index f7b484f55553..d09deb05bb66 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -145,3 +145,14 @@ config PAPR_SCM tristate "Support for the PAPR Storage Class Memory interface" help Enable access to hypervisor provided storage class memory. + +config PPC_SVM + bool "Secure virtual machine (SVM) support for POWER" + depends on PPC_PSERIES + help + There are certain POWER platforms which support secure guests using + the Protected Execution Facility, with the help of an Ultravisor + executing below the hypervisor layer. This enables support for + those guests. + + If unsure, say "N". -- cgit v1.2.3 From 528229d210781b2da66c6d257a326c21099982b5 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 19 Aug 2019 23:13:13 -0300 Subject: powerpc: Add support for adding an ESM blob to the zImage wrapper For secure VMs, the signing tool will create a ticket called the "ESM blob" for the Enter Secure Mode ultravisor call with the signatures of the kernel and initrd among other things. This adds support to the wrapper script for adding that blob via the "-e" option to the zImage.pseries. It also adds code to the zImage wrapper itself to retrieve and if necessary relocate the blob, and pass its address to Linux via the device-tree, to be later consumed by prom_init. Signed-off-by: Benjamin Herrenschmidt [ bauerman: Minor adjustments to some comments. ] Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190820021326.6884-4-bauerman@linux.ibm.com --- arch/powerpc/boot/main.c | 41 +++++++++++++++++++++++++++++++++++++++++ arch/powerpc/boot/ops.h | 2 ++ arch/powerpc/boot/wrapper | 24 +++++++++++++++++++++--- arch/powerpc/boot/zImage.lds.S | 8 ++++++++ 4 files changed, 72 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/boot/main.c b/arch/powerpc/boot/main.c index 102cc546444d..a9d209135975 100644 --- a/arch/powerpc/boot/main.c +++ b/arch/powerpc/boot/main.c @@ -146,6 +146,46 @@ static struct addr_range prep_initrd(struct addr_range vmlinux, void *chosen, return (struct addr_range){(void *)initrd_addr, initrd_size}; } +#ifdef __powerpc64__ +static void prep_esm_blob(struct addr_range vmlinux, void *chosen) +{ + unsigned long esm_blob_addr, esm_blob_size; + + /* Do we have an ESM (Enter Secure Mode) blob? */ + if (_esm_blob_end <= _esm_blob_start) + return; + + printf("Attached ESM blob at 0x%p-0x%p\n\r", + _esm_blob_start, _esm_blob_end); + esm_blob_addr = (unsigned long)_esm_blob_start; + esm_blob_size = _esm_blob_end - _esm_blob_start; + + /* + * If the ESM blob is too low it will be clobbered when the + * kernel relocates to its final location. In this case, + * allocate a safer place and move it. + */ + if (esm_blob_addr < vmlinux.size) { + void *old_addr = (void *)esm_blob_addr; + + printf("Allocating 0x%lx bytes for esm_blob ...\n\r", + esm_blob_size); + esm_blob_addr = (unsigned long)malloc(esm_blob_size); + if (!esm_blob_addr) + fatal("Can't allocate memory for ESM blob !\n\r"); + printf("Relocating ESM blob 0x%lx <- 0x%p (0x%lx bytes)\n\r", + esm_blob_addr, old_addr, esm_blob_size); + memmove((void *)esm_blob_addr, old_addr, esm_blob_size); + } + + /* Tell the kernel ESM blob address via device tree. */ + setprop_val(chosen, "linux,esm-blob-start", (u32)(esm_blob_addr)); + setprop_val(chosen, "linux,esm-blob-end", (u32)(esm_blob_addr + esm_blob_size)); +} +#else +static inline void prep_esm_blob(struct addr_range vmlinux, void *chosen) { } +#endif + /* A buffer that may be edited by tools operating on a zImage binary so as to * edit the command line passed to vmlinux (by setting /chosen/bootargs). * The buffer is put in it's own section so that tools may locate it easier. @@ -214,6 +254,7 @@ void start(void) vmlinux = prep_kernel(); initrd = prep_initrd(vmlinux, chosen, loader_info.initrd_addr, loader_info.initrd_size); + prep_esm_blob(vmlinux, chosen); prep_cmdline(chosen); printf("Finalizing device tree..."); diff --git a/arch/powerpc/boot/ops.h b/arch/powerpc/boot/ops.h index cd043726ed88..e0606766480f 100644 --- a/arch/powerpc/boot/ops.h +++ b/arch/powerpc/boot/ops.h @@ -251,6 +251,8 @@ extern char _initrd_start[]; extern char _initrd_end[]; extern char _dtb_start[]; extern char _dtb_end[]; +extern char _esm_blob_start[]; +extern char _esm_blob_end[]; static inline __attribute__((const)) int __ilog2_u32(u32 n) diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper index 5148ac271f28..ed6266367bc0 100755 --- a/arch/powerpc/boot/wrapper +++ b/arch/powerpc/boot/wrapper @@ -13,6 +13,7 @@ # -i initrd specify initrd file # -d devtree specify device-tree blob # -s tree.dts specify device-tree source file (needs dtc installed) +# -e esm_blob specify ESM blob for secure images # -c cache $kernel.strip.gz (use if present & newer, else make) # -C prefix specify command prefix for cross-building tools # (strip, objcopy, ld) @@ -37,6 +38,7 @@ platform=of initrd= dtb= dts= +esm_blob= cacheit= binary= compression=.gz @@ -60,9 +62,9 @@ tmpdir=. usage() { echo 'Usage: wrapper [-o output] [-p platform] [-i initrd]' >&2 - echo ' [-d devtree] [-s tree.dts] [-c] [-C cross-prefix]' >&2 - echo ' [-D datadir] [-W workingdir] [-Z (gz|xz|none)]' >&2 - echo ' [--no-compression] [vmlinux]' >&2 + echo ' [-d devtree] [-s tree.dts] [-e esm_blob]' >&2 + echo ' [-c] [-C cross-prefix] [-D datadir] [-W workingdir]' >&2 + echo ' [-Z (gz|xz|none)] [--no-compression] [vmlinux]' >&2 exit 1 } @@ -105,6 +107,11 @@ while [ "$#" -gt 0 ]; do [ "$#" -gt 0 ] || usage dtb="$1" ;; + -e) + shift + [ "$#" -gt 0 ] || usage + esm_blob="$1" + ;; -s) shift [ "$#" -gt 0 ] || usage @@ -218,9 +225,16 @@ objflags=-S tmp=$tmpdir/zImage.$$.o ksection=.kernel:vmlinux.strip isection=.kernel:initrd +esection=.kernel:esm_blob link_address='0x400000' make_space=y + +if [ -n "$esm_blob" -a "$platform" != "pseries" ]; then + echo "ESM blob not support on non-pseries platforms" >&2 + exit 1 +fi + case "$platform" in of) platformo="$object/of.o $object/epapr.o" @@ -477,6 +491,10 @@ if [ -n "$dtb" ]; then fi fi +if [ -n "$esm_blob" ]; then + addsec $tmp "$esm_blob" $esection +fi + if [ "$platform" != "miboot" ]; then if [ -n "$link_address" ] ; then text_start="-Ttext $link_address" diff --git a/arch/powerpc/boot/zImage.lds.S b/arch/powerpc/boot/zImage.lds.S index 4ac1e36edfe7..a21f3a76e06f 100644 --- a/arch/powerpc/boot/zImage.lds.S +++ b/arch/powerpc/boot/zImage.lds.S @@ -68,6 +68,14 @@ SECTIONS _initrd_end = .; } + . = ALIGN(4096); + .kernel:esm_blob : + { + _esm_blob_start = .; + *(.kernel:esm_blob) + _esm_blob_end = .; + } + #ifdef CONFIG_PPC64_BOOT_WRAPPER . = ALIGN(256); .got : -- cgit v1.2.3 From 6a9c930bd7751bf0630d8b9b73b07af5c6842da6 Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Mon, 19 Aug 2019 23:13:14 -0300 Subject: powerpc/prom_init: Add the ESM call to prom_init Make the Enter-Secure-Mode (ESM) ultravisor call to switch the VM to secure mode. Pass kernel base address and FDT address so that the Ultravisor is able to verify the integrity of the VM using information from the ESM blob. Add "svm=" command line option to turn on switching to secure mode. Signed-off-by: Ram Pai [ andmike: Generate an RTAS os-term hcall when the ESM ucall fails. ] Signed-off-by: Michael Anderson [ bauerman: Cleaned up the code a bit. ] Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190820021326.6884-5-bauerman@linux.ibm.com --- Documentation/admin-guide/kernel-parameters.txt | 5 ++ arch/powerpc/include/asm/ultravisor-api.h | 1 + arch/powerpc/kernel/prom_init.c | 96 +++++++++++++++++++++++++ 3 files changed, 102 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6d495aab4d0b..4923d8f726e8 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4620,6 +4620,11 @@ /sys/power/pm_test). Only available when CONFIG_PM_DEBUG is set. Default value is 5. + svm= [PPC] + Format: { on | off | y | n | 1 | 0 } + This parameter controls use of the Protected + Execution Facility on pSeries. + swapaccount=[0|1] [KNL] Enable accounting of swap in memory resource controller if no parameter or 1 is given or disable diff --git a/arch/powerpc/include/asm/ultravisor-api.h b/arch/powerpc/include/asm/ultravisor-api.h index 6a0f9c74f959..34c8711334cb 100644 --- a/arch/powerpc/include/asm/ultravisor-api.h +++ b/arch/powerpc/include/asm/ultravisor-api.h @@ -25,5 +25,6 @@ /* opcodes */ #define UV_WRITE_PATE 0xF104 #define UV_RETURN 0xF11C +#define UV_ESM 0xF110 #endif /* _ASM_POWERPC_ULTRAVISOR_API_H */ diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index f2b63b4e1943..a4e7762dd286 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -40,6 +40,7 @@ #include #include #include +#include #include @@ -171,6 +172,10 @@ static bool __prombss prom_radix_disable; static bool __prombss prom_xive_disable; #endif +#ifdef CONFIG_PPC_SVM +static bool __prombss prom_svm_enable; +#endif + struct platform_support { bool hash_mmu; bool radix_mmu; @@ -812,6 +817,17 @@ static void __init early_cmdline_parse(void) prom_debug("XIVE disabled from cmdline\n"); } #endif /* CONFIG_PPC_PSERIES */ + +#ifdef CONFIG_PPC_SVM + opt = prom_strstr(prom_cmd_line, "svm="); + if (opt) { + bool val; + + opt += sizeof("svm=") - 1; + if (!prom_strtobool(opt, &val)) + prom_svm_enable = val; + } +#endif /* CONFIG_PPC_SVM */ } #ifdef CONFIG_PPC_PSERIES @@ -1712,6 +1728,43 @@ static void __init prom_close_stdin(void) } } +#ifdef CONFIG_PPC_SVM +static int prom_rtas_hcall(uint64_t args) +{ + register uint64_t arg1 asm("r3") = H_RTAS; + register uint64_t arg2 asm("r4") = args; + + asm volatile("sc 1\n" : "=r" (arg1) : + "r" (arg1), + "r" (arg2) :); + return arg1; +} + +static struct rtas_args __prombss os_term_args; + +static void __init prom_rtas_os_term(char *str) +{ + phandle rtas_node; + __be32 val; + u32 token; + + prom_debug("%s: start...\n", __func__); + rtas_node = call_prom("finddevice", 1, 1, ADDR("/rtas")); + prom_debug("rtas_node: %x\n", rtas_node); + if (!PHANDLE_VALID(rtas_node)) + return; + + val = 0; + prom_getprop(rtas_node, "ibm,os-term", &val, sizeof(val)); + token = be32_to_cpu(val); + prom_debug("ibm,os-term: %x\n", token); + if (token == 0) + prom_panic("Could not get token for ibm,os-term\n"); + os_term_args.token = cpu_to_be32(token); + prom_rtas_hcall((uint64_t)&os_term_args); +} +#endif /* CONFIG_PPC_SVM */ + /* * Allocate room for and instantiate RTAS */ @@ -3168,6 +3221,46 @@ static void unreloc_toc(void) #endif #endif +#ifdef CONFIG_PPC_SVM +/* + * Perform the Enter Secure Mode ultracall. + */ +static int enter_secure_mode(unsigned long kbase, unsigned long fdt) +{ + register unsigned long r3 asm("r3") = UV_ESM; + register unsigned long r4 asm("r4") = kbase; + register unsigned long r5 asm("r5") = fdt; + + asm volatile("sc 2" : "+r"(r3) : "r"(r4), "r"(r5)); + + return r3; +} + +/* + * Call the Ultravisor to transfer us to secure memory if we have an ESM blob. + */ +static void setup_secure_guest(unsigned long kbase, unsigned long fdt) +{ + int ret; + + if (!prom_svm_enable) + return; + + /* Switch to secure mode. */ + prom_printf("Switching to secure mode.\n"); + + ret = enter_secure_mode(kbase, fdt); + if (ret != U_SUCCESS) { + prom_printf("Returned %d from switching to secure mode.\n", ret); + prom_rtas_os_term("Switch to secure mode failed.\n"); + } +} +#else +static void setup_secure_guest(unsigned long kbase, unsigned long fdt) +{ +} +#endif /* CONFIG_PPC_SVM */ + /* * We enter here early on, when the Open Firmware prom is still * handling exceptions and the MMU hash table for us. @@ -3366,6 +3459,9 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, unreloc_toc(); #endif + /* Move to secure memory if we're supposed to be secure guests. */ + setup_secure_guest(kbase, hdr); + __start(hdr, kbase, 0, 0, 0, 0, 0); return 0; -- cgit v1.2.3 From f7777e008cad17bf757ca256709070c07efd8283 Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Mon, 19 Aug 2019 23:13:15 -0300 Subject: powerpc/pseries/svm: Add helpers for UV_SHARE_PAGE and UV_UNSHARE_PAGE These functions are used when the guest wants to grant the hypervisor access to certain pages. Signed-off-by: Ram Pai Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190820021326.6884-6-bauerman@linux.ibm.com --- arch/powerpc/include/asm/ultravisor-api.h | 2 ++ arch/powerpc/include/asm/ultravisor.h | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/arch/powerpc/include/asm/ultravisor-api.h b/arch/powerpc/include/asm/ultravisor-api.h index 34c8711334cb..0f5b2d718bfc 100644 --- a/arch/powerpc/include/asm/ultravisor-api.h +++ b/arch/powerpc/include/asm/ultravisor-api.h @@ -26,5 +26,7 @@ #define UV_WRITE_PATE 0xF104 #define UV_RETURN 0xF11C #define UV_ESM 0xF110 +#define UV_SHARE_PAGE 0xF130 +#define UV_UNSHARE_PAGE 0xF134 #endif /* _ASM_POWERPC_ULTRAVISOR_API_H */ diff --git a/arch/powerpc/include/asm/ultravisor.h b/arch/powerpc/include/asm/ultravisor.h index d7aa97aa7834..e5c8413de06f 100644 --- a/arch/powerpc/include/asm/ultravisor.h +++ b/arch/powerpc/include/asm/ultravisor.h @@ -31,4 +31,14 @@ static inline int uv_register_pate(u64 lpid, u64 dw0, u64 dw1) return ucall_norets(UV_WRITE_PATE, lpid, dw0, dw1); } +static inline int uv_share_page(u64 pfn, u64 npages) +{ + return ucall_norets(UV_SHARE_PAGE, pfn, npages); +} + +static inline int uv_unshare_page(u64 pfn, u64 npages) +{ + return ucall_norets(UV_UNSHARE_PAGE, pfn, npages); +} + #endif /* _ASM_POWERPC_ULTRAVISOR_H */ -- cgit v1.2.3 From 7f70c3815a352f76d69fc5642ae9685033a428c0 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Mon, 19 Aug 2019 23:13:16 -0300 Subject: powerpc: Introduce the MSR_S bit Protected Execution Facility (PEF) is an architectural change for POWER 9 that enables Secure Virtual Machines (SVMs). When enabled, PEF adds a new higher privileged mode, called Ultravisor mode, to POWER architecture. The hardware changes include the following: * There is a new bit in the MSR that determines whether the current process is running in secure mode, MSR(S) bit 41. MSR(S)=1, process is in secure mode, MSR(s)=0 process is in normal mode. * The MSR(S) bit can only be set by the Ultravisor. * HRFID cannot be used to set the MSR(S) bit. If the hypervisor needs to return to a SVM it must use an ultracall. It can determine if the VM it is returning to is secure. * The privilege of a process is now determined by three MSR bits, MSR(S, HV, PR). In each of the tables below the modes are listed from least privilege to highest privilege. The higher privilege modes can access all the resources of the lower privilege modes. **Secure Mode MSR Settings** +---+---+---+---------------+ | S | HV| PR|Privilege | +===+===+===+===============+ | 1 | 0 | 1 | Problem | +---+---+---+---------------+ | 1 | 0 | 0 | Privileged(OS)| +---+---+---+---------------+ | 1 | 1 | 0 | Ultravisor | +---+---+---+---------------+ | 1 | 1 | 1 | Reserved | +---+---+---+---------------+ **Normal Mode MSR Settings** +---+---+---+---------------+ | S | HV| PR|Privilege | +===+===+===+===============+ | 0 | 0 | 1 | Problem | +---+---+---+---------------+ | 0 | 0 | 0 | Privileged(OS)| +---+---+---+---------------+ | 0 | 1 | 0 | Hypervisor | +---+---+---+---------------+ | 0 | 1 | 1 | Problem (HV) | +---+---+---+---------------+ Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Ram Pai [ cclaudio: Update the commit message ] Signed-off-by: Claudio Carvalho Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190820021326.6884-7-bauerman@linux.ibm.com --- arch/powerpc/include/asm/reg.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 10caa145f98b..ec3714cf0989 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -38,6 +38,7 @@ #define MSR_TM_LG 32 /* Trans Mem Available */ #define MSR_VEC_LG 25 /* Enable AltiVec */ #define MSR_VSX_LG 23 /* Enable VSX */ +#define MSR_S_LG 22 /* Secure state */ #define MSR_POW_LG 18 /* Enable Power Management */ #define MSR_WE_LG 18 /* Wait State Enable */ #define MSR_TGPR_LG 17 /* TLB Update registers in use */ @@ -71,11 +72,13 @@ #define MSR_SF __MASK(MSR_SF_LG) /* Enable 64 bit mode */ #define MSR_ISF __MASK(MSR_ISF_LG) /* Interrupt 64b mode valid on 630 */ #define MSR_HV __MASK(MSR_HV_LG) /* Hypervisor state */ +#define MSR_S __MASK(MSR_S_LG) /* Secure state */ #else /* so tests for these bits fail on 32-bit */ #define MSR_SF 0 #define MSR_ISF 0 #define MSR_HV 0 +#define MSR_S 0 #endif /* -- cgit v1.2.3 From e311a92da18cbdd4972dab0cda88b1b8484b8fef Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Mon, 19 Aug 2019 23:13:17 -0300 Subject: powerpc/pseries: Add and use LPPACA_SIZE constant Helps document what the hard-coded number means. Also take the opportunity to fix an #endif comment. Suggested-by: Alexey Kardashevskiy Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190820021326.6884-8-bauerman@linux.ibm.com --- arch/powerpc/kernel/paca.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index e3ad8aa4730d..612fc87ef785 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -52,6 +52,8 @@ static void *__init alloc_paca_data(unsigned long size, unsigned long align, #ifdef CONFIG_PPC_PSERIES +#define LPPACA_SIZE 0x400 + /* * See asm/lppaca.h for more detail. * @@ -65,7 +67,7 @@ static inline void init_lppaca(struct lppaca *lppaca) *lppaca = (struct lppaca) { .desc = cpu_to_be32(0xd397d781), /* "LpPa" */ - .size = cpu_to_be16(0x400), + .size = cpu_to_be16(LPPACA_SIZE), .fpregs_in_use = 1, .slb_count = cpu_to_be16(64), .vmxregs_in_use = 0, @@ -75,19 +77,18 @@ static inline void init_lppaca(struct lppaca *lppaca) static struct lppaca * __init new_lppaca(int cpu, unsigned long limit) { struct lppaca *lp; - size_t size = 0x400; - BUILD_BUG_ON(size < sizeof(struct lppaca)); + BUILD_BUG_ON(sizeof(struct lppaca) > LPPACA_SIZE); if (early_cpu_has_feature(CPU_FTR_HVMODE)) return NULL; - lp = alloc_paca_data(size, 0x400, limit, cpu); + lp = alloc_paca_data(LPPACA_SIZE, 0x400, limit, cpu); init_lppaca(lp); return lp; } -#endif /* CONFIG_PPC_BOOK3S */ +#endif /* CONFIG_PPC_PSERIES */ #ifdef CONFIG_PPC_BOOK3S_64 -- cgit v1.2.3 From bd104e6db6f0ad124e507a9ecf1a468efe5697db Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 19 Aug 2019 23:13:18 -0300 Subject: powerpc/pseries/svm: Use shared memory for LPPACA structures LPPACA structures need to be shared with the host. Hence they need to be in shared memory. Instead of allocating individual chunks of memory for a given structure from memblock, a contiguous chunk of memory is allocated and then converted into shared memory. Subsequent allocation requests will come from the contiguous chunk which will be always shared memory for all structures. While we are able to use a kmem_cache constructor for the Debug Trace Log, LPPACAs are allocated very early in the boot process (before SLUB is available) so we need to use a simpler scheme here. Introduce helper is_svm_platform() which uses the S bit of the MSR to tell whether we're running as a secure guest. Signed-off-by: Anshuman Khandual Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190820021326.6884-9-bauerman@linux.ibm.com --- arch/powerpc/include/asm/svm.h | 26 +++++++++++++++++++++++++ arch/powerpc/kernel/paca.c | 43 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/include/asm/svm.h diff --git a/arch/powerpc/include/asm/svm.h b/arch/powerpc/include/asm/svm.h new file mode 100644 index 000000000000..2689d8d841f8 --- /dev/null +++ b/arch/powerpc/include/asm/svm.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * SVM helper functions + * + * Copyright 2018 Anshuman Khandual, IBM Corporation. + */ + +#ifndef _ASM_POWERPC_SVM_H +#define _ASM_POWERPC_SVM_H + +#ifdef CONFIG_PPC_SVM + +static inline bool is_secure_guest(void) +{ + return mfmsr() & MSR_S; +} + +#else /* CONFIG_PPC_SVM */ + +static inline bool is_secure_guest(void) +{ + return false; +} + +#endif /* CONFIG_PPC_SVM */ +#endif /* _ASM_POWERPC_SVM_H */ diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 612fc87ef785..949eceb254d8 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include "setup.h" @@ -54,6 +56,41 @@ static void *__init alloc_paca_data(unsigned long size, unsigned long align, #define LPPACA_SIZE 0x400 +static void *__init alloc_shared_lppaca(unsigned long size, unsigned long align, + unsigned long limit, int cpu) +{ + size_t shared_lppaca_total_size = PAGE_ALIGN(nr_cpu_ids * LPPACA_SIZE); + static unsigned long shared_lppaca_size; + static void *shared_lppaca; + void *ptr; + + if (!shared_lppaca) { + memblock_set_bottom_up(true); + + shared_lppaca = + memblock_alloc_try_nid(shared_lppaca_total_size, + PAGE_SIZE, MEMBLOCK_LOW_LIMIT, + limit, NUMA_NO_NODE); + if (!shared_lppaca) + panic("cannot allocate shared data"); + + memblock_set_bottom_up(false); + uv_share_page(PHYS_PFN(__pa(shared_lppaca)), + shared_lppaca_total_size >> PAGE_SHIFT); + } + + ptr = shared_lppaca + shared_lppaca_size; + shared_lppaca_size += size; + + /* + * This is very early in boot, so no harm done if the kernel crashes at + * this point. + */ + BUG_ON(shared_lppaca_size >= shared_lppaca_total_size); + + return ptr; +} + /* * See asm/lppaca.h for more detail. * @@ -83,7 +120,11 @@ static struct lppaca * __init new_lppaca(int cpu, unsigned long limit) if (early_cpu_has_feature(CPU_FTR_HVMODE)) return NULL; - lp = alloc_paca_data(LPPACA_SIZE, 0x400, limit, cpu); + if (is_secure_guest()) + lp = alloc_shared_lppaca(LPPACA_SIZE, 0x400, limit, cpu); + else + lp = alloc_paca_data(LPPACA_SIZE, 0x400, limit, cpu); + init_lppaca(lp); return lp; -- cgit v1.2.3 From d5394c059da9786043934bed474562cde124e7d3 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 19 Aug 2019 23:13:19 -0300 Subject: powerpc/pseries/svm: Use shared memory for Debug Trace Log (DTL) Secure guests need to share the DTL buffers with the hypervisor. To that end, use a kmem_cache constructor which converts the underlying buddy allocated SLUB cache pages into shared memory. Signed-off-by: Anshuman Khandual Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190820021326.6884-10-bauerman@linux.ibm.com --- arch/powerpc/include/asm/svm.h | 5 +++++ arch/powerpc/platforms/pseries/Makefile | 1 + arch/powerpc/platforms/pseries/setup.c | 5 ++++- arch/powerpc/platforms/pseries/svm.c | 40 +++++++++++++++++++++++++++++++++ 4 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/platforms/pseries/svm.c diff --git a/arch/powerpc/include/asm/svm.h b/arch/powerpc/include/asm/svm.h index 2689d8d841f8..85580b30aba4 100644 --- a/arch/powerpc/include/asm/svm.h +++ b/arch/powerpc/include/asm/svm.h @@ -15,6 +15,9 @@ static inline bool is_secure_guest(void) return mfmsr() & MSR_S; } +void dtl_cache_ctor(void *addr); +#define get_dtl_cache_ctor() (is_secure_guest() ? dtl_cache_ctor : NULL) + #else /* CONFIG_PPC_SVM */ static inline bool is_secure_guest(void) @@ -22,5 +25,7 @@ static inline bool is_secure_guest(void) return false; } +#define get_dtl_cache_ctor() NULL + #endif /* CONFIG_PPC_SVM */ #endif /* _ASM_POWERPC_SVM_H */ diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index ab3d59aeacca..a420ef4c9d8e 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_IBMVIO) += vio.o obj-$(CONFIG_IBMEBUS) += ibmebus.o obj-$(CONFIG_PAPR_SCM) += papr_scm.o obj-$(CONFIG_PPC_SPLPAR) += vphn.o +obj-$(CONFIG_PPC_SVM) += svm.o ifdef CONFIG_PPC_PSERIES obj-$(CONFIG_SUSPEND) += suspend.o diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index f5940cc71c37..d8930c3a8a11 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -69,6 +69,7 @@ #include #include #include +#include #include "pseries.h" #include "../../../../drivers/pci/pci.h" @@ -297,8 +298,10 @@ static inline int alloc_dispatch_logs(void) static int alloc_dispatch_log_kmem_cache(void) { + void (*ctor)(void *) = get_dtl_cache_ctor(); + dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES, - DISPATCH_LOG_BYTES, 0, NULL); + DISPATCH_LOG_BYTES, 0, ctor); if (!dtl_cache) { pr_warn("Failed to create dispatch trace log buffer cache\n"); pr_warn("Stolen time statistics will be unreliable\n"); diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c new file mode 100644 index 000000000000..2b2b1a77ca1e --- /dev/null +++ b/arch/powerpc/platforms/pseries/svm.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Secure VM platform + * + * Copyright 2018 IBM Corporation + * Author: Anshuman Khandual + */ + +#include +#include + +/* There's one dispatch log per CPU. */ +#define NR_DTL_PAGE (DISPATCH_LOG_BYTES * CONFIG_NR_CPUS / PAGE_SIZE) + +static struct page *dtl_page_store[NR_DTL_PAGE]; +static long dtl_nr_pages; + +static bool is_dtl_page_shared(struct page *page) +{ + long i; + + for (i = 0; i < dtl_nr_pages; i++) + if (dtl_page_store[i] == page) + return true; + + return false; +} + +void dtl_cache_ctor(void *addr) +{ + unsigned long pfn = PHYS_PFN(__pa(addr)); + struct page *page = pfn_to_page(pfn); + + if (!is_dtl_page_shared(page)) { + dtl_page_store[dtl_nr_pages] = page; + dtl_nr_pages++; + WARN_ON(dtl_nr_pages >= NR_DTL_PAGE); + uv_share_page(pfn, 1); + } +} -- cgit v1.2.3 From 256ba2c1689efd4f5383cf7ebe2f9970c198b79d Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Mon, 19 Aug 2019 23:13:20 -0300 Subject: powerpc/pseries/svm: Unshare all pages before kexecing a new kernel A new kernel deserves a clean slate. Any pages shared with the hypervisor is unshared before invoking the new kernel. However there are exceptions. If the new kernel is invoked to dump the current kernel, or if there is a explicit request to preserve the state of the current kernel, unsharing of pages is skipped. NOTE: While testing crashkernel, make sure at least 256M is reserved for crashkernel. Otherwise SWIOTLB allocation will fail and crash kernel will fail to boot. Signed-off-by: Ram Pai Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190820021326.6884-11-bauerman@linux.ibm.com --- arch/powerpc/include/asm/ultravisor-api.h | 1 + arch/powerpc/include/asm/ultravisor.h | 5 +++++ arch/powerpc/kernel/machine_kexec_64.c | 9 +++++++++ 3 files changed, 15 insertions(+) diff --git a/arch/powerpc/include/asm/ultravisor-api.h b/arch/powerpc/include/asm/ultravisor-api.h index 0f5b2d718bfc..4fcda1d5793d 100644 --- a/arch/powerpc/include/asm/ultravisor-api.h +++ b/arch/powerpc/include/asm/ultravisor-api.h @@ -28,5 +28,6 @@ #define UV_ESM 0xF110 #define UV_SHARE_PAGE 0xF130 #define UV_UNSHARE_PAGE 0xF134 +#define UV_UNSHARE_ALL_PAGES 0xF140 #endif /* _ASM_POWERPC_ULTRAVISOR_API_H */ diff --git a/arch/powerpc/include/asm/ultravisor.h b/arch/powerpc/include/asm/ultravisor.h index e5c8413de06f..b1bc2e043ed4 100644 --- a/arch/powerpc/include/asm/ultravisor.h +++ b/arch/powerpc/include/asm/ultravisor.h @@ -41,4 +41,9 @@ static inline int uv_unshare_page(u64 pfn, u64 npages) return ucall_norets(UV_UNSHARE_PAGE, pfn, npages); } +static inline int uv_unshare_all_pages(void) +{ + return ucall_norets(UV_UNSHARE_ALL_PAGES); +} + #endif /* _ASM_POWERPC_ULTRAVISOR_H */ diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 18481b0e2788..04a7cba58eff 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -29,6 +29,8 @@ #include #include #include +#include +#include int default_machine_kexec_prepare(struct kimage *image) { @@ -327,6 +329,13 @@ void default_machine_kexec(struct kimage *image) #ifdef CONFIG_PPC_PSERIES kexec_paca.lppaca_ptr = NULL; #endif + + if (is_secure_guest() && !(image->preserve_context || + image->type == KEXEC_TYPE_CRASH)) { + uv_unshare_all_pages(); + printk("kexec: Unshared all shared pages.\n"); + } + paca_ptrs[kexec_paca.paca_index] = &kexec_paca; setup_paca(&kexec_paca); -- cgit v1.2.3 From 734560ac39aeb2516419c5878856df011f794a74 Mon Sep 17 00:00:00 2001 From: Ryan Grimm Date: Mon, 19 Aug 2019 23:13:21 -0300 Subject: powerpc/pseries/svm: Export guest SVM status to user space via sysfs User space might want to know it's running in a secure VM. It can't do a mfmsr because mfmsr is a privileged instruction. The solution here is to create a cpu attribute: /sys/devices/system/cpu/svm which will read 0 or 1 based on the S bit of the current CPU. Signed-off-by: Ryan Grimm Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190820021326.6884-12-bauerman@linux.ibm.com --- Documentation/ABI/testing/sysfs-devices-system-cpu | 10 ++++++++++ arch/powerpc/kernel/sysfs.c | 20 ++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 5f7d7b14fa44..06d0931119cc 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -562,3 +562,13 @@ Description: Umwait control or C0.2 state. The time is an unsigned 32-bit number. Note that a value of zero means there is no limit. Low order two bits must be zero. + +What: /sys/devices/system/cpu/svm +Date: August 2019 +Contact: Linux kernel mailing list + Linux for PowerPC mailing list +Description: Secure Virtual Machine + + If 1, it means the system is using the Protected Execution + Facility in POWER9 and newer processors. i.e., it is a Secure + Virtual Machine. diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index e2147d7c9e72..80a676da11cb 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "cacheinfo.h" #include "setup.h" @@ -715,6 +716,23 @@ static struct device_attribute pa6t_attrs[] = { #endif /* HAS_PPC_PMC_PA6T */ #endif /* HAS_PPC_PMC_CLASSIC */ +#ifdef CONFIG_PPC_SVM +static ssize_t show_svm(struct device *dev, struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", is_secure_guest()); +} +static DEVICE_ATTR(svm, 0444, show_svm, NULL); + +static void create_svm_file(void) +{ + device_create_file(cpu_subsys.dev_root, &dev_attr_svm); +} +#else +static void create_svm_file(void) +{ +} +#endif /* CONFIG_PPC_SVM */ + static int register_cpu_online(unsigned int cpu) { struct cpu *c = &per_cpu(cpu_devices, cpu); @@ -1058,6 +1076,8 @@ static int __init topology_init(void) sysfs_create_dscr_default(); #endif /* CONFIG_PPC64 */ + create_svm_file(); + return 0; } subsys_initcall(topology_init); -- cgit v1.2.3 From 4edaac512c8a31b0aa715b58ee00ee06df642fa2 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Mon, 19 Aug 2019 23:13:22 -0300 Subject: powerpc/pseries/svm: Disable doorbells in SVM guests Normally, the HV emulates some instructions like MSGSNDP, MSGCLRP from a KVM guest. To emulate the instructions, it must first read the instruction from the guest's memory and decode its parameters. However for a secure guest (aka SVM), the page containing the instruction is in secure memory and the HV cannot access directly. It would need the Ultravisor (UV) to facilitate accessing the instruction and parameters but the UV currently does not have the support for such accesses. Until the UV has such support, disable doorbells in SVMs. This might incur a performance hit but that is yet to be quantified. With this patch applied (needed only in SVMs not needed for HV) we are able to launch SVM guests with multi-core support. Eg: qemu -smp sockets=2,cores=2,threads=2. Fix suggested by Benjamin Herrenschmidt. Thanks to input from Paul Mackerras, Ram Pai and Michael Anderson. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190820021326.6884-13-bauerman@linux.ibm.com --- arch/powerpc/platforms/pseries/smp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 4b3ef8d9c63f..ad61e90032da 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "pseries.h" #include "offline_states.h" @@ -221,7 +222,7 @@ static __init void pSeries_smp_probe_xics(void) { xics_smp_probe(); - if (cpu_has_feature(CPU_FTR_DBELL)) + if (cpu_has_feature(CPU_FTR_DBELL) && !is_secure_guest()) smp_ops->cause_ipi = smp_pseries_cause_ipi; else smp_ops->cause_ipi = icp_ops->cause_ipi; -- cgit v1.2.3 From edea902c1c1efb855f77e041f9daf1abe7a9768a Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Mon, 19 Aug 2019 23:13:23 -0300 Subject: powerpc/pseries/iommu: Don't use dma_iommu_ops on secure guests Secure guest memory is inacessible to devices so regular DMA isn't possible. In that case set devices' dma_map_ops to NULL so that the generic DMA code path will use SWIOTLB to bounce buffers for DMA. Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190820021326.6884-14-bauerman@linux.ibm.com --- arch/powerpc/platforms/pseries/iommu.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index df7db33ca93b..6ba081dd61c9 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "pseries.h" @@ -1319,7 +1320,15 @@ void iommu_init_early_pSeries(void) of_reconfig_notifier_register(&iommu_reconfig_nb); register_memory_notifier(&iommu_mem_nb); - set_pci_dma_ops(&dma_iommu_ops); + /* + * Secure guest memory is inacessible to devices so regular DMA isn't + * possible. + * + * In that case keep devices' dma_map_ops as NULL so that the generic + * DMA code path will use SWIOTLB to bounce buffers for DMA. + */ + if (!is_secure_guest()) + set_pci_dma_ops(&dma_iommu_ops); } static int __init disable_multitce(char *str) -- cgit v1.2.3 From 2efbc58f157a39ad9e9199b92d9c47736023a2fe Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 19 Aug 2019 23:13:24 -0300 Subject: powerpc/pseries/svm: Force SWIOTLB for secure guests SWIOTLB checks range of incoming CPU addresses to be bounced and sees if the device can access it through its DMA window without requiring bouncing. In such cases it just chooses to skip bouncing. But for cases like secure guests on powerpc platform all addresses need to be bounced into the shared pool of memory because the host cannot access it otherwise. Hence the need to do the bouncing is not related to device's DMA window and use of bounce buffers is forced by setting swiotlb_force. Also, connect the shared memory conversion functions into the ARCH_HAS_MEM_ENCRYPT hooks and call swiotlb_update_mem_attributes() to convert SWIOTLB's memory pool to shared memory. Signed-off-by: Anshuman Khandual [ bauerman: Use ARCH_HAS_MEM_ENCRYPT hooks to share swiotlb memory pool. ] Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190820021326.6884-15-bauerman@linux.ibm.com --- arch/powerpc/include/asm/mem_encrypt.h | 26 ++++++++++++++++++++ arch/powerpc/platforms/pseries/Kconfig | 3 +++ arch/powerpc/platforms/pseries/svm.c | 45 ++++++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+) create mode 100644 arch/powerpc/include/asm/mem_encrypt.h diff --git a/arch/powerpc/include/asm/mem_encrypt.h b/arch/powerpc/include/asm/mem_encrypt.h new file mode 100644 index 000000000000..ba9dab07c1be --- /dev/null +++ b/arch/powerpc/include/asm/mem_encrypt.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * SVM helper functions + * + * Copyright 2018 IBM Corporation + */ + +#ifndef _ASM_POWERPC_MEM_ENCRYPT_H +#define _ASM_POWERPC_MEM_ENCRYPT_H + +#include + +static inline bool mem_encrypt_active(void) +{ + return is_secure_guest(); +} + +static inline bool force_dma_unencrypted(struct device *dev) +{ + return is_secure_guest(); +} + +int set_memory_encrypted(unsigned long addr, int numpages); +int set_memory_decrypted(unsigned long addr, int numpages); + +#endif /* _ASM_POWERPC_MEM_ENCRYPT_H */ diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index d09deb05bb66..9e35cddddf73 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -149,6 +149,9 @@ config PAPR_SCM config PPC_SVM bool "Secure virtual machine (SVM) support for POWER" depends on PPC_PSERIES + select SWIOTLB + select ARCH_HAS_MEM_ENCRYPT + select ARCH_HAS_FORCE_DMA_UNENCRYPTED help There are certain POWER platforms which support secure guests using the Protected Execution Facility, with the help of an Ultravisor diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c index 2b2b1a77ca1e..40c0637203d5 100644 --- a/arch/powerpc/platforms/pseries/svm.c +++ b/arch/powerpc/platforms/pseries/svm.c @@ -7,8 +7,53 @@ */ #include +#include +#include +#include #include +static int __init init_svm(void) +{ + if (!is_secure_guest()) + return 0; + + /* Don't release the SWIOTLB buffer. */ + ppc_swiotlb_enable = 1; + + /* + * Since the guest memory is inaccessible to the host, devices always + * need to use the SWIOTLB buffer for DMA even if dma_capable() says + * otherwise. + */ + swiotlb_force = SWIOTLB_FORCE; + + /* Share the SWIOTLB buffer with the host. */ + swiotlb_update_mem_attributes(); + + return 0; +} +machine_early_initcall(pseries, init_svm); + +int set_memory_encrypted(unsigned long addr, int numpages) +{ + if (!PAGE_ALIGNED(addr)) + return -EINVAL; + + uv_unshare_page(PHYS_PFN(__pa(addr)), numpages); + + return 0; +} + +int set_memory_decrypted(unsigned long addr, int numpages) +{ + if (!PAGE_ALIGNED(addr)) + return -EINVAL; + + uv_share_page(PHYS_PFN(__pa(addr)), numpages); + + return 0; +} + /* There's one dispatch log per CPU. */ #define NR_DTL_PAGE (DISPATCH_LOG_BYTES * CONFIG_NR_CPUS / PAGE_SIZE) -- cgit v1.2.3 From bf75a8db72e9454aadeca0276f6e6b0e9161861c Mon Sep 17 00:00:00 2001 From: Ryan Grimm Date: Mon, 19 Aug 2019 23:13:26 -0300 Subject: powerpc/configs: Enable secure guest support in pseries and ppc64 defconfigs Enables running as a secure guest in platforms with an Ultravisor. Signed-off-by: Ryan Grimm Signed-off-by: Ram Pai Signed-off-by: Thiago Jung Bauermann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190820021326.6884-17-bauerman@linux.ibm.com --- arch/powerpc/configs/ppc64_defconfig | 1 + arch/powerpc/configs/pseries_defconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index dc83fefa04f7..b250e6f5a7ca 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -29,6 +29,7 @@ CONFIG_DTL=y CONFIG_SCANLOG=m CONFIG_PPC_SMLPAR=y CONFIG_IBMEBUS=y +CONFIG_PPC_SVM=y CONFIG_PPC_MAPLE=y CONFIG_PPC_PASEMI=y CONFIG_PPC_PASEMI_IOMMU=y diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index 38abc9c1770a..26126b4d4de3 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -42,6 +42,7 @@ CONFIG_DTL=y CONFIG_SCANLOG=m CONFIG_PPC_SMLPAR=y CONFIG_IBMEBUS=y +CONFIG_PPC_SVM=y # CONFIG_PPC_PMAC is not set CONFIG_RTAS_FLASH=m CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y -- cgit v1.2.3 From 0be9f7fd5d8fd984b34ad98838ef7cfd0079ddae Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:26 +1000 Subject: powerpc/64s/exception: machine check fwnmi remove HV case fwnmi does not trigger in HV mode, so remove always-true feature test. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-2-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 520804351601..515ea0243ff2 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1026,9 +1026,8 @@ TRAMP_REAL_BEGIN(machine_check_pSeries) .globl machine_check_fwnmi machine_check_fwnmi: EXCEPTION_PROLOG_0 PACA_EXMC -BEGIN_FTR_SECTION b machine_check_common_early -END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) + machine_check_pSeries_0: EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0 /* -- cgit v1.2.3 From 1039f62431e2aa16487cd6d64bc841d71f6465b8 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:27 +1000 Subject: powerpc/64s/exception: machine check remove bitrotted comment Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-3-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 515ea0243ff2..8cf4e44d2d76 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -933,10 +933,6 @@ EXC_COMMON_BEGIN(system_reset_common) EXC_REAL_BEGIN(machine_check, 0x200, 0x100) - /* This is moved out of line as it can be patched by FW, but - * some code path might still want to branch into the original - * vector - */ EXCEPTION_PROLOG_0 PACA_EXMC BEGIN_FTR_SECTION b machine_check_common_early -- cgit v1.2.3 From 19dbe673e62b076f00ae2841fcf5898b4728d5ab Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:28 +1000 Subject: powerpc/64s/exception: machine check fix KVM guest test The machine_check_handle_early hypervisor guest test is skipped if !HVMODE or MSR[HV]=0, which is wrong for PR or nested hypervisors that could be running a guest in this state. Test HSTATE_IN_GUEST up front and use that to branch out to the KVM handler, then MSR[PR] alone can test for this kernel's userspace. This matches all other interrupt handling. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-4-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 8cf4e44d2d76..7c28c22fc6a6 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1108,11 +1108,8 @@ EXC_COMMON_BEGIN(machine_check_handle_early) bl machine_check_early std r3,RESULT(r1) /* Save result */ ld r12,_MSR(r1) -BEGIN_FTR_SECTION - b 4f -END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) -#ifdef CONFIG_PPC_P7_NAP +#ifdef CONFIG_PPC_P7_NAP /* * Check if thread was in power saving mode. We come here when any * of the following is true: @@ -1128,30 +1125,26 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #endif - /* - * Check if we are coming from hypervisor userspace. If yes then we - * continue in host kernel in V mode to deliver the MC event. - */ - rldicl. r11,r12,4,63 /* See if MC hit while in HV mode. */ - beq 5f -4: andi. r11,r12,MSR_PR /* See if coming from user. */ - bne 9f /* continue in V mode if we are. */ - -5: #ifdef CONFIG_KVM_BOOK3S_64_HANDLER -BEGIN_FTR_SECTION /* - * We are coming from kernel context. Check if we are coming from - * guest. if yes, then we can continue. We will fall through - * do_kvm_200->kvmppc_interrupt to deliver the MC event to guest. + * Check if we are coming from guest. If yes, then run the normal + * exception handler which will take the do_kvm_200->kvmppc_interrupt + * branch to deliver the MC event to guest. */ lbz r11,HSTATE_IN_GUEST(r13) cmpwi r11,0 /* Check if coming from guest */ bne 9f /* continue if we are. */ -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) #endif + + /* + * Check if we are coming from userspace. If yes, then run the normal + * exception handler which will deliver the MC event to this kernel. + */ + andi. r11,r12,MSR_PR /* See if coming from user. */ + bne 9f /* continue in V mode if we are. */ + /* - * At this point we are not sure about what context we come from. + * At this point we are coming from kernel context. * Queue up the MCE event and return from the interrupt. * But before that, check if this is an un-recoverable exception. * If yes, then stay on emergency stack and panic. -- cgit v1.2.3 From fe9d482b1d87c76441492e51d866cee652eee4d5 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:29 +1000 Subject: powerpc/64s/exception: machine check adjust RFI target The host kernel delivery case for powernv does RFI_TO_USER_OR_KERNEL, but should just use RFI_TO_KERNEL which makes it clear this is not a user case. This is not a bug because RFI_TO_USER_OR_KERNEL deals with kernel returns just fine. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-5-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7c28c22fc6a6..9a7cc3edc721 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1184,7 +1184,7 @@ BEGIN_FTR_SECTION */ bl machine_check_queue_event MACHINE_CHECK_HANDLER_WINDUP - RFI_TO_USER_OR_KERNEL + RFI_TO_KERNEL FTR_SECTION_ELSE /* * pSeries: Return from MC interrupt. Before that stay on emergency -- cgit v1.2.3 From b5c27f7c5679c3726148fd25ad220b4560d210cf Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:30 +1000 Subject: powerpc/64s/exception: machine check pseries should always run the early handler Now that pseries with fwnmi registered runs the early machine check handler, there is no good reason to special case the non-fwnmi case and skip the early handler. Reducing the code and number of paths is a top priority for asm code, it's better to handle this in C where possible (and the pseries early handler is a no-op if fwnmi is not registered). Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-6-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 9a7cc3edc721..73be4f9027de 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -934,11 +934,7 @@ EXC_COMMON_BEGIN(system_reset_common) EXC_REAL_BEGIN(machine_check, 0x200, 0x100) EXCEPTION_PROLOG_0 PACA_EXMC -BEGIN_FTR_SECTION b machine_check_common_early -FTR_SECTION_ELSE - b machine_check_pSeries_0 -ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) EXC_REAL_END(machine_check, 0x200, 0x100) EXC_VIRT_NONE(0x4200, 0x100) TRAMP_REAL_BEGIN(machine_check_common_early) -- cgit v1.2.3 From fa2760eca504f554a5adb6cd2f576828933c4c7b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:31 +1000 Subject: powerpc/64s/exception: machine check remove machine_check_pSeries_0 branch This label has only one caller, so unwind the branch and move it inline. The location of the comment is adjusted to match similar one in system reset. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-7-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 73be4f9027de..577d93a83d05 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1014,20 +1014,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) b 1b b . /* prevent speculative execution */ -TRAMP_REAL_BEGIN(machine_check_pSeries) - .globl machine_check_fwnmi -machine_check_fwnmi: +#ifdef CONFIG_PPC_PSERIES +TRAMP_REAL_BEGIN(machine_check_fwnmi) EXCEPTION_PROLOG_0 PACA_EXMC b machine_check_common_early - -machine_check_pSeries_0: - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0 - /* - * MSR_RI is not enabled, because PACA_EXMC is being used, so a - * nested machine check corrupts it. machine_check_common enables - * MSR_RI. - */ - EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0 +#endif TRAMP_KVM_SKIP(PACA_EXMC, 0x200) @@ -1197,7 +1188,13 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) /* Deliver the machine check to host kernel in V mode. */ MACHINE_CHECK_HANDLER_WINDUP EXCEPTION_PROLOG_0 PACA_EXMC - b machine_check_pSeries_0 + EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0 + EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0 + /* + * MSR_RI is not enabled, because PACA_EXMC is being used, so a + * nested machine check corrupts it. machine_check_common enables + * MSR_RI. + */ EXC_COMMON_BEGIN(unrecover_mce) /* Invoke machine_check_exception to print MCE event and panic. */ -- cgit v1.2.3 From 0b66370c61fcf5fcc1d6901013e110284da6e2bb Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:32 +1000 Subject: powerpc/64s/exception: machine check use correct cfar for late handler Bare metal machine checks run an "early" handler in real mode before running the main handler which reports the event. The main handler runs exactly as a normal interrupt handler, after the "windup" which sets registers back as they were at interrupt entry. CFAR does not get restored by the windup code, so that will be wrong when the handler is run. Restore the CFAR to the saved value before running the late handler. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-8-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 577d93a83d05..b7c4149cb91c 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1186,6 +1186,10 @@ FTR_SECTION_ELSE ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) 9: /* Deliver the machine check to host kernel in V mode. */ +BEGIN_FTR_SECTION + ld r10,ORIG_GPR3(r1) + mtspr SPRN_CFAR,r10 +END_FTR_SECTION_IFSET(CPU_FTR_CFAR) MACHINE_CHECK_HANDLER_WINDUP EXCEPTION_PROLOG_0 PACA_EXMC EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0 -- cgit v1.2.3 From 7290f3b3d3e66b54720f23079ffc60e0b7bbb0cc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:33 +1000 Subject: powerpc/64s/powernv: machine check dump SLB contents Re-use the code introduced in pseries to save and dump the contents of the SLB in the case of an SLB involved machine check exception. This patch also avoids allocating the SLB save array on pseries radix. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-9-npiggin@gmail.com --- arch/powerpc/kernel/mce.c | 6 ++++++ arch/powerpc/kernel/mce_power.c | 4 ++++ arch/powerpc/platforms/powernv/setup.c | 9 +++++++++ arch/powerpc/platforms/pseries/setup.c | 24 +++++++++++++----------- 4 files changed, 32 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index ec4b3e1087be..04280a5871fc 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -511,6 +511,12 @@ void machine_check_print_event_info(struct machine_check_event *evt, subtype = evt->error_class < ARRAY_SIZE(mc_error_class) ? mc_error_class[evt->error_class] : "Unknown"; printk("%sMCE: CPU%d: %s\n", level, evt->cpu, subtype); + +#ifdef CONFIG_PPC_BOOK3S_64 + /* Display faulty slb contents for SLB errors. */ + if (evt->error_type == MCE_ERROR_TYPE_SLB) + slb_dump_contents(local_paca->mce_faulty_slbs); +#endif } EXPORT_SYMBOL_GPL(machine_check_print_event_info); diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index b6cbe3449358..356e7b99f661 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -405,6 +405,8 @@ static int mce_handle_ierror(struct pt_regs *regs, /* attempt to correct the error */ switch (table[i].error_type) { case MCE_ERROR_TYPE_SLB: + if (local_paca->in_mce == 1) + slb_save_contents(local_paca->mce_faulty_slbs); handled = mce_flush(MCE_FLUSH_SLB); break; case MCE_ERROR_TYPE_ERAT: @@ -490,6 +492,8 @@ static int mce_handle_derror(struct pt_regs *regs, /* attempt to correct the error */ switch (table[i].error_type) { case MCE_ERROR_TYPE_SLB: + if (local_paca->in_mce == 1) + slb_save_contents(local_paca->mce_faulty_slbs); if (mce_flush(MCE_FLUSH_SLB)) handled = 1; break; diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index a5e52f9eed3c..83498604d322 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -166,6 +167,14 @@ static void __init pnv_init(void) else #endif add_preferred_console("hvc", 0, NULL); + + if (!radix_enabled()) { + int i; + + /* Allocate per cpu area to save old slb contents during MCE */ + for_each_possible_cpu(i) + paca_ptrs[i]->mce_faulty_slbs = memblock_alloc_node(mmu_slb_size, __alignof__(*paca_ptrs[i]->mce_faulty_slbs), cpu_to_node(i)); + } } static void __init pnv_init_IRQ(void) diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index d8930c3a8a11..b955d54628ff 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -142,17 +142,19 @@ static void __init fwnmi_init(void) } #ifdef CONFIG_PPC_BOOK3S_64 - /* Allocate per cpu slb area to save old slb contents during MCE */ - size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus; - slb_ptr = memblock_alloc_try_nid_raw(size, sizeof(struct slb_entry), - MEMBLOCK_LOW_LIMIT, ppc64_rma_size, - NUMA_NO_NODE); - if (!slb_ptr) - panic("Failed to allocate %zu bytes below %pa for slb area\n", - size, &ppc64_rma_size); - - for_each_possible_cpu(i) - paca_ptrs[i]->mce_faulty_slbs = slb_ptr + (mmu_slb_size * i); + if (!radix_enabled()) { + /* Allocate per cpu area to save old slb contents during MCE */ + size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus; + slb_ptr = memblock_alloc_try_nid_raw(size, + sizeof(struct slb_entry), MEMBLOCK_LOW_LIMIT, + ppc64_rma_size, NUMA_NO_NODE); + if (!slb_ptr) + panic("Failed to allocate %zu bytes below %pa for slb area\n", + size, &ppc64_rma_size); + + for_each_possible_cpu(i) + paca_ptrs[i]->mce_faulty_slbs = slb_ptr + (mmu_slb_size * i); + } #endif } -- cgit v1.2.3 From 9ca766f9891d23743b4e1a7b1cafdc63723cd6a7 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:34 +1000 Subject: powerpc/64s/pseries: machine check convert to use common event code The common machine_check_event data structures and queues are mostly platform independent, with powernv decoding SRR1/DSISR/etc., into machine_check_event objects. This patch converts pseries to use this infrastructure by decoding fwnmi/rtas data into machine_check_event objects. This allows queueing to be used by a subsequent change to delay the virtual mode handling of machine checks that occur in kernel space where it is unsafe to switch immediately to virtual mode, similarly to powernv. Signed-off-by: Nicholas Piggin [mpe: Fix implicit fallthrough warnings in mce_handle_error()] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-10-npiggin@gmail.com --- arch/powerpc/include/asm/mce.h | 6 + arch/powerpc/kernel/mce.c | 34 ++- arch/powerpc/platforms/pseries/ras.c | 460 +++++++++++++++-------------------- 3 files changed, 233 insertions(+), 267 deletions(-) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 19a33707d5ef..6a6ddaabdb34 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -30,6 +30,10 @@ enum MCE_Disposition { enum MCE_Initiator { MCE_INITIATOR_UNKNOWN = 0, MCE_INITIATOR_CPU = 1, + MCE_INITIATOR_PCI = 2, + MCE_INITIATOR_ISA = 3, + MCE_INITIATOR_MEMORY= 4, + MCE_INITIATOR_POWERMGM = 5, }; enum MCE_ErrorType { @@ -41,6 +45,8 @@ enum MCE_ErrorType { MCE_ERROR_TYPE_USER = 5, MCE_ERROR_TYPE_RA = 6, MCE_ERROR_TYPE_LINK = 7, + MCE_ERROR_TYPE_DCACHE = 8, + MCE_ERROR_TYPE_ICACHE = 9, }; enum MCE_ErrorClass { diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 04280a5871fc..34c1001e9e8b 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -325,7 +325,7 @@ static void machine_check_process_queued_event(struct irq_work *work) void machine_check_print_event_info(struct machine_check_event *evt, bool user_mode, bool in_guest) { - const char *level, *sevstr, *subtype, *err_type; + const char *level, *sevstr, *subtype, *err_type, *initiator; uint64_t ea = 0, pa = 0; int n = 0; char dar_str[50]; @@ -410,6 +410,28 @@ void machine_check_print_event_info(struct machine_check_event *evt, break; } + switch(evt->initiator) { + case MCE_INITIATOR_CPU: + initiator = "CPU"; + break; + case MCE_INITIATOR_PCI: + initiator = "PCI"; + break; + case MCE_INITIATOR_ISA: + initiator = "ISA"; + break; + case MCE_INITIATOR_MEMORY: + initiator = "Memory"; + break; + case MCE_INITIATOR_POWERMGM: + initiator = "Power Management"; + break; + case MCE_INITIATOR_UNKNOWN: + default: + initiator = "Unknown"; + break; + } + switch (evt->error_type) { case MCE_ERROR_TYPE_UE: err_type = "UE"; @@ -476,6 +498,14 @@ void machine_check_print_event_info(struct machine_check_event *evt, if (evt->u.link_error.effective_address_provided) ea = evt->u.link_error.effective_address; break; + case MCE_ERROR_TYPE_DCACHE: + err_type = "D-Cache"; + subtype = "Unknown"; + break; + case MCE_ERROR_TYPE_ICACHE: + err_type = "I-Cache"; + subtype = "Unknown"; + break; default: case MCE_ERROR_TYPE_UNKNOWN: err_type = "Unknown"; @@ -508,6 +538,8 @@ void machine_check_print_event_info(struct machine_check_event *evt, level, evt->cpu, evt->srr0, (void *)evt->srr0, pa_str); } + printk("%sMCE: CPU%d: Initiator %s\n", level, evt->cpu, initiator); + subtype = evt->error_class < ARRAY_SIZE(mc_error_class) ? mc_error_class[evt->error_class] : "Unknown"; printk("%sMCE: CPU%d: %s\n", level, evt->cpu, subtype); diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index f16fdd0f71f7..3acdcc3bb908 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -76,6 +76,7 @@ struct pseries_mc_errorlog { #define MC_ERROR_TYPE_UE 0x00 #define MC_ERROR_TYPE_SLB 0x01 #define MC_ERROR_TYPE_ERAT 0x02 +#define MC_ERROR_TYPE_UNKNOWN 0x03 #define MC_ERROR_TYPE_TLB 0x04 #define MC_ERROR_TYPE_D_CACHE 0x05 #define MC_ERROR_TYPE_I_CACHE 0x07 @@ -87,6 +88,9 @@ struct pseries_mc_errorlog { #define MC_ERROR_UE_LOAD_STORE 3 #define MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE 4 +#define UE_EFFECTIVE_ADDR_PROVIDED 0x40 +#define UE_LOGICAL_ADDR_PROVIDED 0x20 + #define MC_ERROR_SLB_PARITY 0 #define MC_ERROR_SLB_MULTIHIT 1 #define MC_ERROR_SLB_INDETERMINATE 2 @@ -113,27 +117,6 @@ static inline u8 rtas_mc_error_sub_type(const struct pseries_mc_errorlog *mlog) } } -static -inline u64 rtas_mc_get_effective_addr(const struct pseries_mc_errorlog *mlog) -{ - __be64 addr = 0; - - switch (mlog->error_type) { - case MC_ERROR_TYPE_UE: - if (mlog->sub_err_type & 0x40) - addr = mlog->effective_address; - break; - case MC_ERROR_TYPE_SLB: - case MC_ERROR_TYPE_ERAT: - case MC_ERROR_TYPE_TLB: - if (mlog->sub_err_type & 0x80) - addr = mlog->effective_address; - default: - break; - } - return be64_to_cpu(addr); -} - /* * Enable the hotplug interrupt late because processing them may touch other * devices or systems (e.g. hugepages) that have not been initialized at the @@ -511,160 +494,165 @@ int pSeries_system_reset_exception(struct pt_regs *regs) return 0; /* need to perform reset */ } -#define VAL_TO_STRING(ar, val) \ - (((val) < ARRAY_SIZE(ar)) ? ar[(val)] : "Unknown") -static void pseries_print_mce_info(struct pt_regs *regs, - struct rtas_error_log *errp) +static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) { - const char *level, *sevstr; + struct mce_error_info mce_err = { 0 }; + unsigned long eaddr = 0, paddr = 0; struct pseries_errorlog *pseries_log; struct pseries_mc_errorlog *mce_log; - u8 error_type, err_sub_type; - u64 addr; - u8 initiator = rtas_error_initiator(errp); int disposition = rtas_error_disposition(errp); + int initiator = rtas_error_initiator(errp); + int severity = rtas_error_severity(errp); + u8 error_type, err_sub_type; - static const char * const initiators[] = { - [0] = "Unknown", - [1] = "CPU", - [2] = "PCI", - [3] = "ISA", - [4] = "Memory", - [5] = "Power Mgmt", - }; - static const char * const mc_err_types[] = { - [0] = "UE", - [1] = "SLB", - [2] = "ERAT", - [3] = "Unknown", - [4] = "TLB", - [5] = "D-Cache", - [6] = "Unknown", - [7] = "I-Cache", - }; - static const char * const mc_ue_types[] = { - [0] = "Indeterminate", - [1] = "Instruction fetch", - [2] = "Page table walk ifetch", - [3] = "Load/Store", - [4] = "Page table walk Load/Store", - }; - - /* SLB sub errors valid values are 0x0, 0x1, 0x2 */ - static const char * const mc_slb_types[] = { - [0] = "Parity", - [1] = "Multihit", - [2] = "Indeterminate", - }; - - /* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */ - static const char * const mc_soft_types[] = { - [0] = "Unknown", - [1] = "Parity", - [2] = "Multihit", - [3] = "Indeterminate", - }; - - if (!rtas_error_extended(errp)) { - pr_err("Machine check interrupt: Missing extended error log\n"); - return; - } + if (initiator == RTAS_INITIATOR_UNKNOWN) + mce_err.initiator = MCE_INITIATOR_UNKNOWN; + else if (initiator == RTAS_INITIATOR_CPU) + mce_err.initiator = MCE_INITIATOR_CPU; + else if (initiator == RTAS_INITIATOR_PCI) + mce_err.initiator = MCE_INITIATOR_PCI; + else if (initiator == RTAS_INITIATOR_ISA) + mce_err.initiator = MCE_INITIATOR_ISA; + else if (initiator == RTAS_INITIATOR_MEMORY) + mce_err.initiator = MCE_INITIATOR_MEMORY; + else if (initiator == RTAS_INITIATOR_POWERMGM) + mce_err.initiator = MCE_INITIATOR_POWERMGM; + else + mce_err.initiator = MCE_INITIATOR_UNKNOWN; + + if (severity == RTAS_SEVERITY_NO_ERROR) + mce_err.severity = MCE_SEV_NO_ERROR; + else if (severity == RTAS_SEVERITY_EVENT) + mce_err.severity = MCE_SEV_WARNING; + else if (severity == RTAS_SEVERITY_WARNING) + mce_err.severity = MCE_SEV_WARNING; + else if (severity == RTAS_SEVERITY_ERROR_SYNC) + mce_err.severity = MCE_SEV_SEVERE; + else if (severity == RTAS_SEVERITY_ERROR) + mce_err.severity = MCE_SEV_SEVERE; + else if (severity == RTAS_SEVERITY_FATAL) + mce_err.severity = MCE_SEV_FATAL; + else + mce_err.severity = MCE_SEV_FATAL; + + if (severity <= RTAS_SEVERITY_ERROR_SYNC) + mce_err.sync_error = true; + else + mce_err.sync_error = false; + + mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; + mce_err.error_class = MCE_ECLASS_UNKNOWN; + + if (!rtas_error_extended(errp)) + goto out; pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); if (pseries_log == NULL) - return; + goto out; mce_log = (struct pseries_mc_errorlog *)pseries_log->data; - error_type = mce_log->error_type; err_sub_type = rtas_mc_error_sub_type(mce_log); - switch (rtas_error_severity(errp)) { - case RTAS_SEVERITY_NO_ERROR: - level = KERN_INFO; - sevstr = "Harmless"; - break; - case RTAS_SEVERITY_WARNING: - level = KERN_WARNING; - sevstr = ""; - break; - case RTAS_SEVERITY_ERROR: - case RTAS_SEVERITY_ERROR_SYNC: - level = KERN_ERR; - sevstr = "Severe"; - break; - case RTAS_SEVERITY_FATAL: - default: - level = KERN_ERR; - sevstr = "Fatal"; - break; - } + switch (mce_log->error_type) { + case MC_ERROR_TYPE_UE: + mce_err.error_type = MCE_ERROR_TYPE_UE; + switch (err_sub_type) { + case MC_ERROR_UE_IFETCH: + mce_err.u.ue_error_type = MCE_UE_ERROR_IFETCH; + break; + case MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH: + mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH; + break; + case MC_ERROR_UE_LOAD_STORE: + mce_err.u.ue_error_type = MCE_UE_ERROR_LOAD_STORE; + break; + case MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE: + mce_err.u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE; + break; + case MC_ERROR_UE_INDETERMINATE: + default: + mce_err.u.ue_error_type = MCE_UE_ERROR_INDETERMINATE; + break; + } + if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) + eaddr = be64_to_cpu(mce_log->effective_address); -#ifdef CONFIG_PPC_BOOK3S_64 - /* Display faulty slb contents for SLB errors. */ - if (error_type == MC_ERROR_TYPE_SLB) - slb_dump_contents(local_paca->mce_faulty_slbs); -#endif + if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) { + paddr = be64_to_cpu(mce_log->logical_address); + } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) { + unsigned long pfn; - printk("%s%s Machine check interrupt [%s]\n", level, sevstr, - disposition == RTAS_DISP_FULLY_RECOVERED ? - "Recovered" : "Not recovered"); - if (user_mode(regs)) { - printk("%s NIP: [%016lx] PID: %d Comm: %s\n", level, - regs->nip, current->pid, current->comm); - } else { - printk("%s NIP [%016lx]: %pS\n", level, regs->nip, - (void *)regs->nip); - } - printk("%s Initiator: %s\n", level, - VAL_TO_STRING(initiators, initiator)); + pfn = addr_to_pfn(regs, eaddr); + if (pfn != ULONG_MAX) + paddr = pfn << PAGE_SHIFT; + } - switch (error_type) { - case MC_ERROR_TYPE_UE: - printk("%s Error type: %s [%s]\n", level, - VAL_TO_STRING(mc_err_types, error_type), - VAL_TO_STRING(mc_ue_types, err_sub_type)); break; case MC_ERROR_TYPE_SLB: - printk("%s Error type: %s [%s]\n", level, - VAL_TO_STRING(mc_err_types, error_type), - VAL_TO_STRING(mc_slb_types, err_sub_type)); + mce_err.error_type = MCE_ERROR_TYPE_SLB; + switch (err_sub_type) { + case MC_ERROR_SLB_PARITY: + mce_err.u.slb_error_type = MCE_SLB_ERROR_PARITY; + break; + case MC_ERROR_SLB_MULTIHIT: + mce_err.u.slb_error_type = MCE_SLB_ERROR_MULTIHIT; + break; + case MC_ERROR_SLB_INDETERMINATE: + default: + mce_err.u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE; + break; + } + if (mce_log->sub_err_type & 0x80) + eaddr = be64_to_cpu(mce_log->effective_address); break; case MC_ERROR_TYPE_ERAT: + mce_err.error_type = MCE_ERROR_TYPE_ERAT; + switch (err_sub_type) { + case MC_ERROR_ERAT_PARITY: + mce_err.u.erat_error_type = MCE_ERAT_ERROR_PARITY; + break; + case MC_ERROR_ERAT_MULTIHIT: + mce_err.u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; + break; + case MC_ERROR_ERAT_INDETERMINATE: + default: + mce_err.u.erat_error_type = MCE_ERAT_ERROR_INDETERMINATE; + break; + } + if (mce_log->sub_err_type & 0x80) + eaddr = be64_to_cpu(mce_log->effective_address); + break; case MC_ERROR_TYPE_TLB: - printk("%s Error type: %s [%s]\n", level, - VAL_TO_STRING(mc_err_types, error_type), - VAL_TO_STRING(mc_soft_types, err_sub_type)); + mce_err.error_type = MCE_ERROR_TYPE_TLB; + switch (err_sub_type) { + case MC_ERROR_TLB_PARITY: + mce_err.u.tlb_error_type = MCE_TLB_ERROR_PARITY; + break; + case MC_ERROR_TLB_MULTIHIT: + mce_err.u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT; + break; + case MC_ERROR_TLB_INDETERMINATE: + default: + mce_err.u.tlb_error_type = MCE_TLB_ERROR_INDETERMINATE; + break; + } + if (mce_log->sub_err_type & 0x80) + eaddr = be64_to_cpu(mce_log->effective_address); + break; + case MC_ERROR_TYPE_D_CACHE: + mce_err.error_type = MCE_ERROR_TYPE_DCACHE; break; + case MC_ERROR_TYPE_I_CACHE: + mce_err.error_type = MCE_ERROR_TYPE_DCACHE; + break; + case MC_ERROR_TYPE_UNKNOWN: default: - printk("%s Error type: %s\n", level, - VAL_TO_STRING(mc_err_types, error_type)); + mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN; break; } - addr = rtas_mc_get_effective_addr(mce_log); - if (addr) - printk("%s Effective address: %016llx\n", level, addr); -} - -static int mce_handle_error(struct rtas_error_log *errp) -{ - struct pseries_errorlog *pseries_log; - struct pseries_mc_errorlog *mce_log; - int disposition = rtas_error_disposition(errp); - u8 error_type; - - if (!rtas_error_extended(errp)) - goto out; - - pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); - if (pseries_log == NULL) - goto out; - - mce_log = (struct pseries_mc_errorlog *)pseries_log->data; - error_type = mce_log->error_type; - #ifdef CONFIG_PPC_BOOK3S_64 if (disposition == RTAS_DISP_NOT_RECOVERED) { switch (error_type) { @@ -682,98 +670,24 @@ static int mce_handle_error(struct rtas_error_log *errp) slb_save_contents(local_paca->mce_faulty_slbs); flush_and_reload_slb(); disposition = RTAS_DISP_FULLY_RECOVERED; - rtas_set_disposition_recovered(errp); break; default: break; } + } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { + /* Platform corrected itself but could be degraded */ + printk(KERN_ERR "MCE: limited recovery, system may " + "be degraded\n"); + disposition = RTAS_DISP_FULLY_RECOVERED; } #endif out: - return disposition; -} - -#ifdef CONFIG_MEMORY_FAILURE - -static DEFINE_PER_CPU(int, rtas_ue_count); -static DEFINE_PER_CPU(unsigned long, rtas_ue_paddr[MAX_MC_EVT]); + save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED, + &mce_err, regs->nip, eaddr, paddr); -#define UE_EFFECTIVE_ADDR_PROVIDED 0x40 -#define UE_LOGICAL_ADDR_PROVIDED 0x20 - - -static void pseries_hwpoison_work_fn(struct work_struct *work) -{ - unsigned long paddr; - int index; - - while (__this_cpu_read(rtas_ue_count) > 0) { - index = __this_cpu_read(rtas_ue_count) - 1; - paddr = __this_cpu_read(rtas_ue_paddr[index]); - memory_failure(paddr >> PAGE_SHIFT, 0); - __this_cpu_dec(rtas_ue_count); - } -} - -static DECLARE_WORK(hwpoison_work, pseries_hwpoison_work_fn); - -static void queue_ue_paddr(unsigned long paddr) -{ - int index; - - index = __this_cpu_inc_return(rtas_ue_count) - 1; - if (index >= MAX_MC_EVT) { - __this_cpu_dec(rtas_ue_count); - return; - } - this_cpu_write(rtas_ue_paddr[index], paddr); - schedule_work(&hwpoison_work); -} - -static void pseries_do_memory_failure(struct pt_regs *regs, - struct pseries_mc_errorlog *mce_log) -{ - unsigned long paddr; - - if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) { - paddr = be64_to_cpu(mce_log->logical_address); - } else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) { - unsigned long pfn; - - pfn = addr_to_pfn(regs, - be64_to_cpu(mce_log->effective_address)); - if (pfn == ULONG_MAX) - return; - paddr = pfn << PAGE_SHIFT; - } else { - return; - } - queue_ue_paddr(paddr); -} - -static void pseries_process_ue(struct pt_regs *regs, - struct rtas_error_log *errp) -{ - struct pseries_errorlog *pseries_log; - struct pseries_mc_errorlog *mce_log; - - if (!rtas_error_extended(errp)) - return; - - pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); - if (!pseries_log) - return; - - mce_log = (struct pseries_mc_errorlog *)pseries_log->data; - - if (mce_log->error_type == MC_ERROR_TYPE_UE) - pseries_do_memory_failure(regs, mce_log); + return disposition; } -#else -static inline void pseries_process_ue(struct pt_regs *regs, - struct rtas_error_log *errp) { } -#endif /*CONFIG_MEMORY_FAILURE */ /* * Process MCE rtas errlog event. @@ -795,49 +709,51 @@ static void mce_process_errlog_event(struct irq_work *work) * Return 1 if corrected (or delivered a signal). * Return 0 if there is nothing we can do. */ -static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) +static int recover_mce(struct pt_regs *regs, struct machine_check_event *evt) { int recovered = 0; - int disposition = rtas_error_disposition(err); - - pseries_print_mce_info(regs, err); if (!(regs->msr & MSR_RI)) { /* If MSR_RI isn't set, we cannot recover */ pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); recovered = 0; - - } else if (disposition == RTAS_DISP_FULLY_RECOVERED) { + } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) { /* Platform corrected itself */ recovered = 1; + } else if (evt->severity == MCE_SEV_FATAL) { + /* Fatal machine check */ + pr_err("Machine check interrupt is fatal\n"); + recovered = 0; + } - } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { - /* Platform corrected itself but could be degraded */ - printk(KERN_ERR "MCE: limited recovery, system may " - "be degraded\n"); - recovered = 1; - - } else if (user_mode(regs) && !is_global_init(current) && - rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) { - + if (!recovered && evt->sync_error) { /* - * If we received a synchronous error when in userspace - * kill the task. Firmware may report details of the fail - * asynchronously, so we can't rely on the target and type - * fields being valid here. + * Try to kill processes if we get a synchronous machine check + * (e.g., one caused by execution of this instruction). This + * will devolve into a panic if we try to kill init or are in + * an interrupt etc. + * + * TODO: Queue up this address for hwpoisioning later. + * TODO: This is not quite right for d-side machine + * checks ->nip is not necessarily the important + * address. */ - printk(KERN_ERR "MCE: uncorrectable error, killing task " - "%s:%d\n", current->comm, current->pid); - - _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); - recovered = 1; + if ((user_mode(regs))) { + _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); + recovered = 1; + } else if (die_will_crash()) { + /* + * die() would kill the kernel, so better to go via + * the platform reboot code that will log the + * machine check. + */ + recovered = 0; + } else { + die("Machine check", regs, SIGBUS); + recovered = 1; + } } - pseries_process_ue(regs, err); - - /* Queue irq work to log this rtas event later. */ - irq_work_queue(&mce_errlog_process_work); - return recovered; } @@ -853,14 +769,21 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) */ int pSeries_machine_check_exception(struct pt_regs *regs) { - struct rtas_error_log *errp; + struct machine_check_event evt; - if (fwnmi_active) { - fwnmi_release_errinfo(); - errp = fwnmi_get_errlog(); - if (errp && recover_mce(regs, errp)) - return 1; + if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) + return 0; + + /* Print things out */ + if (evt.version != MCE_V1) { + pr_err("Machine Check Exception, Unknown event version %d !\n", + evt.version); + return 0; } + machine_check_print_event_info(&evt, user_mode(regs), false); + + if (recover_mce(regs, &evt)) + return 1; return 0; } @@ -877,7 +800,12 @@ long pseries_machine_check_realmode(struct pt_regs *regs) * to panic. Hence we will call it as soon as we go into * virtual mode. */ - disposition = mce_handle_error(errp); + disposition = mce_handle_error(regs, errp); + fwnmi_release_errinfo(); + + /* Queue irq work to log this rtas event later. */ + irq_work_queue(&mce_errlog_process_work); + if (disposition == RTAS_DISP_FULLY_RECOVERED) return 1; } -- cgit v1.2.3 From 272f636445cf556498c8840dc63ad1218e94391b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:35 +1000 Subject: powerpc/64s/exception: machine check pseries should skip the late handler for kernel MCEs The powernv machine check handler copes with taking a MCE from one of three contexts, guest, kernel, and user. In each case the early handler runs first on a special stack, then: - The guest case branches to the KVM interrupt handler (via standard interrupt macros). - The user case will run the "late" handler which is like a normal interrupt that runs in virtual mode and uses the regular kernel stack. - The kernel case queues the event and schedules it for processing with irq work. The last case is important, it must not enable virtual memory because the MMU state may not be set up to deal with that (e.g., SLB might be clear), it must not use the regular kernel stack for similar reasons (e.g., might be in OPAL with OPAL stack in r1), and the kernel does not expect anything to touch its stack if interrupts are disabled. The pseries handler does not do this queueing, but instead it always runs the late handler for host MCEs, which has some of the same problems. Now that pseries is using machine_check_events, change it to do the same as powernv and queue events for kernel MCEs. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-11-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index b7c4149cb91c..aa3720c0b8fe 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1163,7 +1163,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) cmpdi r3,0 /* see if we handled MCE successfully */ beq 1b /* if !handled then panic */ -BEGIN_FTR_SECTION + /* * Return from MC interrupt. * Queue up the MCE event so that we can log it later, while @@ -1172,18 +1172,7 @@ BEGIN_FTR_SECTION bl machine_check_queue_event MACHINE_CHECK_HANDLER_WINDUP RFI_TO_KERNEL -FTR_SECTION_ELSE - /* - * pSeries: Return from MC interrupt. Before that stay on emergency - * stack and call machine_check_exception to log the MCE event. - */ - LOAD_HANDLER(r10,mce_return) - mtspr SPRN_SRR0,r10 - ld r10,PACAKMSR(r13) - mtspr SPRN_SRR1,r10 - RFI_TO_KERNEL - b . -ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) + 9: /* Deliver the machine check to host kernel in V mode. */ BEGIN_FTR_SECTION @@ -1212,13 +1201,6 @@ EXC_COMMON_BEGIN(unrecover_mce) bl unrecoverable_exception b 1b -EXC_COMMON_BEGIN(mce_return) - /* Invoke machine_check_exception to print MCE event and return. */ - addi r3,r1,STACK_FRAME_OVERHEAD - bl machine_check_exception - MACHINE_CHECK_HANDLER_WINDUP - RFI_TO_KERNEL - b . EXC_REAL_BEGIN(data_access, 0x300, 0x80) EXCEPTION_PROLOG_0 PACA_EXGEN -- cgit v1.2.3 From c8eb54dbc8087c3d114cb583925395e211bfffa4 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:36 +1000 Subject: powerpc/64s/exception: machine check restructure to reuse common macros Follow the pattern of sreset and HMI handlers more closely: use EXCEPTION_PROLOG_COMMON_1 rather than open-coding it, and run the handler at the relocated location. This helps later simplification and code sharing. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-12-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 71 ++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 35 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index aa3720c0b8fe..a1f0a88d39a5 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -934,17 +934,23 @@ EXC_COMMON_BEGIN(system_reset_common) EXC_REAL_BEGIN(machine_check, 0x200, 0x100) EXCEPTION_PROLOG_0 PACA_EXMC - b machine_check_common_early + EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 0, 0x200, 1, 1, 0 + mfctr r10 /* save ctr, even for !RELOCATABLE */ + BRANCH_TO_C000(r11, machine_check_early_common) + /* + * MSR_RI is not enabled, because PACA_EXMC is being used, so a + * nested machine check corrupts it. machine_check_common enables + * MSR_RI. + */ EXC_REAL_END(machine_check, 0x200, 0x100) EXC_VIRT_NONE(0x4200, 0x100) -TRAMP_REAL_BEGIN(machine_check_common_early) - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 0, 0x200, 0, 0, 0 + +EXC_COMMON_BEGIN(machine_check_early_common) + mtctr r10 /* Restore ctr */ + mfspr r11,SPRN_SRR0 + mfspr r12,SPRN_SRR1 + /* - * Register contents: - * R13 = PACA - * R9 = CR - * Original R9 to R13 is saved on PACA_EXMC - * * Switch to mc_emergency stack and handle re-entrancy (we limit * the nested MCE upto level 4 to avoid stack overflow). * Save MCE registers srr1, srr0, dar and dsisr and then set ME=1 @@ -965,32 +971,30 @@ TRAMP_REAL_BEGIN(machine_check_common_early) * the machine check is handled then the idle wakeup code is called * to restore state. */ - mr r11,r1 /* Save r1 */ lhz r10,PACA_IN_MCE(r13) cmpwi r10,0 /* Are we in nested machine check */ - bne 0f /* Yes, we are. */ - /* First machine check entry */ - ld r1,PACAMCEMERGSP(r13) /* Use MC emergency stack */ -0: subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ + cmpwi cr1,r10,MAX_MCE_DEPTH /* Are we at maximum nesting */ addi r10,r10,1 /* increment paca->in_mce */ sth r10,PACA_IN_MCE(r13) + + mr r10,r1 /* Save r1 */ + bne 1f + /* First machine check entry */ + ld r1,PACAMCEMERGSP(r13) /* Use MC emergency stack */ +1: subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ /* Limit nested MCE to level 4 to avoid stack overflow */ - cmpwi r10,MAX_MCE_DEPTH - bgt 2f /* Check if we hit limit of 4 */ - std r11,GPR1(r1) /* Save r1 on the stack. */ - std r11,0(r1) /* make stack chain pointer */ - mfspr r11,SPRN_SRR0 /* Save SRR0 */ - std r11,_NIP(r1) - mfspr r11,SPRN_SRR1 /* Save SRR1 */ - std r11,_MSR(r1) - mfspr r11,SPRN_DAR /* Save DAR */ - std r11,_DAR(r1) - mfspr r11,SPRN_DSISR /* Save DSISR */ - std r11,_DSISR(r1) - std r9,_CCR(r1) /* Save CR in stackframe */ + bge cr1,2f /* Check if we hit limit of 4 */ + + EXCEPTION_PROLOG_COMMON_1() /* We don't touch AMR here, we never go to virtual mode */ - /* Save r9 through r13 from EXMC save area to stack frame. */ EXCEPTION_PROLOG_COMMON_2(PACA_EXMC) + EXCEPTION_PROLOG_COMMON_3(0x200) + + ld r3,PACA_EXMC+EX_DAR(r13) + lwz r4,PACA_EXMC+EX_DSISR(r13) + std r3,_DAR(r1) + std r4,_DSISR(r1) + mfmsr r11 /* get MSR value */ BEGIN_FTR_SECTION ori r11,r11,MSR_ME /* turn on ME bit */ @@ -1016,8 +1020,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) #ifdef CONFIG_PPC_PSERIES TRAMP_REAL_BEGIN(machine_check_fwnmi) + /* See comment at machine_check exception, don't turn on RI */ EXCEPTION_PROLOG_0 PACA_EXMC - b machine_check_common_early + EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 0, 0x200, 1, 1, 0 + mfctr r10 /* save ctr */ + BRANCH_TO_C000(r11, machine_check_early_common) #endif TRAMP_KVM_SKIP(PACA_EXMC, 0x200) @@ -1088,8 +1095,6 @@ EXC_COMMON_BEGIN(machine_check_idle_common) * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack. */ EXC_COMMON_BEGIN(machine_check_handle_early) - std r0,GPR0(r1) /* Save r0 */ - EXCEPTION_PROLOG_COMMON_3(0x200) bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl machine_check_early @@ -1180,14 +1185,10 @@ BEGIN_FTR_SECTION mtspr SPRN_CFAR,r10 END_FTR_SECTION_IFSET(CPU_FTR_CFAR) MACHINE_CHECK_HANDLER_WINDUP + /* See comment at machine_check exception, don't turn on RI */ EXCEPTION_PROLOG_0 PACA_EXMC EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0 EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0 - /* - * MSR_RI is not enabled, because PACA_EXMC is being used, so a - * nested machine check corrupts it. machine_check_common enables - * MSR_RI. - */ EXC_COMMON_BEGIN(unrecover_mce) /* Invoke machine_check_exception to print MCE event and panic. */ -- cgit v1.2.3 From abd1f4ca2b41ffba768c3baadc006a95d178fbf1 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:37 +1000 Subject: powerpc/64s/exception: machine check move tramp code Following convention, move the tramp code (unrelocated) above the common handlers (relocated). Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-13-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index a1f0a88d39a5..06821b199511 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -945,6 +945,17 @@ EXC_REAL_BEGIN(machine_check, 0x200, 0x100) EXC_REAL_END(machine_check, 0x200, 0x100) EXC_VIRT_NONE(0x4200, 0x100) +#ifdef CONFIG_PPC_PSERIES +TRAMP_REAL_BEGIN(machine_check_fwnmi) + /* See comment at machine_check exception, don't turn on RI */ + EXCEPTION_PROLOG_0 PACA_EXMC + EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 0, 0x200, 1, 1, 0 + mfctr r10 /* save ctr */ + BRANCH_TO_C000(r11, machine_check_early_common) +#endif + +TRAMP_KVM_SKIP(PACA_EXMC, 0x200) + EXC_COMMON_BEGIN(machine_check_early_common) mtctr r10 /* Restore ctr */ mfspr r11,SPRN_SRR0 @@ -1018,17 +1029,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) b 1b b . /* prevent speculative execution */ -#ifdef CONFIG_PPC_PSERIES -TRAMP_REAL_BEGIN(machine_check_fwnmi) - /* See comment at machine_check exception, don't turn on RI */ - EXCEPTION_PROLOG_0 PACA_EXMC - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 0, 0x200, 1, 1, 0 - mfctr r10 /* save ctr */ - BRANCH_TO_C000(r11, machine_check_early_common) -#endif - -TRAMP_KVM_SKIP(PACA_EXMC, 0x200) - EXC_COMMON_BEGIN(machine_check_common) /* * Machine check is different because we use a different -- cgit v1.2.3 From 296e753fb447eb14c8c9fd6a7c48e7ffab269343 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:38 +1000 Subject: powerpc/64s/exception: simplify machine check early path machine_check_handle_early_common can reach machine_check_handle_early directly now that it runs at the relocated address, so just branch directly. The rfi sequence is required to enable MSR[ME] but that step is moved into a helper function, making the code easier to follow. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-14-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 06821b199511..bbbcab88cf78 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1006,16 +1006,13 @@ EXC_COMMON_BEGIN(machine_check_early_common) std r3,_DAR(r1) std r4,_DSISR(r1) - mfmsr r11 /* get MSR value */ BEGIN_FTR_SECTION - ori r11,r11,MSR_ME /* turn on ME bit */ + bl enable_machine_check END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) - ori r11,r11,MSR_RI /* turn on RI bit */ - LOAD_HANDLER(r12, machine_check_handle_early) -1: mtspr SPRN_SRR0,r12 - mtspr SPRN_SRR1,r11 - RFI_TO_KERNEL - b . /* prevent speculative execution */ + li r10,MSR_RI + mtmsrd r10,1 + b machine_check_handle_early + 2: /* Stack overflow. Stay on emergency stack and panic. * Keep the ME bit off while panic-ing, so that if we hit @@ -1026,7 +1023,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) LOAD_HANDLER(r12, unrecover_mce) li r10,MSR_ME andc r11,r11,r10 /* Turn off MSR_ME */ - b 1b + mtspr SPRN_SRR0,r12 + mtspr SPRN_SRR1,r11 + RFI_TO_KERNEL b . /* prevent speculative execution */ EXC_COMMON_BEGIN(machine_check_common) @@ -2269,6 +2268,20 @@ CLOSE_FIXED_SECTION(virt_trampolines); USE_TEXT_SECTION() +/* MSR[RI] should be clear because this uses SRR[01] */ +enable_machine_check: + mflr r0 + bcl 20,31,$+4 +0: mflr r3 + addi r3,r3,(1f - 0b) + mtspr SPRN_SRR0,r3 + mfmsr r3 + ori r3,r3,MSR_ME + mtspr SPRN_SRR1,r3 + RFI_TO_KERNEL +1: mtlr r0 + blr + /* * Hash table stuff */ -- cgit v1.2.3 From b7d9ccec3056913528690c5fae7cc86a5ea3dffc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:39 +1000 Subject: powerpc/64s/exception: machine check move unrecoverable handling out of line Similarly to the previous change, all callers of the unrecoverable handler run relocated so can reach it with a direct branch. This makes it easy to move out of line, which makes the "normal" path less cluttered and easier to follow. MSR[ME] manipulation still requires the rfi, so that is moved out of line to its own function. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-15-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 87 ++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index bbbcab88cf78..af18d0f1d4ab 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -992,9 +992,9 @@ EXC_COMMON_BEGIN(machine_check_early_common) bne 1f /* First machine check entry */ ld r1,PACAMCEMERGSP(r13) /* Use MC emergency stack */ -1: subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ - /* Limit nested MCE to level 4 to avoid stack overflow */ - bge cr1,2f /* Check if we hit limit of 4 */ +1: /* Limit nested MCE to level 4 to avoid stack overflow */ + bgt cr1,unrecoverable_mce /* Check if we hit limit of 4 */ + subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ EXCEPTION_PROLOG_COMMON_1() /* We don't touch AMR here, we never go to virtual mode */ @@ -1013,21 +1013,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) mtmsrd r10,1 b machine_check_handle_early -2: - /* Stack overflow. Stay on emergency stack and panic. - * Keep the ME bit off while panic-ing, so that if we hit - * another machine check we checkstop. - */ - addi r1,r1,INT_FRAME_SIZE /* go back to previous stack frame */ - ld r11,PACAKMSR(r13) - LOAD_HANDLER(r12, unrecover_mce) - li r10,MSR_ME - andc r11,r11,r10 /* Turn off MSR_ME */ - mtspr SPRN_SRR0,r12 - mtspr SPRN_SRR1,r11 - RFI_TO_KERNEL - b . /* prevent speculative execution */ - EXC_COMMON_BEGIN(machine_check_common) /* * Machine check is different because we use a different @@ -1141,32 +1126,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) * If yes, then stay on emergency stack and panic. */ andi. r11,r12,MSR_RI - bne 2f -1: mfspr r11,SPRN_SRR0 - LOAD_HANDLER(r10,unrecover_mce) - mtspr SPRN_SRR0,r10 - ld r10,PACAKMSR(r13) - /* - * We are going down. But there are chances that we might get hit by - * another MCE during panic path and we may run into unstable state - * with no way out. Hence, turn ME bit off while going down, so that - * when another MCE is hit during panic path, system will checkstop - * and hypervisor will get restarted cleanly by SP. - */ - li r3,MSR_ME - andc r10,r10,r3 /* Turn off MSR_ME */ - mtspr SPRN_SRR1,r10 - RFI_TO_KERNEL - b . -2: + beq unrecoverable_mce + /* * Check if we have successfully handled/recovered from error, if not * then stay on emergency stack and panic. */ ld r3,RESULT(r1) /* Load result */ cmpdi r3,0 /* see if we handled MCE successfully */ - - beq 1b /* if !handled then panic */ + beq unrecoverable_mce /* if !handled then panic */ /* * Return from MC interrupt. @@ -1189,17 +1157,35 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0 EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0 -EXC_COMMON_BEGIN(unrecover_mce) +EXC_COMMON_BEGIN(unrecoverable_mce) + /* + * We are going down. But there are chances that we might get hit by + * another MCE during panic path and we may run into unstable state + * with no way out. Hence, turn ME bit off while going down, so that + * when another MCE is hit during panic path, system will checkstop + * and hypervisor will get restarted cleanly by SP. + */ +BEGIN_FTR_SECTION + li r10,0 /* clear MSR_RI */ + mtmsrd r10,1 + bl disable_machine_check +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) + ld r10,PACAKMSR(r13) + li r3,MSR_ME + andc r10,r10,r3 + mtmsrd r10 + /* Invoke machine_check_exception to print MCE event and panic. */ addi r3,r1,STACK_FRAME_OVERHEAD bl machine_check_exception + /* - * We will not reach here. Even if we did, there is no way out. Call - * unrecoverable_exception and die. + * We will not reach here. Even if we did, there is no way out. + * Call unrecoverable_exception and die. */ -1: addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_FRAME_OVERHEAD bl unrecoverable_exception - b 1b + b . EXC_REAL_BEGIN(data_access, 0x300, 0x80) @@ -2282,6 +2268,21 @@ enable_machine_check: 1: mtlr r0 blr +/* MSR[RI] should be clear because this uses SRR[01] */ +disable_machine_check: + mflr r0 + bcl 20,31,$+4 +0: mflr r3 + addi r3,r3,(1f - 0b) + mtspr SPRN_SRR0,r3 + mfmsr r3 + li r4,MSR_ME + andc r3,r3,r4 + mtspr SPRN_SRR1,r3 + RFI_TO_KERNEL +1: mtlr r0 + blr + /* * Hash table stuff */ -- cgit v1.2.3 From fce16d482276f059b08368c833a1188ac1f25e86 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:40 +1000 Subject: powerpc/64s/exception: untangle early machine check handler branch machine_check_early_common now branches to machine_check_handle_early which is its only caller. Move interleaving code out of the way, and remove the branch. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-16-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 129 +++++++++++++++++------------------ 1 file changed, 62 insertions(+), 67 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index af18d0f1d4ab..3a7f18021365 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -956,6 +956,16 @@ TRAMP_REAL_BEGIN(machine_check_fwnmi) TRAMP_KVM_SKIP(PACA_EXMC, 0x200) +#define MACHINE_CHECK_HANDLER_WINDUP \ + /* Clear MSR_RI before setting SRR0 and SRR1. */\ + li r9,0; \ + mtmsrd r9,1; /* Clear MSR_RI */ \ + /* Decrement paca->in_mce now RI is clear. */ \ + lhz r12,PACA_IN_MCE(r13); \ + subi r12,r12,1; \ + sth r12,PACA_IN_MCE(r13); \ + EXCEPTION_RESTORE_REGS EXC_STD + EXC_COMMON_BEGIN(machine_check_early_common) mtctr r10 /* Restore ctr */ mfspr r11,SPRN_SRR0 @@ -1011,74 +1021,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) li r10,MSR_RI mtmsrd r10,1 - b machine_check_handle_early -EXC_COMMON_BEGIN(machine_check_common) - /* - * Machine check is different because we use a different - * save area: PACA_EXMC instead of PACA_EXGEN. - */ - EXCEPTION_COMMON(PACA_EXMC, 0x200) - FINISH_NAP - RECONCILE_IRQ_STATE(r10, r11) - ld r3,PACA_EXMC+EX_DAR(r13) - lwz r4,PACA_EXMC+EX_DSISR(r13) - /* Enable MSR_RI when finished with PACA_EXMC */ - li r10,MSR_RI - mtmsrd r10,1 - std r3,_DAR(r1) - std r4,_DSISR(r1) - bl save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD - bl machine_check_exception - b ret_from_except - -#define MACHINE_CHECK_HANDLER_WINDUP \ - /* Clear MSR_RI before setting SRR0 and SRR1. */\ - li r9,0; \ - mtmsrd r9,1; /* Clear MSR_RI */ \ - /* Decrement paca->in_mce now RI is clear. */ \ - lhz r12,PACA_IN_MCE(r13); \ - subi r12,r12,1; \ - sth r12,PACA_IN_MCE(r13); \ - EXCEPTION_RESTORE_REGS EXC_STD - -#ifdef CONFIG_PPC_P7_NAP -/* - * This is an idle wakeup. Low level machine check has already been - * done. Queue the event then call the idle code to do the wake up. - */ -EXC_COMMON_BEGIN(machine_check_idle_common) - bl machine_check_queue_event - - /* - * We have not used any non-volatile GPRs here, and as a rule - * most exception code including machine check does not. - * Therefore PACA_NAPSTATELOST does not need to be set. Idle - * wakeup will restore volatile registers. - * - * Load the original SRR1 into r3 for pnv_powersave_wakeup_mce. - * - * Then decrement MCE nesting after finishing with the stack. - */ - ld r3,_MSR(r1) - ld r4,_LINK(r1) - - lhz r11,PACA_IN_MCE(r13) - subi r11,r11,1 - sth r11,PACA_IN_MCE(r13) - - mtlr r4 - rlwinm r10,r3,47-31,30,31 - cmpwi cr1,r10,2 - bltlr cr1 /* no state loss, return to idle caller */ - b idle_return_gpr_loss -#endif - /* - * Handle machine check early in real mode. We come here with - * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack. - */ -EXC_COMMON_BEGIN(machine_check_handle_early) bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl machine_check_early @@ -1157,6 +1100,58 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0 EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0 +EXC_COMMON_BEGIN(machine_check_common) + /* + * Machine check is different because we use a different + * save area: PACA_EXMC instead of PACA_EXGEN. + */ + EXCEPTION_COMMON(PACA_EXMC, 0x200) + FINISH_NAP + RECONCILE_IRQ_STATE(r10, r11) + ld r3,PACA_EXMC+EX_DAR(r13) + lwz r4,PACA_EXMC+EX_DSISR(r13) + /* Enable MSR_RI when finished with PACA_EXMC */ + li r10,MSR_RI + mtmsrd r10,1 + std r3,_DAR(r1) + std r4,_DSISR(r1) + bl save_nvgprs + addi r3,r1,STACK_FRAME_OVERHEAD + bl machine_check_exception + b ret_from_except + +#ifdef CONFIG_PPC_P7_NAP +/* + * This is an idle wakeup. Low level machine check has already been + * done. Queue the event then call the idle code to do the wake up. + */ +EXC_COMMON_BEGIN(machine_check_idle_common) + bl machine_check_queue_event + + /* + * We have not used any non-volatile GPRs here, and as a rule + * most exception code including machine check does not. + * Therefore PACA_NAPSTATELOST does not need to be set. Idle + * wakeup will restore volatile registers. + * + * Load the original SRR1 into r3 for pnv_powersave_wakeup_mce. + * + * Then decrement MCE nesting after finishing with the stack. + */ + ld r3,_MSR(r1) + ld r4,_LINK(r1) + + lhz r11,PACA_IN_MCE(r13) + subi r11,r11,1 + sth r11,PACA_IN_MCE(r13) + + mtlr r4 + rlwinm r10,r3,47-31,30,31 + cmpwi cr1,r10,2 + bltlr cr1 /* no state loss, return to idle caller */ + b idle_return_gpr_loss +#endif + EXC_COMMON_BEGIN(unrecoverable_mce) /* * We are going down. But there are chances that we might get hit by -- cgit v1.2.3 From b3fe35261e329e15736bc95630fd865df9896c66 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:41 +1000 Subject: powerpc/64s/exception: machine check improve labels and comments Short forward and backward branches can be given number labels, but larger significant divergences in code path a more readable if they're given descriptive names. Also adjusts a comment to account for guest delivery. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-17-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 3a7f18021365..c2474c9c8d41 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1052,7 +1052,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) */ lbz r11,HSTATE_IN_GUEST(r13) cmpwi r11,0 /* Check if coming from guest */ - bne 9f /* continue if we are. */ + bne mce_deliver /* continue if we are. */ #endif /* @@ -1060,7 +1060,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) * exception handler which will deliver the MC event to this kernel. */ andi. r11,r12,MSR_PR /* See if coming from user. */ - bne 9f /* continue in V mode if we are. */ + bne mce_deliver /* continue in V mode if we are. */ /* * At this point we are coming from kernel context. @@ -1088,8 +1088,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) MACHINE_CHECK_HANDLER_WINDUP RFI_TO_KERNEL -9: - /* Deliver the machine check to host kernel in V mode. */ +mce_deliver: + /* + * This is a host user or guest MCE. Restore all registers, then + * run the "late" handler. For host user, this will run the + * machine_check_exception handler in virtual mode like a normal + * interrupt handler. For guest, this will trigger the KVM test + * and branch to the KVM interrupt similarly to other interrupts. + */ BEGIN_FTR_SECTION ld r10,ORIG_GPR3(r1) mtspr SPRN_CFAR,r10 -- cgit v1.2.3 From c31f7134dc53f7020b3d49e846d1b950a761e324 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:42 +1000 Subject: powerpc/64s/exception: Fix DAR load for handle_page_fault error case This buglet goes back to before the 64/32 arch merge, but it does not seem to have had practical consequences because bad_page_fault does not use the 2nd argument, but rather regs->dar/nip. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-18-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index c2474c9c8d41..d44f6d103014 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -2335,7 +2335,7 @@ handle_page_fault: bl save_nvgprs mr r5,r3 addi r3,r1,STACK_FRAME_OVERHEAD - lwz r4,_DAR(r1) + ld r4,_DAR(r1) bl bad_page_fault b ret_from_except -- cgit v1.2.3 From a243281195c338489ec5088380821f64340cb82f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:43 +1000 Subject: powerpc/64s/exception: move head-64.h exception code to exception-64s.S The head-64.h code should deal only with the head code sections and offset calculations. No generated code change except BUG line number constants. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-19-npiggin@gmail.com --- arch/powerpc/include/asm/head-64.h | 41 ------------------------------------ arch/powerpc/kernel/exceptions-64s.S | 41 ++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h index a466765709a9..2dabcf668292 100644 --- a/arch/powerpc/include/asm/head-64.h +++ b/arch/powerpc/include/asm/head-64.h @@ -169,47 +169,6 @@ name: #define ABS_ADDR(label) (label - fs_label + fs_start) -#define EXC_REAL_BEGIN(name, start, size) \ - FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##name, start, size) - -#define EXC_REAL_END(name, start, size) \ - FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##name, start, size) - -#define EXC_VIRT_BEGIN(name, start, size) \ - FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##name, start, size) - -#define EXC_VIRT_END(name, start, size) \ - FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##name, start, size) - -#define EXC_COMMON_BEGIN(name) \ - USE_TEXT_SECTION(); \ - .balign IFETCH_ALIGN_BYTES; \ - .global name; \ - _ASM_NOKPROBE_SYMBOL(name); \ - DEFINE_FIXED_SYMBOL(name); \ -name: - -#define TRAMP_REAL_BEGIN(name) \ - FIXED_SECTION_ENTRY_BEGIN(real_trampolines, name) - -#define TRAMP_VIRT_BEGIN(name) \ - FIXED_SECTION_ENTRY_BEGIN(virt_trampolines, name) - -#ifdef CONFIG_KVM_BOOK3S_64_HANDLER -#define TRAMP_KVM_BEGIN(name) \ - TRAMP_VIRT_BEGIN(name) -#else -#define TRAMP_KVM_BEGIN(name) -#endif - -#define EXC_REAL_NONE(start, size) \ - FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##unused, start, size); \ - FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##unused, start, size) - -#define EXC_VIRT_NONE(start, size) \ - FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##unused, start, size); \ - FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##unused, start, size) - #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_HEAD_64_H */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index d44f6d103014..febeea021939 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -43,6 +43,47 @@ .endif #endif +#define EXC_REAL_BEGIN(name, start, size) \ + FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##name, start, size) + +#define EXC_REAL_END(name, start, size) \ + FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##name, start, size) + +#define EXC_VIRT_BEGIN(name, start, size) \ + FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##name, start, size) + +#define EXC_VIRT_END(name, start, size) \ + FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##name, start, size) + +#define EXC_COMMON_BEGIN(name) \ + USE_TEXT_SECTION(); \ + .balign IFETCH_ALIGN_BYTES; \ + .global name; \ + _ASM_NOKPROBE_SYMBOL(name); \ + DEFINE_FIXED_SYMBOL(name); \ +name: + +#define TRAMP_REAL_BEGIN(name) \ + FIXED_SECTION_ENTRY_BEGIN(real_trampolines, name) + +#define TRAMP_VIRT_BEGIN(name) \ + FIXED_SECTION_ENTRY_BEGIN(virt_trampolines, name) + +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER +#define TRAMP_KVM_BEGIN(name) \ + TRAMP_VIRT_BEGIN(name) +#else +#define TRAMP_KVM_BEGIN(name) +#endif + +#define EXC_REAL_NONE(start, size) \ + FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##unused, start, size); \ + FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##unused, start, size) + +#define EXC_VIRT_NONE(start, size) \ + FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##unused, start, size); \ + FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##unused, start, size) + /* * We're short on space and time in the exception prolog, so we can't * use the normal LOAD_REG_IMMEDIATE macro to load the address of label. -- cgit v1.2.3 From def0db4f9ddc24ad7b37735c34a78e9c3c7978ef Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:44 +1000 Subject: powerpc/64s/exception: Add EXC_HV_OR_STD, which selects HSRR if HVMODE Add EXC_HV_OR_STD and use it to consolidate the 0x500 external interrupt. Executed code is unchanged. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-20-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 102 +++++++++++++++++++++++++++-------- 1 file changed, 79 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index febeea021939..33bd99116ae4 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -109,6 +109,7 @@ name: addis reg,reg,(ABS_ADDR(label))@h /* Exception register prefixes */ +#define EXC_HV_OR_STD 2 /* depends on HVMODE */ #define EXC_HV 1 #define EXC_STD 0 @@ -205,7 +206,13 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) .abort "Bad maskable vector" .endif - .if \hsrr + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + bne masked_Hinterrupt + FTR_SECTION_ELSE + bne masked_interrupt + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr bne masked_Hinterrupt .else bne masked_interrupt @@ -237,7 +244,17 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) .if ! \set_ri xori r10,r10,MSR_RI /* Clear MSR_RI */ .endif - .if \hsrr + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + mtspr SPRN_HSRR1,r10 + FTR_SECTION_ELSE + mfspr r11,SPRN_SRR0 /* save SRR0 */ + mfspr r12,SPRN_SRR1 /* and SRR1 */ + mtspr SPRN_SRR1,r10 + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr mfspr r11,SPRN_HSRR0 /* save HSRR0 */ mfspr r12,SPRN_HSRR1 /* and HSRR1 */ mtspr SPRN_HSRR1,r10 @@ -247,7 +264,15 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) mtspr SPRN_SRR1,r10 .endif LOAD_HANDLER(r10, \label\()) - .if \hsrr + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + mtspr SPRN_HSRR0,r10 + HRFI_TO_KERNEL + FTR_SECTION_ELSE + mtspr SPRN_SRR0,r10 + RFI_TO_KERNEL + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr mtspr SPRN_HSRR0,r10 HRFI_TO_KERNEL .else @@ -259,14 +284,26 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) .macro EXCEPTION_PROLOG_2_VIRT label, hsrr #ifdef CONFIG_RELOCATABLE - .if \hsrr + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + FTR_SECTION_ELSE + mfspr r11,SPRN_SRR0 /* save SRR0 */ + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr mfspr r11,SPRN_HSRR0 /* save HSRR0 */ .else mfspr r11,SPRN_SRR0 /* save SRR0 */ .endif LOAD_HANDLER(r12, \label\()) mtctr r12 - .if \hsrr + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + FTR_SECTION_ELSE + mfspr r12,SPRN_SRR1 /* and HSRR1 */ + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr mfspr r12,SPRN_HSRR1 /* and HSRR1 */ .else mfspr r12,SPRN_SRR1 /* and HSRR1 */ @@ -275,7 +312,15 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) mtmsrd r10,1 /* Set RI (EE=0) */ bctr #else - .if \hsrr + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + FTR_SECTION_ELSE + mfspr r11,SPRN_SRR0 /* save SRR0 */ + mfspr r12,SPRN_SRR1 /* and SRR1 */ + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr mfspr r11,SPRN_HSRR0 /* save HSRR0 */ mfspr r12,SPRN_HSRR1 /* and HSRR1 */ .else @@ -316,7 +361,13 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) .macro KVMTEST hsrr, n lbz r10,HSTATE_IN_GUEST(r13) cmpwi r10,0 - .if \hsrr + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + bne do_kvm_H\n + FTR_SECTION_ELSE + bne do_kvm_\n + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr bne do_kvm_H\n .else bne do_kvm_\n @@ -342,7 +393,13 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) std r12,HSTATE_SCRATCH0(r13) sldi r12,r9,32 /* HSRR variants have the 0x2 bit added to their trap number */ - .if \hsrr + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + ori r12,r12,(\n + 0x2) + FTR_SECTION_ELSE + ori r12,r12,(\n) + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr ori r12,r12,(\n + 0x2) .else ori r12,r12,(\n) @@ -370,7 +427,13 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) 89: mtocrf 0x80,r9 ld r9,\area+EX_R9(r13) ld r10,\area+EX_R10(r13) - .if \hsrr + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + b kvmppc_skip_Hinterrupt + FTR_SECTION_ELSE + b kvmppc_skip_interrupt + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr b kvmppc_skip_Hinterrupt .else b kvmppc_skip_interrupt @@ -469,6 +532,9 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66); \ .macro EXCEPTION_RESTORE_REGS hsrr /* Move original SRR0 and SRR1 into the respective regs */ ld r9,_MSR(r1) + .if \hsrr == EXC_HV_OR_STD + .error "EXC_HV_OR_STD Not implemented for EXCEPTION_RESTORE_REGS" + .endif .if \hsrr mtspr SPRN_HSRR1,r9 .else @@ -1363,24 +1429,14 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x100) EXCEPTION_PROLOG_0 PACA_EXGEN -BEGIN_FTR_SECTION - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED - EXCEPTION_PROLOG_2_REAL hardware_interrupt_common, EXC_HV, 1 -FTR_SECTION_ELSE - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED - EXCEPTION_PROLOG_2_REAL hardware_interrupt_common, EXC_STD, 1 -ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + EXCEPTION_PROLOG_1 EXC_HV_OR_STD, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED + EXCEPTION_PROLOG_2_REAL hardware_interrupt_common, EXC_HV_OR_STD, 1 EXC_REAL_END(hardware_interrupt, 0x500, 0x100) EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100) EXCEPTION_PROLOG_0 PACA_EXGEN -BEGIN_FTR_SECTION - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED - EXCEPTION_PROLOG_2_VIRT hardware_interrupt_common, EXC_HV -FTR_SECTION_ELSE - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED - EXCEPTION_PROLOG_2_VIRT hardware_interrupt_common, EXC_STD -ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) + EXCEPTION_PROLOG_1 EXC_HV_OR_STD, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED + EXCEPTION_PROLOG_2_VIRT hardware_interrupt_common, EXC_HV_OR_STD EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100) TRAMP_KVM(PACA_EXGEN, 0x500) -- cgit v1.2.3 From 9a7a0773d7d2fd1e4c581f42ad75de7872a174dc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:45 +1000 Subject: powerpc/64s/exception: Fix performance monitor virt handler The perf virt handler uses EXCEPTION_PROLOG_2_REAL rather than _VIRT. In practice this is okay because the _REAL variant is usable by virt mode interrupts, but should be fixed (and is a performance win). Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-21-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 33bd99116ae4..4b685d894ba1 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -750,7 +750,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) #define __TRAMP_VIRT_OOL_MASKABLE(name, realvec, bitmask) \ TRAMP_VIRT_BEGIN(tramp_virt_##name); \ EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, realvec, 0, 0, bitmask ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 + EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD #define EXC_VIRT_OOL_MASKABLE(name, start, size, realvec, bitmask) \ __EXC_VIRT_OOL_MASKABLE(name, start, size); \ -- cgit v1.2.3 From 5ff79a5ea69f8f5d1131064af9f8c9a8b1bc8266 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:46 +1000 Subject: powerpc/64s/exception: remove 0xb00 handler This vector is not used by any supported processor, and has been implemented as an unknown exception going back to 2.6. There is nothing special about 0xb00, so remove it like other unused vectors. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-22-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 4b685d894ba1..d9c953e419bf 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1564,10 +1564,8 @@ EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, unknown_exception) #endif -EXC_REAL(trap_0b, 0xb00, 0x100) -EXC_VIRT(trap_0b, 0x4b00, 0x100, 0xb00) -TRAMP_KVM(PACA_EXGEN, 0xb00) -EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) +EXC_REAL_NONE(0xb00, 0x100) +EXC_VIRT_NONE(0x4b00, 0x100) /* * system call / hypercall (0xc00, 0x4c00) -- cgit v1.2.3 From 7299417c8214ce09c0da01d7719c7b1c11503578 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:47 +1000 Subject: powerpc/64s/exception: Replace PROLOG macros and EXC helpers with a gas macro This creates a single macro that generates the exception prolog code, with variants specified by arguments, rather than assorted nested macros for different variants. The increasing length of macro argument list is not nice to read or modify, but this is a temporary condition that will be improved in later changes. No generated code change except BUG line number constants and label names. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-23-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 502 ++++++++++++++--------------------- 1 file changed, 206 insertions(+), 296 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index d9c953e419bf..a476bb2f80f3 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -43,6 +43,17 @@ .endif #endif +/* + * Following are fixed section helper macros. + * + * EXC_REAL_BEGIN/END - real, unrelocated exception vectors + * EXC_VIRT_BEGIN/END - virt (AIL), unrelocated exception vectors + * TRAMP_REAL_BEGIN - real, unrelocated helpers (virt may call these) + * TRAMP_VIRT_BEGIN - virt, unreloc helpers (in practice, real can use) + * TRAMP_KVM_BEGIN - KVM handlers, these are put into real, unrelocated + * EXC_COMMON - After switching to virtual, relocated mode. + */ + #define EXC_REAL_BEGIN(name, start, size) \ FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##name, start, size) @@ -589,196 +600,54 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) #endif /* - * Following are the BOOK3S exception handler helper macros. - * Handlers come in a number of types, and each type has a number of varieties. - * - * EXC_REAL_* - real, unrelocated exception vectors - * EXC_VIRT_* - virt (AIL), unrelocated exception vectors - * TRAMP_REAL_* - real, unrelocated helpers (virt can call these) - * TRAMP_VIRT_* - virt, unreloc helpers (in practice, real can use) - * TRAMP_KVM - KVM handlers that get put into real, unrelocated - * EXC_COMMON - virt, relocated common handlers - * - * The EXC handlers are given a name, and branch to name_common, or the - * appropriate KVM or masking function. Vector handler verieties are as - * follows: - * - * EXC_{REAL|VIRT}_BEGIN/END - used to open-code the exception - * - * EXC_{REAL|VIRT} - standard exception - * - * EXC_{REAL|VIRT}_suffix - * where _suffix is: - * - _MASKABLE - maskable exception - * - _OOL - out of line with trampoline to common handler - * - _HV - HV exception - * - * There can be combinations, e.g., EXC_VIRT_OOL_MASKABLE_HV + * This is the BOOK3S interrupt entry code macro. * - * KVM handlers come in the following verieties: - * TRAMP_KVM - * TRAMP_KVM_SKIP - * TRAMP_KVM_HV - * TRAMP_KVM_HV_SKIP - * - * COMMON handlers come in the following verieties: - * EXC_COMMON_BEGIN/END - used to open-code the handler - * EXC_COMMON - * EXC_COMMON_ASYNC - * - * TRAMP_REAL and TRAMP_VIRT can be used with BEGIN/END. KVM - * and OOL handlers are implemented as types of TRAMP and TRAMP_VIRT handlers. + * This can result in one of several things happening: + * - Branch to the _common handler, relocated, in virtual mode. + * These are normal interrupts (synchronous and asynchronous) handled by + * the kernel. + * - Branch to KVM, relocated but real mode interrupts remain in real mode. + * These occur when HSTATE_IN_GUEST is set. The interrupt may be caused by + * / intended for host or guest kernel, but KVM must always be involved + * because the machine state is set for guest execution. + * - Branch to the masked handler, unrelocated. + * These occur when maskable asynchronous interrupts are taken with the + * irq_soft_mask set. + * - Branch to an "early" handler in real mode but relocated. + * This is done if early=1. MCE and HMI use these to handle errors in real + * mode. + * - Fall through and continue executing in real, unrelocated mode. + * This is done if early=2. */ +.macro INT_HANDLER name, vec, ool, early, virt, hsrr, area, ri, dar, dsisr, bitmask, kvm + EXCEPTION_PROLOG_0 \area + .if \ool + .if !\virt + b tramp_real_\name + .pushsection .text + TRAMP_REAL_BEGIN(tramp_real_\name) + .else + b tramp_virt_\name + .pushsection .text + TRAMP_VIRT_BEGIN(tramp_virt_\name) + .endif + .endif + EXCEPTION_PROLOG_1 \hsrr, \area, \kvm, \vec, \dar, \dsisr, \bitmask + .if \early == 2 + /* nothing more */ + .elseif \early + mfctr r10 /* save ctr, even for !RELOCATABLE */ + BRANCH_TO_C000(r11, \name\()_early_common) + .elseif !\virt + EXCEPTION_PROLOG_2_REAL \name\()_common, \hsrr, \ri + .else + EXCEPTION_PROLOG_2_VIRT \name\()_common, \hsrr + .endif + .if \ool + .popsection + .endif +.endm -#define __EXC_REAL(name, start, size, area) \ - EXC_REAL_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 area ; \ - EXCEPTION_PROLOG_1 EXC_STD, area, 1, start, 0, 0, 0 ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 ; \ - EXC_REAL_END(name, start, size) - -#define EXC_REAL(name, start, size) \ - __EXC_REAL(name, start, size, PACA_EXGEN) - -#define __EXC_VIRT(name, start, size, realvec, area) \ - EXC_VIRT_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 area ; \ - EXCEPTION_PROLOG_1 EXC_STD, area, 0, realvec, 0, 0, 0; \ - EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD ; \ - EXC_VIRT_END(name, start, size) - -#define EXC_VIRT(name, start, size, realvec) \ - __EXC_VIRT(name, start, size, realvec, PACA_EXGEN) - -#define EXC_REAL_MASKABLE(name, start, size, bitmask) \ - EXC_REAL_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 PACA_EXGEN ; \ - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, start, 0, 0, bitmask ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 ; \ - EXC_REAL_END(name, start, size) - -#define EXC_VIRT_MASKABLE(name, start, size, realvec, bitmask) \ - EXC_VIRT_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 PACA_EXGEN ; \ - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, realvec, 0, 0, bitmask ; \ - EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD ; \ - EXC_VIRT_END(name, start, size) - -#define EXC_REAL_HV(name, start, size) \ - EXC_REAL_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 PACA_EXGEN; \ - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, start, 0, 0, 0 ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_HV, 1 ; \ - EXC_REAL_END(name, start, size) - -#define EXC_VIRT_HV(name, start, size, realvec) \ - EXC_VIRT_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 PACA_EXGEN; \ - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, realvec, 0, 0, 0 ; \ - EXCEPTION_PROLOG_2_VIRT name##_common, EXC_HV ; \ - EXC_VIRT_END(name, start, size) - -#define __EXC_REAL_OOL(name, start, size) \ - EXC_REAL_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 PACA_EXGEN ; \ - b tramp_real_##name ; \ - EXC_REAL_END(name, start, size) - -#define __TRAMP_REAL_OOL(name, vec) \ - TRAMP_REAL_BEGIN(tramp_real_##name); \ - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, vec, 0, 0, 0 ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 - -#define EXC_REAL_OOL(name, start, size) \ - __EXC_REAL_OOL(name, start, size); \ - __TRAMP_REAL_OOL(name, start) - -#define __EXC_REAL_OOL_MASKABLE(name, start, size) \ - __EXC_REAL_OOL(name, start, size) - -#define __TRAMP_REAL_OOL_MASKABLE(name, vec, bitmask) \ - TRAMP_REAL_BEGIN(tramp_real_##name); \ - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, vec, 0, 0, bitmask ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 - -#define EXC_REAL_OOL_MASKABLE(name, start, size, bitmask) \ - __EXC_REAL_OOL_MASKABLE(name, start, size); \ - __TRAMP_REAL_OOL_MASKABLE(name, start, bitmask) - -#define __EXC_REAL_OOL_HV(name, start, size) \ - __EXC_REAL_OOL(name, start, size) - -#define __TRAMP_REAL_OOL_HV(name, vec) \ - TRAMP_REAL_BEGIN(tramp_real_##name); \ - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, vec, 0, 0, 0 ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_HV, 1 - -#define EXC_REAL_OOL_HV(name, start, size) \ - __EXC_REAL_OOL_HV(name, start, size); \ - __TRAMP_REAL_OOL_HV(name, start) - -#define __EXC_REAL_OOL_MASKABLE_HV(name, start, size) \ - __EXC_REAL_OOL(name, start, size) - -#define __TRAMP_REAL_OOL_MASKABLE_HV(name, vec, bitmask) \ - TRAMP_REAL_BEGIN(tramp_real_##name); \ - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, vec, 0, 0, bitmask ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_HV, 1 - -#define EXC_REAL_OOL_MASKABLE_HV(name, start, size, bitmask) \ - __EXC_REAL_OOL_MASKABLE_HV(name, start, size); \ - __TRAMP_REAL_OOL_MASKABLE_HV(name, start, bitmask) - -#define __EXC_VIRT_OOL(name, start, size) \ - EXC_VIRT_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 PACA_EXGEN ; \ - b tramp_virt_##name; \ - EXC_VIRT_END(name, start, size) - -#define __TRAMP_VIRT_OOL(name, realvec) \ - TRAMP_VIRT_BEGIN(tramp_virt_##name); \ - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, vec, 0, 0, 0 ; \ - EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD - -#define EXC_VIRT_OOL(name, start, size, realvec) \ - __EXC_VIRT_OOL(name, start, size); \ - __TRAMP_VIRT_OOL(name, realvec) - -#define __EXC_VIRT_OOL_MASKABLE(name, start, size) \ - __EXC_VIRT_OOL(name, start, size) - -#define __TRAMP_VIRT_OOL_MASKABLE(name, realvec, bitmask) \ - TRAMP_VIRT_BEGIN(tramp_virt_##name); \ - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, realvec, 0, 0, bitmask ; \ - EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD - -#define EXC_VIRT_OOL_MASKABLE(name, start, size, realvec, bitmask) \ - __EXC_VIRT_OOL_MASKABLE(name, start, size); \ - __TRAMP_VIRT_OOL_MASKABLE(name, realvec, bitmask) - -#define __EXC_VIRT_OOL_HV(name, start, size) \ - __EXC_VIRT_OOL(name, start, size) - -#define __TRAMP_VIRT_OOL_HV(name, realvec) \ - TRAMP_VIRT_BEGIN(tramp_virt_##name); \ - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, realvec, 0, 0, 0 ; \ - EXCEPTION_PROLOG_2_VIRT name##_common, EXC_HV - -#define EXC_VIRT_OOL_HV(name, start, size, realvec) \ - __EXC_VIRT_OOL_HV(name, start, size); \ - __TRAMP_VIRT_OOL_HV(name, realvec) - -#define __EXC_VIRT_OOL_MASKABLE_HV(name, start, size) \ - __EXC_VIRT_OOL(name, start, size) - -#define __TRAMP_VIRT_OOL_MASKABLE_HV(name, realvec, bitmask) \ - TRAMP_VIRT_BEGIN(tramp_virt_##name); \ - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, realvec, 0, 0, bitmask ; \ - EXCEPTION_PROLOG_2_VIRT name##_common, EXC_HV - -#define EXC_VIRT_OOL_MASKABLE_HV(name, start, size, realvec, bitmask) \ - __EXC_VIRT_OOL_MASKABLE_HV(name, start, size); \ - __TRAMP_VIRT_OOL_MASKABLE_HV(name, realvec, bitmask) #define TRAMP_KVM(area, n) \ TRAMP_KVM_BEGIN(do_kvm_##n); \ @@ -943,9 +812,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #endif - EXCEPTION_PROLOG_0 PACA_EXNMI - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXNMI, 1, 0x100, 0, 0, 0 - EXCEPTION_PROLOG_2_REAL system_reset_common, EXC_STD, 0 + INT_HANDLER system_reset, 0x100, 0, 0, 0, EXC_STD, PACA_EXNMI, 0, 0, 0, 0, 1 /* * MSR_RI is not enabled, because PACA_EXNMI and nmi stack is * being used, so a nested NMI exception would corrupt it. @@ -975,9 +842,7 @@ TRAMP_REAL_BEGIN(system_reset_idle_wake) */ TRAMP_REAL_BEGIN(system_reset_fwnmi) /* See comment at system_reset exception, don't turn on RI */ - EXCEPTION_PROLOG_0 PACA_EXNMI - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXNMI, 0, 0x100, 0, 0, 0 - EXCEPTION_PROLOG_2_REAL system_reset_common, EXC_STD, 0 + INT_HANDLER system_reset, 0x100, 0, 0, 0, EXC_STD, PACA_EXNMI, 0, 0, 0, 0, 0 #endif /* CONFIG_PPC_PSERIES */ @@ -1040,10 +905,7 @@ EXC_COMMON_BEGIN(system_reset_common) EXC_REAL_BEGIN(machine_check, 0x200, 0x100) - EXCEPTION_PROLOG_0 PACA_EXMC - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 0, 0x200, 1, 1, 0 - mfctr r10 /* save ctr, even for !RELOCATABLE */ - BRANCH_TO_C000(r11, machine_check_early_common) + INT_HANDLER machine_check, 0x200, 0, 1, 0, EXC_STD, PACA_EXMC, 0, 1, 1, 0, 0 /* * MSR_RI is not enabled, because PACA_EXMC is being used, so a * nested machine check corrupts it. machine_check_common enables @@ -1055,10 +917,7 @@ EXC_VIRT_NONE(0x4200, 0x100) #ifdef CONFIG_PPC_PSERIES TRAMP_REAL_BEGIN(machine_check_fwnmi) /* See comment at machine_check exception, don't turn on RI */ - EXCEPTION_PROLOG_0 PACA_EXMC - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 0, 0x200, 1, 1, 0 - mfctr r10 /* save ctr */ - BRANCH_TO_C000(r11, machine_check_early_common) + INT_HANDLER machine_check, 0x200, 0, 1, 0, EXC_STD, PACA_EXMC, 0, 1, 1, 0, 0 #endif TRAMP_KVM_SKIP(PACA_EXMC, 0x200) @@ -1209,9 +1068,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_CFAR) MACHINE_CHECK_HANDLER_WINDUP /* See comment at machine_check exception, don't turn on RI */ - EXCEPTION_PROLOG_0 PACA_EXMC - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0 - EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0 + INT_HANDLER machine_check, 0x200, 0, 0, 0, EXC_STD, PACA_EXMC, 0, 1, 1, 0, 1 EXC_COMMON_BEGIN(machine_check_common) /* @@ -1297,18 +1154,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) EXC_REAL_BEGIN(data_access, 0x300, 0x80) - EXCEPTION_PROLOG_0 PACA_EXGEN - b tramp_real_data_access + INT_HANDLER data_access, 0x300, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 1, 1, 0, 1 EXC_REAL_END(data_access, 0x300, 0x80) - -TRAMP_REAL_BEGIN(tramp_real_data_access) - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x300, 1, 1, 0 - EXCEPTION_PROLOG_2_REAL data_access_common, EXC_STD, 1 - EXC_VIRT_BEGIN(data_access, 0x4300, 0x80) - EXCEPTION_PROLOG_0 PACA_EXGEN - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, 0x300, 1, 1, 0 -EXCEPTION_PROLOG_2_VIRT data_access_common, EXC_STD + INT_HANDLER data_access, 0x300, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 1, 1, 0, 0 EXC_VIRT_END(data_access, 0x4300, 0x80) TRAMP_KVM_SKIP(PACA_EXGEN, 0x300) @@ -1336,18 +1185,10 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) - EXCEPTION_PROLOG_0 PACA_EXSLB - b tramp_real_data_access_slb + INT_HANDLER data_access_slb, 0x380, 1, 0, 0, 0, PACA_EXSLB, 1, 1, 0, 0, 1 EXC_REAL_END(data_access_slb, 0x380, 0x80) - -TRAMP_REAL_BEGIN(tramp_real_data_access_slb) - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXSLB, 1, 0x380, 1, 0, 0 - EXCEPTION_PROLOG_2_REAL data_access_slb_common, EXC_STD, 1 - EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) - EXCEPTION_PROLOG_0 PACA_EXSLB - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXSLB, 0, 0x380, 1, 0, 0 - EXCEPTION_PROLOG_2_VIRT data_access_slb_common, EXC_STD + INT_HANDLER data_access_slb, 0x380, 0, 0, 1, 0, PACA_EXSLB, 1, 1, 0, 0, 0 EXC_VIRT_END(data_access_slb, 0x4380, 0x80) TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) @@ -1378,8 +1219,13 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) b ret_from_except -EXC_REAL(instruction_access, 0x400, 0x80) -EXC_VIRT(instruction_access, 0x4400, 0x80, 0x400) +EXC_REAL_BEGIN(instruction_access, 0x400, 0x80) + INT_HANDLER instruction_access, 0x400, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1 +EXC_REAL_END(instruction_access, 0x400, 0x80) +EXC_VIRT_BEGIN(instruction_access, 0x4400, 0x80) + INT_HANDLER instruction_access, 0x400, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0 +EXC_VIRT_END(instruction_access, 0x4400, 0x80) + TRAMP_KVM(PACA_EXGEN, 0x400) EXC_COMMON_BEGIN(instruction_access_common) @@ -1398,8 +1244,12 @@ MMU_FTR_SECTION_ELSE ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) -__EXC_REAL(instruction_access_slb, 0x480, 0x80, PACA_EXSLB) -__EXC_VIRT(instruction_access_slb, 0x4480, 0x80, 0x480, PACA_EXSLB) +EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) + INT_HANDLER instruction_access_slb, 0x480, 0, 0, 0, EXC_STD, PACA_EXSLB, 1, 0, 0, 0, 1 +EXC_REAL_END(instruction_access_slb, 0x480, 0x80) +EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) + INT_HANDLER instruction_access_slb, 0x480, 0, 0, 1, EXC_STD, PACA_EXSLB, 1, 0, 0, 0, 0 +EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) TRAMP_KVM(PACA_EXSLB, 0x480) EXC_COMMON_BEGIN(instruction_access_slb_common) @@ -1426,17 +1276,11 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) bl do_bad_slb_fault b ret_from_except - EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x100) - EXCEPTION_PROLOG_0 PACA_EXGEN - EXCEPTION_PROLOG_1 EXC_HV_OR_STD, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED - EXCEPTION_PROLOG_2_REAL hardware_interrupt_common, EXC_HV_OR_STD, 1 + INT_HANDLER hardware_interrupt, 0x500, 0, 0, 0, EXC_HV_OR_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1 EXC_REAL_END(hardware_interrupt, 0x500, 0x100) - EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100) - EXCEPTION_PROLOG_0 PACA_EXGEN - EXCEPTION_PROLOG_1 EXC_HV_OR_STD, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED - EXCEPTION_PROLOG_2_VIRT hardware_interrupt_common, EXC_HV_OR_STD + INT_HANDLER hardware_interrupt, 0x500, 0, 0, 1, EXC_HV_OR_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1 EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100) TRAMP_KVM(PACA_EXGEN, 0x500) @@ -1445,15 +1289,10 @@ EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ) EXC_REAL_BEGIN(alignment, 0x600, 0x100) - EXCEPTION_PROLOG_0 PACA_EXGEN - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x600, 1, 1, 0 - EXCEPTION_PROLOG_2_REAL alignment_common, EXC_STD, 1 + INT_HANDLER alignment, 0x600, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 1, 1, 0, 1 EXC_REAL_END(alignment, 0x600, 0x100) - EXC_VIRT_BEGIN(alignment, 0x4600, 0x100) - EXCEPTION_PROLOG_0 PACA_EXGEN - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, 0x600, 1, 1, 0 - EXCEPTION_PROLOG_2_VIRT alignment_common, EXC_STD + INT_HANDLER alignment, 0x600, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 1, 1, 0, 0 EXC_VIRT_END(alignment, 0x4600, 0x100) TRAMP_KVM(PACA_EXGEN, 0x600) @@ -1470,8 +1309,12 @@ EXC_COMMON_BEGIN(alignment_common) b ret_from_except -EXC_REAL(program_check, 0x700, 0x100) -EXC_VIRT(program_check, 0x4700, 0x100, 0x700) +EXC_REAL_BEGIN(program_check, 0x700, 0x100) + INT_HANDLER program_check, 0x700, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1 +EXC_REAL_END(program_check, 0x700, 0x100) +EXC_VIRT_BEGIN(program_check, 0x4700, 0x100) + INT_HANDLER program_check, 0x700, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0 +EXC_VIRT_END(program_check, 0x4700, 0x100) TRAMP_KVM(PACA_EXGEN, 0x700) EXC_COMMON_BEGIN(program_check_common) /* @@ -1508,8 +1351,12 @@ EXC_COMMON_BEGIN(program_check_common) b ret_from_except -EXC_REAL(fp_unavailable, 0x800, 0x100) -EXC_VIRT(fp_unavailable, 0x4800, 0x100, 0x800) +EXC_REAL_BEGIN(fp_unavailable, 0x800, 0x100) + INT_HANDLER fp_unavailable, 0x800, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1 +EXC_REAL_END(fp_unavailable, 0x800, 0x100) +EXC_VIRT_BEGIN(fp_unavailable, 0x4800, 0x100) + INT_HANDLER fp_unavailable, 0x800, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0 +EXC_VIRT_END(fp_unavailable, 0x4800, 0x100) TRAMP_KVM(PACA_EXGEN, 0x800) EXC_COMMON_BEGIN(fp_unavailable_common) EXCEPTION_COMMON(PACA_EXGEN, 0x800) @@ -1542,20 +1389,32 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif -EXC_REAL_OOL_MASKABLE(decrementer, 0x900, 0x80, IRQS_DISABLED) -EXC_VIRT_MASKABLE(decrementer, 0x4900, 0x80, 0x900, IRQS_DISABLED) +EXC_REAL_BEGIN(decrementer, 0x900, 0x80) + INT_HANDLER decrementer, 0x900, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1 +EXC_REAL_END(decrementer, 0x900, 0x80) +EXC_VIRT_BEGIN(decrementer, 0x4900, 0x80) + INT_HANDLER decrementer, 0x900, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 0 +EXC_VIRT_END(decrementer, 0x4900, 0x80) TRAMP_KVM(PACA_EXGEN, 0x900) EXC_COMMON_ASYNC(decrementer_common, 0x900, timer_interrupt) -EXC_REAL_HV(hdecrementer, 0x980, 0x80) -EXC_VIRT_HV(hdecrementer, 0x4980, 0x80, 0x980) +EXC_REAL_BEGIN(hdecrementer, 0x980, 0x80) + INT_HANDLER hdecrementer, 0x980, 0, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 +EXC_REAL_END(hdecrementer, 0x980, 0x80) +EXC_VIRT_BEGIN(hdecrementer, 0x4980, 0x80) + INT_HANDLER hdecrementer, 0x980, 0, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 +EXC_VIRT_END(hdecrementer, 0x4980, 0x80) TRAMP_KVM_HV(PACA_EXGEN, 0x980) EXC_COMMON(hdecrementer_common, 0x980, hdec_interrupt) -EXC_REAL_MASKABLE(doorbell_super, 0xa00, 0x100, IRQS_DISABLED) -EXC_VIRT_MASKABLE(doorbell_super, 0x4a00, 0x100, 0xa00, IRQS_DISABLED) +EXC_REAL_BEGIN(doorbell_super, 0xa00, 0x100) + INT_HANDLER doorbell_super, 0xa00, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1 +EXC_REAL_END(doorbell_super, 0xa00, 0x100) +EXC_VIRT_BEGIN(doorbell_super, 0x4a00, 0x100) + INT_HANDLER doorbell_super, 0xa00, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 0 +EXC_VIRT_END(doorbell_super, 0x4a00, 0x100) TRAMP_KVM(PACA_EXGEN, 0xa00) #ifdef CONFIG_PPC_DOORBELL EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, doorbell_exception) @@ -1669,7 +1528,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) EXC_REAL_BEGIN(system_call, 0xc00, 0x100) SYSTEM_CALL 0 EXC_REAL_END(system_call, 0xc00, 0x100) - EXC_VIRT_BEGIN(system_call, 0x4c00, 0x100) SYSTEM_CALL 1 EXC_VIRT_END(system_call, 0x4c00, 0x100) @@ -1699,13 +1557,22 @@ TRAMP_KVM_BEGIN(do_kvm_0xc00) #endif -EXC_REAL(single_step, 0xd00, 0x100) -EXC_VIRT(single_step, 0x4d00, 0x100, 0xd00) +EXC_REAL_BEGIN(single_step, 0xd00, 0x100) + INT_HANDLER single_step, 0xd00, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1 +EXC_REAL_END(single_step, 0xd00, 0x100) +EXC_VIRT_BEGIN(single_step, 0x4d00, 0x100) + INT_HANDLER single_step, 0xd00, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0 +EXC_VIRT_END(single_step, 0x4d00, 0x100) TRAMP_KVM(PACA_EXGEN, 0xd00) EXC_COMMON(single_step_common, 0xd00, single_step_exception) -EXC_REAL_OOL_HV(h_data_storage, 0xe00, 0x20) -EXC_VIRT_OOL_HV(h_data_storage, 0x4e00, 0x20, 0xe00) + +EXC_REAL_BEGIN(h_data_storage, 0xe00, 0x20) + INT_HANDLER h_data_storage, 0xe00, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 +EXC_REAL_END(h_data_storage, 0xe00, 0x20) +EXC_VIRT_BEGIN(h_data_storage, 0x4e00, 0x20) + INT_HANDLER h_data_storage, 0xe00, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 +EXC_VIRT_END(h_data_storage, 0x4e00, 0x20) TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0xe00) EXC_COMMON_BEGIN(h_data_storage_common) mfspr r10,SPRN_HDAR @@ -1729,14 +1596,22 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_TYPE_RADIX) b ret_from_except -EXC_REAL_OOL_HV(h_instr_storage, 0xe20, 0x20) -EXC_VIRT_OOL_HV(h_instr_storage, 0x4e20, 0x20, 0xe20) +EXC_REAL_BEGIN(h_instr_storage, 0xe20, 0x20) + INT_HANDLER h_instr_storage, 0xe20, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 +EXC_REAL_END(h_instr_storage, 0xe20, 0x20) +EXC_VIRT_BEGIN(h_instr_storage, 0x4e20, 0x20) + INT_HANDLER h_instr_storage, 0xe20, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 +EXC_VIRT_END(h_instr_storage, 0x4e20, 0x20) TRAMP_KVM_HV(PACA_EXGEN, 0xe20) EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception) -EXC_REAL_OOL_HV(emulation_assist, 0xe40, 0x20) -EXC_VIRT_OOL_HV(emulation_assist, 0x4e40, 0x20, 0xe40) +EXC_REAL_BEGIN(emulation_assist, 0xe40, 0x20) + INT_HANDLER emulation_assist, 0xe40, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 +EXC_REAL_END(emulation_assist, 0xe40, 0x20) +EXC_VIRT_BEGIN(emulation_assist, 0x4e40, 0x20) + INT_HANDLER emulation_assist, 0xe40, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 +EXC_VIRT_END(emulation_assist, 0x4e40, 0x20) TRAMP_KVM_HV(PACA_EXGEN, 0xe40) EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt) @@ -1747,15 +1622,10 @@ EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt) * mode. */ EXC_REAL_BEGIN(hmi_exception, 0xe60, 0x20) - EXCEPTION_PROLOG_0 PACA_EXGEN - b hmi_exception_early + INT_HANDLER hmi_exception, 0xe60, 1, 1, 0, EXC_HV, PACA_EXGEN, 0, 0, 0, 0, 1 EXC_REAL_END(hmi_exception, 0xe60, 0x20) EXC_VIRT_NONE(0x4e60, 0x20) TRAMP_KVM_HV(PACA_EXGEN, 0xe60) -TRAMP_REAL_BEGIN(hmi_exception_early) - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, 0xe60, 0, 0, 0 - mfctr r10 /* save ctr, even for !RELOCATABLE */ - BRANCH_TO_C000(r11, hmi_exception_early_common) EXC_COMMON_BEGIN(hmi_exception_early_common) mtctr r10 /* Restore ctr */ @@ -1782,9 +1652,7 @@ EXC_COMMON_BEGIN(hmi_exception_early_common) * firmware. */ EXCEPTION_RESTORE_REGS EXC_HV - EXCEPTION_PROLOG_0 PACA_EXGEN - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, 0xe60, 0, 0, IRQS_DISABLED - EXCEPTION_PROLOG_2_REAL hmi_exception_common, EXC_HV, 1 + INT_HANDLER hmi_exception, 0xe60, 0, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1 EXC_COMMON_BEGIN(hmi_exception_common) EXCEPTION_COMMON(PACA_EXGEN, 0xe60) @@ -1796,8 +1664,13 @@ EXC_COMMON_BEGIN(hmi_exception_common) bl handle_hmi_exception b ret_from_except -EXC_REAL_OOL_MASKABLE_HV(h_doorbell, 0xe80, 0x20, IRQS_DISABLED) -EXC_VIRT_OOL_MASKABLE_HV(h_doorbell, 0x4e80, 0x20, 0xe80, IRQS_DISABLED) + +EXC_REAL_BEGIN(h_doorbell, 0xe80, 0x20) + INT_HANDLER h_doorbell, 0xe80, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1 +EXC_REAL_END(h_doorbell, 0xe80, 0x20) +EXC_VIRT_BEGIN(h_doorbell, 0x4e80, 0x20) + INT_HANDLER h_doorbell, 0xe80, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1 +EXC_VIRT_END(h_doorbell, 0x4e80, 0x20) TRAMP_KVM_HV(PACA_EXGEN, 0xe80) #ifdef CONFIG_PPC_DOORBELL EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, doorbell_exception) @@ -1806,8 +1679,12 @@ EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, unknown_exception) #endif -EXC_REAL_OOL_MASKABLE_HV(h_virt_irq, 0xea0, 0x20, IRQS_DISABLED) -EXC_VIRT_OOL_MASKABLE_HV(h_virt_irq, 0x4ea0, 0x20, 0xea0, IRQS_DISABLED) +EXC_REAL_BEGIN(h_virt_irq, 0xea0, 0x20) + INT_HANDLER h_virt_irq, 0xea0, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1 +EXC_REAL_END(h_virt_irq, 0xea0, 0x20) +EXC_VIRT_BEGIN(h_virt_irq, 0x4ea0, 0x20) + INT_HANDLER h_virt_irq, 0xea0, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1 +EXC_VIRT_END(h_virt_irq, 0x4ea0, 0x20) TRAMP_KVM_HV(PACA_EXGEN, 0xea0) EXC_COMMON_ASYNC(h_virt_irq_common, 0xea0, do_IRQ) @@ -1818,14 +1695,22 @@ EXC_REAL_NONE(0xee0, 0x20) EXC_VIRT_NONE(0x4ee0, 0x20) -EXC_REAL_OOL_MASKABLE(performance_monitor, 0xf00, 0x20, IRQS_PMI_DISABLED) -EXC_VIRT_OOL_MASKABLE(performance_monitor, 0x4f00, 0x20, 0xf00, IRQS_PMI_DISABLED) +EXC_REAL_BEGIN(performance_monitor, 0xf00, 0x20) + INT_HANDLER performance_monitor, 0xf00, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_PMI_DISABLED, 1 +EXC_REAL_END(performance_monitor, 0xf00, 0x20) +EXC_VIRT_BEGIN(performance_monitor, 0x4f00, 0x20) + INT_HANDLER performance_monitor, 0xf00, 1, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_PMI_DISABLED, 0 +EXC_VIRT_END(performance_monitor, 0x4f00, 0x20) TRAMP_KVM(PACA_EXGEN, 0xf00) EXC_COMMON_ASYNC(performance_monitor_common, 0xf00, performance_monitor_exception) -EXC_REAL_OOL(altivec_unavailable, 0xf20, 0x20) -EXC_VIRT_OOL(altivec_unavailable, 0x4f20, 0x20, 0xf20) +EXC_REAL_BEGIN(altivec_unavailable, 0xf20, 0x20) + INT_HANDLER altivec_unavailable, 0xf20, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1 +EXC_REAL_END(altivec_unavailable, 0xf20, 0x20) +EXC_VIRT_BEGIN(altivec_unavailable, 0x4f20, 0x20) + INT_HANDLER altivec_unavailable, 0xf20, 1, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0 +EXC_VIRT_END(altivec_unavailable, 0x4f20, 0x20) TRAMP_KVM(PACA_EXGEN, 0xf20) EXC_COMMON_BEGIN(altivec_unavailable_common) EXCEPTION_COMMON(PACA_EXGEN, 0xf20) @@ -1861,8 +1746,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) b ret_from_except -EXC_REAL_OOL(vsx_unavailable, 0xf40, 0x20) -EXC_VIRT_OOL(vsx_unavailable, 0x4f40, 0x20, 0xf40) +EXC_REAL_BEGIN(vsx_unavailable, 0xf40, 0x20) + INT_HANDLER vsx_unavailable, 0xf40, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1 +EXC_REAL_END(vsx_unavailable, 0xf40, 0x20) +EXC_VIRT_BEGIN(vsx_unavailable, 0x4f40, 0x20) + INT_HANDLER vsx_unavailable, 0xf40, 1, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0 +EXC_VIRT_END(vsx_unavailable, 0x4f40, 0x20) TRAMP_KVM(PACA_EXGEN, 0xf40) EXC_COMMON_BEGIN(vsx_unavailable_common) EXCEPTION_COMMON(PACA_EXGEN, 0xf40) @@ -1897,14 +1786,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) b ret_from_except -EXC_REAL_OOL(facility_unavailable, 0xf60, 0x20) -EXC_VIRT_OOL(facility_unavailable, 0x4f60, 0x20, 0xf60) +EXC_REAL_BEGIN(facility_unavailable, 0xf60, 0x20) + INT_HANDLER facility_unavailable, 0xf60, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1 +EXC_REAL_END(facility_unavailable, 0xf60, 0x20) +EXC_VIRT_BEGIN(facility_unavailable, 0x4f60, 0x20) + INT_HANDLER facility_unavailable, 0xf60, 1, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0 +EXC_VIRT_END(facility_unavailable, 0x4f60, 0x20) TRAMP_KVM(PACA_EXGEN, 0xf60) EXC_COMMON(facility_unavailable_common, 0xf60, facility_unavailable_exception) -EXC_REAL_OOL_HV(h_facility_unavailable, 0xf80, 0x20) -EXC_VIRT_OOL_HV(h_facility_unavailable, 0x4f80, 0x20, 0xf80) +EXC_REAL_BEGIN(h_facility_unavailable, 0xf80, 0x20) + INT_HANDLER h_facility_unavailable, 0xf80, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 +EXC_REAL_END(h_facility_unavailable, 0xf80, 0x20) +EXC_VIRT_BEGIN(h_facility_unavailable, 0x4f80, 0x20) + INT_HANDLER h_facility_unavailable, 0xf80, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 +EXC_VIRT_END(h_facility_unavailable, 0x4f80, 0x20) TRAMP_KVM_HV(PACA_EXGEN, 0xf80) EXC_COMMON(h_facility_unavailable_common, 0xf80, facility_unavailable_exception) @@ -1922,7 +1819,9 @@ EXC_REAL_NONE(0x1100, 0x100) EXC_VIRT_NONE(0x5100, 0x100) #ifdef CONFIG_CBE_RAS -EXC_REAL_HV(cbe_system_error, 0x1200, 0x100) +EXC_REAL_BEGIN(cbe_system_error, 0x1200, 0x100) + INT_HANDLER cbe_system_error, 0x1200, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 +EXC_REAL_END(cbe_system_error, 0x1200, 0x100) EXC_VIRT_NONE(0x5200, 0x100) TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1200) EXC_COMMON(cbe_system_error_common, 0x1200, cbe_system_error_exception) @@ -1932,24 +1831,26 @@ EXC_VIRT_NONE(0x5200, 0x100) #endif -EXC_REAL(instruction_breakpoint, 0x1300, 0x100) -EXC_VIRT(instruction_breakpoint, 0x5300, 0x100, 0x1300) +EXC_REAL_BEGIN(instruction_breakpoint, 0x1300, 0x100) + INT_HANDLER instruction_breakpoint, 0x1300, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1 +EXC_REAL_END(instruction_breakpoint, 0x1300, 0x100) +EXC_VIRT_BEGIN(instruction_breakpoint, 0x5300, 0x100) + INT_HANDLER instruction_breakpoint, 0x1300, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0 +EXC_VIRT_END(instruction_breakpoint, 0x5300, 0x100) TRAMP_KVM_SKIP(PACA_EXGEN, 0x1300) EXC_COMMON(instruction_breakpoint_common, 0x1300, instruction_breakpoint_exception) + EXC_REAL_NONE(0x1400, 0x100) EXC_VIRT_NONE(0x5400, 0x100) EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x100) - EXCEPTION_PROLOG_0 PACA_EXGEN - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 0, 0x1500, 0, 0, 0 - + INT_HANDLER denorm_exception_hv, 0x1500, 0, 2, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 0 #ifdef CONFIG_PPC_DENORMALISATION mfspr r10,SPRN_HSRR1 andis. r10,r10,(HSRR1_DENORM)@h /* denorm? */ bne+ denorm_assist #endif - KVMTEST EXC_HV 0x1500 EXCEPTION_PROLOG_2_REAL denorm_common, EXC_HV, 1 EXC_REAL_END(denorm_exception_hv, 0x1500, 0x100) @@ -2037,7 +1938,9 @@ EXC_COMMON(denorm_common, 0x1500, unknown_exception) #ifdef CONFIG_CBE_RAS -EXC_REAL_HV(cbe_maintenance, 0x1600, 0x100) +EXC_REAL_BEGIN(cbe_maintenance, 0x1600, 0x100) + INT_HANDLER cbe_maintenance, 0x1600, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 +EXC_REAL_END(cbe_maintenance, 0x1600, 0x100) EXC_VIRT_NONE(0x5600, 0x100) TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1600) EXC_COMMON(cbe_maintenance_common, 0x1600, cbe_maintenance_exception) @@ -2047,8 +1950,12 @@ EXC_VIRT_NONE(0x5600, 0x100) #endif -EXC_REAL(altivec_assist, 0x1700, 0x100) -EXC_VIRT(altivec_assist, 0x5700, 0x100, 0x1700) +EXC_REAL_BEGIN(altivec_assist, 0x1700, 0x100) + INT_HANDLER altivec_assist, 0x1700, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1 +EXC_REAL_END(altivec_assist, 0x1700, 0x100) +EXC_VIRT_BEGIN(altivec_assist, 0x5700, 0x100) + INT_HANDLER altivec_assist, 0x1700, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0 +EXC_VIRT_END(altivec_assist, 0x5700, 0x100) TRAMP_KVM(PACA_EXGEN, 0x1700) #ifdef CONFIG_ALTIVEC EXC_COMMON(altivec_assist_common, 0x1700, altivec_assist_exception) @@ -2058,7 +1965,9 @@ EXC_COMMON(altivec_assist_common, 0x1700, unknown_exception) #ifdef CONFIG_CBE_RAS -EXC_REAL_HV(cbe_thermal, 0x1800, 0x100) +EXC_REAL_BEGIN(cbe_thermal, 0x1800, 0x100) + INT_HANDLER cbe_thermal, 0x1800, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 +EXC_REAL_END(cbe_thermal, 0x1800, 0x100) EXC_VIRT_NONE(0x5800, 0x100) TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1800) EXC_COMMON(cbe_thermal_common, 0x1800, cbe_thermal_exception) @@ -2067,6 +1976,7 @@ EXC_REAL_NONE(0x1800, 0x100) EXC_VIRT_NONE(0x5800, 0x100) #endif + #ifdef CONFIG_PPC_WATCHDOG #define MASKED_DEC_HANDLER_LABEL 3f -- cgit v1.2.3 From 9b40f62b8a49f0c922eb4b6cf6502f1307745c98 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 29 Aug 2019 23:36:08 +1000 Subject: powerpc/64s/exceptions: Use keyword params to shorten arg lists The argument lists for the INT_HANDLER macro are getting a bit unwieldy. Use keyword parameters with default values to shorten them. Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190830011426.16810-1-mpe@ellerman.id.au --- arch/powerpc/kernel/exceptions-64s.S | 120 +++++++++++++++++------------------ 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index a476bb2f80f3..10037981ff2a 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -619,7 +619,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) * - Fall through and continue executing in real, unrelocated mode. * This is done if early=2. */ -.macro INT_HANDLER name, vec, ool, early, virt, hsrr, area, ri, dar, dsisr, bitmask, kvm +.macro INT_HANDLER name, vec, ool=0, early=0, virt=0, hsrr=0, area=PACA_EXGEN, ri=1, dar=0, dsisr=0, bitmask=0, kvm=0 EXCEPTION_PROLOG_0 \area .if \ool .if !\virt @@ -812,7 +812,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #endif - INT_HANDLER system_reset, 0x100, 0, 0, 0, EXC_STD, PACA_EXNMI, 0, 0, 0, 0, 1 + INT_HANDLER system_reset, 0x100, area=PACA_EXNMI, ri=0, kvm=1 /* * MSR_RI is not enabled, because PACA_EXNMI and nmi stack is * being used, so a nested NMI exception would corrupt it. @@ -842,7 +842,7 @@ TRAMP_REAL_BEGIN(system_reset_idle_wake) */ TRAMP_REAL_BEGIN(system_reset_fwnmi) /* See comment at system_reset exception, don't turn on RI */ - INT_HANDLER system_reset, 0x100, 0, 0, 0, EXC_STD, PACA_EXNMI, 0, 0, 0, 0, 0 + INT_HANDLER system_reset, 0x100, area=PACA_EXNMI, ri=0 #endif /* CONFIG_PPC_PSERIES */ @@ -905,7 +905,7 @@ EXC_COMMON_BEGIN(system_reset_common) EXC_REAL_BEGIN(machine_check, 0x200, 0x100) - INT_HANDLER machine_check, 0x200, 0, 1, 0, EXC_STD, PACA_EXMC, 0, 1, 1, 0, 0 + INT_HANDLER machine_check, 0x200, early=1, area=PACA_EXMC, dar=1, dsisr=1 /* * MSR_RI is not enabled, because PACA_EXMC is being used, so a * nested machine check corrupts it. machine_check_common enables @@ -917,7 +917,7 @@ EXC_VIRT_NONE(0x4200, 0x100) #ifdef CONFIG_PPC_PSERIES TRAMP_REAL_BEGIN(machine_check_fwnmi) /* See comment at machine_check exception, don't turn on RI */ - INT_HANDLER machine_check, 0x200, 0, 1, 0, EXC_STD, PACA_EXMC, 0, 1, 1, 0, 0 + INT_HANDLER machine_check, 0x200, early=1, area=PACA_EXMC, dar=1, dsisr=1 #endif TRAMP_KVM_SKIP(PACA_EXMC, 0x200) @@ -1068,7 +1068,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_CFAR) MACHINE_CHECK_HANDLER_WINDUP /* See comment at machine_check exception, don't turn on RI */ - INT_HANDLER machine_check, 0x200, 0, 0, 0, EXC_STD, PACA_EXMC, 0, 1, 1, 0, 1 + INT_HANDLER machine_check, 0x200, area=PACA_EXMC, ri=0, dar=1, dsisr=1, kvm=1 EXC_COMMON_BEGIN(machine_check_common) /* @@ -1154,10 +1154,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) EXC_REAL_BEGIN(data_access, 0x300, 0x80) - INT_HANDLER data_access, 0x300, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 1, 1, 0, 1 + INT_HANDLER data_access, 0x300, ool=1, dar=1, dsisr=1, kvm=1 EXC_REAL_END(data_access, 0x300, 0x80) EXC_VIRT_BEGIN(data_access, 0x4300, 0x80) - INT_HANDLER data_access, 0x300, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 1, 1, 0, 0 + INT_HANDLER data_access, 0x300, virt=1, dar=1, dsisr=1 EXC_VIRT_END(data_access, 0x4300, 0x80) TRAMP_KVM_SKIP(PACA_EXGEN, 0x300) @@ -1185,10 +1185,10 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) - INT_HANDLER data_access_slb, 0x380, 1, 0, 0, 0, PACA_EXSLB, 1, 1, 0, 0, 1 + INT_HANDLER data_access_slb, 0x380, ool=1, area=PACA_EXSLB, dar=1, kvm=1 EXC_REAL_END(data_access_slb, 0x380, 0x80) EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) - INT_HANDLER data_access_slb, 0x380, 0, 0, 1, 0, PACA_EXSLB, 1, 1, 0, 0, 0 + INT_HANDLER data_access_slb, 0x380, virt=1, area=PACA_EXSLB, dar=1 EXC_VIRT_END(data_access_slb, 0x4380, 0x80) TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) @@ -1220,10 +1220,10 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) EXC_REAL_BEGIN(instruction_access, 0x400, 0x80) - INT_HANDLER instruction_access, 0x400, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1 + INT_HANDLER instruction_access, 0x400, kvm=1 EXC_REAL_END(instruction_access, 0x400, 0x80) EXC_VIRT_BEGIN(instruction_access, 0x4400, 0x80) - INT_HANDLER instruction_access, 0x400, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0 + INT_HANDLER instruction_access, 0x400, virt=1 EXC_VIRT_END(instruction_access, 0x4400, 0x80) TRAMP_KVM(PACA_EXGEN, 0x400) @@ -1245,10 +1245,10 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) - INT_HANDLER instruction_access_slb, 0x480, 0, 0, 0, EXC_STD, PACA_EXSLB, 1, 0, 0, 0, 1 + INT_HANDLER instruction_access_slb, 0x480, area=PACA_EXSLB, kvm=1 EXC_REAL_END(instruction_access_slb, 0x480, 0x80) EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) - INT_HANDLER instruction_access_slb, 0x480, 0, 0, 1, EXC_STD, PACA_EXSLB, 1, 0, 0, 0, 0 + INT_HANDLER instruction_access_slb, 0x480, virt=1, area=PACA_EXSLB EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) TRAMP_KVM(PACA_EXSLB, 0x480) @@ -1277,10 +1277,10 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) b ret_from_except EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x100) - INT_HANDLER hardware_interrupt, 0x500, 0, 0, 0, EXC_HV_OR_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1 + INT_HANDLER hardware_interrupt, 0x500, hsrr=EXC_HV_OR_STD, bitmask=IRQS_DISABLED, kvm=1 EXC_REAL_END(hardware_interrupt, 0x500, 0x100) EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100) - INT_HANDLER hardware_interrupt, 0x500, 0, 0, 1, EXC_HV_OR_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1 + INT_HANDLER hardware_interrupt, 0x500, virt=1, hsrr=EXC_HV_OR_STD, bitmask=IRQS_DISABLED, kvm=1 EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100) TRAMP_KVM(PACA_EXGEN, 0x500) @@ -1289,10 +1289,10 @@ EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ) EXC_REAL_BEGIN(alignment, 0x600, 0x100) - INT_HANDLER alignment, 0x600, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 1, 1, 0, 1 + INT_HANDLER alignment, 0x600, dar=1, dsisr=1, kvm=1 EXC_REAL_END(alignment, 0x600, 0x100) EXC_VIRT_BEGIN(alignment, 0x4600, 0x100) - INT_HANDLER alignment, 0x600, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 1, 1, 0, 0 + INT_HANDLER alignment, 0x600, virt=1, dar=1, dsisr=1 EXC_VIRT_END(alignment, 0x4600, 0x100) TRAMP_KVM(PACA_EXGEN, 0x600) @@ -1310,10 +1310,10 @@ EXC_COMMON_BEGIN(alignment_common) EXC_REAL_BEGIN(program_check, 0x700, 0x100) - INT_HANDLER program_check, 0x700, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1 + INT_HANDLER program_check, 0x700, kvm=1 EXC_REAL_END(program_check, 0x700, 0x100) EXC_VIRT_BEGIN(program_check, 0x4700, 0x100) - INT_HANDLER program_check, 0x700, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0 + INT_HANDLER program_check, 0x700, virt=1 EXC_VIRT_END(program_check, 0x4700, 0x100) TRAMP_KVM(PACA_EXGEN, 0x700) EXC_COMMON_BEGIN(program_check_common) @@ -1352,10 +1352,10 @@ EXC_COMMON_BEGIN(program_check_common) EXC_REAL_BEGIN(fp_unavailable, 0x800, 0x100) - INT_HANDLER fp_unavailable, 0x800, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1 + INT_HANDLER fp_unavailable, 0x800, kvm=1 EXC_REAL_END(fp_unavailable, 0x800, 0x100) EXC_VIRT_BEGIN(fp_unavailable, 0x4800, 0x100) - INT_HANDLER fp_unavailable, 0x800, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0 + INT_HANDLER fp_unavailable, 0x800, virt=1 EXC_VIRT_END(fp_unavailable, 0x4800, 0x100) TRAMP_KVM(PACA_EXGEN, 0x800) EXC_COMMON_BEGIN(fp_unavailable_common) @@ -1390,30 +1390,30 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM) EXC_REAL_BEGIN(decrementer, 0x900, 0x80) - INT_HANDLER decrementer, 0x900, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1 + INT_HANDLER decrementer, 0x900, ool=1, bitmask=IRQS_DISABLED, kvm=1 EXC_REAL_END(decrementer, 0x900, 0x80) EXC_VIRT_BEGIN(decrementer, 0x4900, 0x80) - INT_HANDLER decrementer, 0x900, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 0 + INT_HANDLER decrementer, 0x900, virt=1, bitmask=IRQS_DISABLED EXC_VIRT_END(decrementer, 0x4900, 0x80) TRAMP_KVM(PACA_EXGEN, 0x900) EXC_COMMON_ASYNC(decrementer_common, 0x900, timer_interrupt) EXC_REAL_BEGIN(hdecrementer, 0x980, 0x80) - INT_HANDLER hdecrementer, 0x980, 0, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 + INT_HANDLER hdecrementer, 0x980, hsrr=EXC_HV, kvm=1 EXC_REAL_END(hdecrementer, 0x980, 0x80) EXC_VIRT_BEGIN(hdecrementer, 0x4980, 0x80) - INT_HANDLER hdecrementer, 0x980, 0, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 + INT_HANDLER hdecrementer, 0x980, virt=1, hsrr=EXC_HV, kvm=1 EXC_VIRT_END(hdecrementer, 0x4980, 0x80) TRAMP_KVM_HV(PACA_EXGEN, 0x980) EXC_COMMON(hdecrementer_common, 0x980, hdec_interrupt) EXC_REAL_BEGIN(doorbell_super, 0xa00, 0x100) - INT_HANDLER doorbell_super, 0xa00, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1 + INT_HANDLER doorbell_super, 0xa00, bitmask=IRQS_DISABLED, kvm=1 EXC_REAL_END(doorbell_super, 0xa00, 0x100) EXC_VIRT_BEGIN(doorbell_super, 0x4a00, 0x100) - INT_HANDLER doorbell_super, 0xa00, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 0 + INT_HANDLER doorbell_super, 0xa00, virt=1, bitmask=IRQS_DISABLED EXC_VIRT_END(doorbell_super, 0x4a00, 0x100) TRAMP_KVM(PACA_EXGEN, 0xa00) #ifdef CONFIG_PPC_DOORBELL @@ -1558,20 +1558,20 @@ TRAMP_KVM_BEGIN(do_kvm_0xc00) EXC_REAL_BEGIN(single_step, 0xd00, 0x100) - INT_HANDLER single_step, 0xd00, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1 + INT_HANDLER single_step, 0xd00, kvm=1 EXC_REAL_END(single_step, 0xd00, 0x100) EXC_VIRT_BEGIN(single_step, 0x4d00, 0x100) - INT_HANDLER single_step, 0xd00, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0 + INT_HANDLER single_step, 0xd00, virt=1 EXC_VIRT_END(single_step, 0x4d00, 0x100) TRAMP_KVM(PACA_EXGEN, 0xd00) EXC_COMMON(single_step_common, 0xd00, single_step_exception) EXC_REAL_BEGIN(h_data_storage, 0xe00, 0x20) - INT_HANDLER h_data_storage, 0xe00, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 + INT_HANDLER h_data_storage, 0xe00, ool=1, hsrr=EXC_HV, kvm=1 EXC_REAL_END(h_data_storage, 0xe00, 0x20) EXC_VIRT_BEGIN(h_data_storage, 0x4e00, 0x20) - INT_HANDLER h_data_storage, 0xe00, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 + INT_HANDLER h_data_storage, 0xe00, ool=1, virt=1, hsrr=EXC_HV, kvm=1 EXC_VIRT_END(h_data_storage, 0x4e00, 0x20) TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0xe00) EXC_COMMON_BEGIN(h_data_storage_common) @@ -1597,20 +1597,20 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_TYPE_RADIX) EXC_REAL_BEGIN(h_instr_storage, 0xe20, 0x20) - INT_HANDLER h_instr_storage, 0xe20, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 + INT_HANDLER h_instr_storage, 0xe20, ool=1, hsrr=EXC_HV, kvm=1 EXC_REAL_END(h_instr_storage, 0xe20, 0x20) EXC_VIRT_BEGIN(h_instr_storage, 0x4e20, 0x20) - INT_HANDLER h_instr_storage, 0xe20, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 + INT_HANDLER h_instr_storage, 0xe20, ool=1, virt=1, hsrr=EXC_HV, kvm=1 EXC_VIRT_END(h_instr_storage, 0x4e20, 0x20) TRAMP_KVM_HV(PACA_EXGEN, 0xe20) EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception) EXC_REAL_BEGIN(emulation_assist, 0xe40, 0x20) - INT_HANDLER emulation_assist, 0xe40, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 + INT_HANDLER emulation_assist, 0xe40, ool=1, hsrr=EXC_HV, kvm=1 EXC_REAL_END(emulation_assist, 0xe40, 0x20) EXC_VIRT_BEGIN(emulation_assist, 0x4e40, 0x20) - INT_HANDLER emulation_assist, 0xe40, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 + INT_HANDLER emulation_assist, 0xe40, ool=1, virt=1, hsrr=EXC_HV, kvm=1 EXC_VIRT_END(emulation_assist, 0x4e40, 0x20) TRAMP_KVM_HV(PACA_EXGEN, 0xe40) EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt) @@ -1622,7 +1622,7 @@ EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt) * mode. */ EXC_REAL_BEGIN(hmi_exception, 0xe60, 0x20) - INT_HANDLER hmi_exception, 0xe60, 1, 1, 0, EXC_HV, PACA_EXGEN, 0, 0, 0, 0, 1 + INT_HANDLER hmi_exception, 0xe60, ool=1, early=1, hsrr=EXC_HV, ri=0, kvm=1 EXC_REAL_END(hmi_exception, 0xe60, 0x20) EXC_VIRT_NONE(0x4e60, 0x20) TRAMP_KVM_HV(PACA_EXGEN, 0xe60) @@ -1652,7 +1652,7 @@ EXC_COMMON_BEGIN(hmi_exception_early_common) * firmware. */ EXCEPTION_RESTORE_REGS EXC_HV - INT_HANDLER hmi_exception, 0xe60, 0, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1 + INT_HANDLER hmi_exception, 0xe60, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 EXC_COMMON_BEGIN(hmi_exception_common) EXCEPTION_COMMON(PACA_EXGEN, 0xe60) @@ -1666,10 +1666,10 @@ EXC_COMMON_BEGIN(hmi_exception_common) EXC_REAL_BEGIN(h_doorbell, 0xe80, 0x20) - INT_HANDLER h_doorbell, 0xe80, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1 + INT_HANDLER h_doorbell, 0xe80, ool=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 EXC_REAL_END(h_doorbell, 0xe80, 0x20) EXC_VIRT_BEGIN(h_doorbell, 0x4e80, 0x20) - INT_HANDLER h_doorbell, 0xe80, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1 + INT_HANDLER h_doorbell, 0xe80, ool=1, virt=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 EXC_VIRT_END(h_doorbell, 0x4e80, 0x20) TRAMP_KVM_HV(PACA_EXGEN, 0xe80) #ifdef CONFIG_PPC_DOORBELL @@ -1680,10 +1680,10 @@ EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, unknown_exception) EXC_REAL_BEGIN(h_virt_irq, 0xea0, 0x20) - INT_HANDLER h_virt_irq, 0xea0, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1 + INT_HANDLER h_virt_irq, 0xea0, ool=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 EXC_REAL_END(h_virt_irq, 0xea0, 0x20) EXC_VIRT_BEGIN(h_virt_irq, 0x4ea0, 0x20) - INT_HANDLER h_virt_irq, 0xea0, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, IRQS_DISABLED, 1 + INT_HANDLER h_virt_irq, 0xea0, ool=1, virt=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 EXC_VIRT_END(h_virt_irq, 0x4ea0, 0x20) TRAMP_KVM_HV(PACA_EXGEN, 0xea0) EXC_COMMON_ASYNC(h_virt_irq_common, 0xea0, do_IRQ) @@ -1696,20 +1696,20 @@ EXC_VIRT_NONE(0x4ee0, 0x20) EXC_REAL_BEGIN(performance_monitor, 0xf00, 0x20) - INT_HANDLER performance_monitor, 0xf00, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_PMI_DISABLED, 1 + INT_HANDLER performance_monitor, 0xf00, ool=1, bitmask=IRQS_PMI_DISABLED, kvm=1 EXC_REAL_END(performance_monitor, 0xf00, 0x20) EXC_VIRT_BEGIN(performance_monitor, 0x4f00, 0x20) - INT_HANDLER performance_monitor, 0xf00, 1, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, IRQS_PMI_DISABLED, 0 + INT_HANDLER performance_monitor, 0xf00, ool=1, virt=1, bitmask=IRQS_PMI_DISABLED EXC_VIRT_END(performance_monitor, 0x4f00, 0x20) TRAMP_KVM(PACA_EXGEN, 0xf00) EXC_COMMON_ASYNC(performance_monitor_common, 0xf00, performance_monitor_exception) EXC_REAL_BEGIN(altivec_unavailable, 0xf20, 0x20) - INT_HANDLER altivec_unavailable, 0xf20, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1 + INT_HANDLER altivec_unavailable, 0xf20, ool=1, kvm=1 EXC_REAL_END(altivec_unavailable, 0xf20, 0x20) EXC_VIRT_BEGIN(altivec_unavailable, 0x4f20, 0x20) - INT_HANDLER altivec_unavailable, 0xf20, 1, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0 + INT_HANDLER altivec_unavailable, 0xf20, ool=1, virt=1 EXC_VIRT_END(altivec_unavailable, 0x4f20, 0x20) TRAMP_KVM(PACA_EXGEN, 0xf20) EXC_COMMON_BEGIN(altivec_unavailable_common) @@ -1747,10 +1747,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) EXC_REAL_BEGIN(vsx_unavailable, 0xf40, 0x20) - INT_HANDLER vsx_unavailable, 0xf40, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1 + INT_HANDLER vsx_unavailable, 0xf40, ool=1, kvm=1 EXC_REAL_END(vsx_unavailable, 0xf40, 0x20) EXC_VIRT_BEGIN(vsx_unavailable, 0x4f40, 0x20) - INT_HANDLER vsx_unavailable, 0xf40, 1, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0 + INT_HANDLER vsx_unavailable, 0xf40, ool=1, virt=1 EXC_VIRT_END(vsx_unavailable, 0x4f40, 0x20) TRAMP_KVM(PACA_EXGEN, 0xf40) EXC_COMMON_BEGIN(vsx_unavailable_common) @@ -1787,20 +1787,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) EXC_REAL_BEGIN(facility_unavailable, 0xf60, 0x20) - INT_HANDLER facility_unavailable, 0xf60, 1, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1 + INT_HANDLER facility_unavailable, 0xf60, ool=1, kvm=1 EXC_REAL_END(facility_unavailable, 0xf60, 0x20) EXC_VIRT_BEGIN(facility_unavailable, 0x4f60, 0x20) - INT_HANDLER facility_unavailable, 0xf60, 1, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0 + INT_HANDLER facility_unavailable, 0xf60, ool=1, virt=1 EXC_VIRT_END(facility_unavailable, 0x4f60, 0x20) TRAMP_KVM(PACA_EXGEN, 0xf60) EXC_COMMON(facility_unavailable_common, 0xf60, facility_unavailable_exception) EXC_REAL_BEGIN(h_facility_unavailable, 0xf80, 0x20) - INT_HANDLER h_facility_unavailable, 0xf80, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 + INT_HANDLER h_facility_unavailable, 0xf80, ool=1, hsrr=EXC_HV, kvm=1 EXC_REAL_END(h_facility_unavailable, 0xf80, 0x20) EXC_VIRT_BEGIN(h_facility_unavailable, 0x4f80, 0x20) - INT_HANDLER h_facility_unavailable, 0xf80, 1, 0, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 + INT_HANDLER h_facility_unavailable, 0xf80, ool=1, virt=1, hsrr=EXC_HV, kvm=1 EXC_VIRT_END(h_facility_unavailable, 0x4f80, 0x20) TRAMP_KVM_HV(PACA_EXGEN, 0xf80) EXC_COMMON(h_facility_unavailable_common, 0xf80, facility_unavailable_exception) @@ -1820,7 +1820,7 @@ EXC_VIRT_NONE(0x5100, 0x100) #ifdef CONFIG_CBE_RAS EXC_REAL_BEGIN(cbe_system_error, 0x1200, 0x100) - INT_HANDLER cbe_system_error, 0x1200, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 + INT_HANDLER cbe_system_error, 0x1200, ool=1, hsrr=EXC_HV, kvm=1 EXC_REAL_END(cbe_system_error, 0x1200, 0x100) EXC_VIRT_NONE(0x5200, 0x100) TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1200) @@ -1832,10 +1832,10 @@ EXC_VIRT_NONE(0x5200, 0x100) EXC_REAL_BEGIN(instruction_breakpoint, 0x1300, 0x100) - INT_HANDLER instruction_breakpoint, 0x1300, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1 + INT_HANDLER instruction_breakpoint, 0x1300, kvm=1 EXC_REAL_END(instruction_breakpoint, 0x1300, 0x100) EXC_VIRT_BEGIN(instruction_breakpoint, 0x5300, 0x100) - INT_HANDLER instruction_breakpoint, 0x1300, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0 + INT_HANDLER instruction_breakpoint, 0x1300, virt=1 EXC_VIRT_END(instruction_breakpoint, 0x5300, 0x100) TRAMP_KVM_SKIP(PACA_EXGEN, 0x1300) EXC_COMMON(instruction_breakpoint_common, 0x1300, instruction_breakpoint_exception) @@ -1845,7 +1845,7 @@ EXC_REAL_NONE(0x1400, 0x100) EXC_VIRT_NONE(0x5400, 0x100) EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x100) - INT_HANDLER denorm_exception_hv, 0x1500, 0, 2, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 0 + INT_HANDLER denorm_exception_hv, 0x1500, early=2, hsrr=EXC_HV #ifdef CONFIG_PPC_DENORMALISATION mfspr r10,SPRN_HSRR1 andis. r10,r10,(HSRR1_DENORM)@h /* denorm? */ @@ -1939,7 +1939,7 @@ EXC_COMMON(denorm_common, 0x1500, unknown_exception) #ifdef CONFIG_CBE_RAS EXC_REAL_BEGIN(cbe_maintenance, 0x1600, 0x100) - INT_HANDLER cbe_maintenance, 0x1600, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 + INT_HANDLER cbe_maintenance, 0x1600, ool=1, hsrr=EXC_HV, kvm=1 EXC_REAL_END(cbe_maintenance, 0x1600, 0x100) EXC_VIRT_NONE(0x5600, 0x100) TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1600) @@ -1951,10 +1951,10 @@ EXC_VIRT_NONE(0x5600, 0x100) EXC_REAL_BEGIN(altivec_assist, 0x1700, 0x100) - INT_HANDLER altivec_assist, 0x1700, 0, 0, 0, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 1 + INT_HANDLER altivec_assist, 0x1700, kvm=1 EXC_REAL_END(altivec_assist, 0x1700, 0x100) EXC_VIRT_BEGIN(altivec_assist, 0x5700, 0x100) - INT_HANDLER altivec_assist, 0x1700, 0, 0, 1, EXC_STD, PACA_EXGEN, 1, 0, 0, 0, 0 + INT_HANDLER altivec_assist, 0x1700, virt=1 EXC_VIRT_END(altivec_assist, 0x5700, 0x100) TRAMP_KVM(PACA_EXGEN, 0x1700) #ifdef CONFIG_ALTIVEC @@ -1966,7 +1966,7 @@ EXC_COMMON(altivec_assist_common, 0x1700, unknown_exception) #ifdef CONFIG_CBE_RAS EXC_REAL_BEGIN(cbe_thermal, 0x1800, 0x100) - INT_HANDLER cbe_thermal, 0x1800, 1, 0, 0, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 1 + INT_HANDLER cbe_thermal, 0x1800, ool=1, hsrr=EXC_HV, kvm=1 EXC_REAL_END(cbe_thermal, 0x1800, 0x100) EXC_VIRT_NONE(0x5800, 0x100) TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1800) -- cgit v1.2.3 From d29768e13cf6b50bd54a690e2ac52ab71465e2eb Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:48 +1000 Subject: powerpc/64s/exception: remove EXCEPTION_PROLOG_0/1, rename _2 EXCEPTION_PROLOG_0 and _1 have only a single caller, so expand them into it. Rename EXCEPTION_PROLOG_2_REAL to INT_SAVE_SRR_AND_JUMP and EXCEPTION_PROLOG_2_VIRT to INT_VIRT_SAVE_SRR_AND_JUMP, which are more descriptive. No generated code change except BUG line number constants. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-24-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 151 +++++++++++++++++------------------ 1 file changed, 73 insertions(+), 78 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 10037981ff2a..1ae2a8d59aa0 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -180,77 +180,7 @@ BEGIN_FTR_SECTION_NESTED(943) \ std ra,offset(r13); \ END_FTR_SECTION_NESTED(ftr,ftr,943) -.macro EXCEPTION_PROLOG_0 area - SET_SCRATCH0(r13) /* save r13 */ - GET_PACA(r13) - std r9,\area\()+EX_R9(r13) /* save r9 */ - OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR) - HMT_MEDIUM - std r10,\area\()+EX_R10(r13) /* save r10 - r12 */ - OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR) -.endm - -.macro EXCEPTION_PROLOG_1 hsrr, area, kvm, vec, dar, dsisr, bitmask - OPT_SAVE_REG_TO_PACA(\area\()+EX_PPR, r9, CPU_FTR_HAS_PPR) - OPT_SAVE_REG_TO_PACA(\area\()+EX_CFAR, r10, CPU_FTR_CFAR) - INTERRUPT_TO_KERNEL - SAVE_CTR(r10, \area\()) - mfcr r9 - .if \kvm - KVMTEST \hsrr \vec - .endif - .if \bitmask - lbz r10,PACAIRQSOFTMASK(r13) - andi. r10,r10,\bitmask - /* Associate vector numbers with bits in paca->irq_happened */ - .if \vec == 0x500 || \vec == 0xea0 - li r10,PACA_IRQ_EE - .elseif \vec == 0x900 - li r10,PACA_IRQ_DEC - .elseif \vec == 0xa00 || \vec == 0xe80 - li r10,PACA_IRQ_DBELL - .elseif \vec == 0xe60 - li r10,PACA_IRQ_HMI - .elseif \vec == 0xf00 - li r10,PACA_IRQ_PMI - .else - .abort "Bad maskable vector" - .endif - - .if \hsrr == EXC_HV_OR_STD - BEGIN_FTR_SECTION - bne masked_Hinterrupt - FTR_SECTION_ELSE - bne masked_interrupt - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - bne masked_Hinterrupt - .else - bne masked_interrupt - .endif - .endif - - std r11,\area\()+EX_R11(r13) - std r12,\area\()+EX_R12(r13) - - /* - * DAR/DSISR, SCRATCH0 must be read before setting MSR[RI], - * because a d-side MCE will clobber those registers so is - * not recoverable if they are live. - */ - GET_SCRATCH0(r10) - std r10,\area\()+EX_R13(r13) - .if \dar - mfspr r10,SPRN_DAR - std r10,\area\()+EX_DAR(r13) - .endif - .if \dsisr - mfspr r10,SPRN_DSISR - stw r10,\area\()+EX_DSISR(r13) - .endif -.endm - -.macro EXCEPTION_PROLOG_2_REAL label, hsrr, set_ri +.macro INT_SAVE_SRR_AND_JUMP label, hsrr, set_ri ld r10,PACAKMSR(r13) /* get MSR value for kernel */ .if ! \set_ri xori r10,r10,MSR_RI /* Clear MSR_RI */ @@ -293,7 +223,8 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) b . /* prevent speculative execution */ .endm -.macro EXCEPTION_PROLOG_2_VIRT label, hsrr +/* INT_SAVE_SRR_AND_JUMP works for real or virt, this is faster but virt only */ +.macro INT_VIRT_SAVE_SRR_AND_JUMP label, hsrr #ifdef CONFIG_RELOCATABLE .if \hsrr == EXC_HV_OR_STD BEGIN_FTR_SECTION @@ -620,7 +551,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) * This is done if early=2. */ .macro INT_HANDLER name, vec, ool=0, early=0, virt=0, hsrr=0, area=PACA_EXGEN, ri=1, dar=0, dsisr=0, bitmask=0, kvm=0 - EXCEPTION_PROLOG_0 \area + SET_SCRATCH0(r13) /* save r13 */ + GET_PACA(r13) + std r9,\area\()+EX_R9(r13) /* save r9 */ + OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR) + HMT_MEDIUM + std r10,\area\()+EX_R10(r13) /* save r10 - r12 */ + OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR) .if \ool .if !\virt b tramp_real_\name @@ -632,16 +569,74 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) TRAMP_VIRT_BEGIN(tramp_virt_\name) .endif .endif - EXCEPTION_PROLOG_1 \hsrr, \area, \kvm, \vec, \dar, \dsisr, \bitmask + + OPT_SAVE_REG_TO_PACA(\area\()+EX_PPR, r9, CPU_FTR_HAS_PPR) + OPT_SAVE_REG_TO_PACA(\area\()+EX_CFAR, r10, CPU_FTR_CFAR) + INTERRUPT_TO_KERNEL + SAVE_CTR(r10, \area\()) + mfcr r9 + .if \kvm + KVMTEST \hsrr \vec + .endif + .if \bitmask + lbz r10,PACAIRQSOFTMASK(r13) + andi. r10,r10,\bitmask + /* Associate vector numbers with bits in paca->irq_happened */ + .if \vec == 0x500 || \vec == 0xea0 + li r10,PACA_IRQ_EE + .elseif \vec == 0x900 + li r10,PACA_IRQ_DEC + .elseif \vec == 0xa00 || \vec == 0xe80 + li r10,PACA_IRQ_DBELL + .elseif \vec == 0xe60 + li r10,PACA_IRQ_HMI + .elseif \vec == 0xf00 + li r10,PACA_IRQ_PMI + .else + .abort "Bad maskable vector" + .endif + + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + bne masked_Hinterrupt + FTR_SECTION_ELSE + bne masked_interrupt + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr + bne masked_Hinterrupt + .else + bne masked_interrupt + .endif + .endif + + std r11,\area\()+EX_R11(r13) + std r12,\area\()+EX_R12(r13) + + /* + * DAR/DSISR, SCRATCH0 must be read before setting MSR[RI], + * because a d-side MCE will clobber those registers so is + * not recoverable if they are live. + */ + GET_SCRATCH0(r10) + std r10,\area\()+EX_R13(r13) + .if \dar + mfspr r10,SPRN_DAR + std r10,\area\()+EX_DAR(r13) + .endif + .if \dsisr + mfspr r10,SPRN_DSISR + stw r10,\area\()+EX_DSISR(r13) + .endif + .if \early == 2 /* nothing more */ .elseif \early mfctr r10 /* save ctr, even for !RELOCATABLE */ BRANCH_TO_C000(r11, \name\()_early_common) .elseif !\virt - EXCEPTION_PROLOG_2_REAL \name\()_common, \hsrr, \ri + INT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr, \ri .else - EXCEPTION_PROLOG_2_VIRT \name\()_common, \hsrr + INT_VIRT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr .endif .if \ool .popsection @@ -1852,7 +1847,7 @@ EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x100) bne+ denorm_assist #endif KVMTEST EXC_HV 0x1500 - EXCEPTION_PROLOG_2_REAL denorm_common, EXC_HV, 1 + INT_SAVE_SRR_AND_JUMP denorm_common, EXC_HV, 1 EXC_REAL_END(denorm_exception_hv, 0x1500, 0x100) #ifdef CONFIG_PPC_DENORMALISATION @@ -1986,7 +1981,7 @@ EXC_VIRT_NONE(0x5800, 0x100) std r12,PACA_EXGEN+EX_R12(r13); \ GET_SCRATCH0(r10); \ std r10,PACA_EXGEN+EX_R13(r13); \ - EXCEPTION_PROLOG_2_REAL soft_nmi_common, _H, 1 + INT_SAVE_SRR_AND_JUMP soft_nmi_common, _H, 1 /* * Branch to soft_nmi_interrupt using the emergency stack. The emergency -- cgit v1.2.3 From 52b989231c6fad42dff57b69a53f38756db48e06 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:49 +1000 Subject: powerpc/64s/exception: Add the virt variant of the denorm interrupt handler All other virt handlers have the prolog code in the virt vector rather than branch to the real vector. Follow this pattern in the denorm virt handler. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-25-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 1ae2a8d59aa0..d2aa63b6a8a8 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1852,7 +1852,11 @@ EXC_REAL_END(denorm_exception_hv, 0x1500, 0x100) #ifdef CONFIG_PPC_DENORMALISATION EXC_VIRT_BEGIN(denorm_exception, 0x5500, 0x100) - b exc_real_0x1500_denorm_exception_hv + INT_HANDLER denorm_exception, 0x1500, 0, 2, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 0 + mfspr r10,SPRN_HSRR1 + andis. r10,r10,(HSRR1_DENORM)@h /* denorm? */ + bne+ denorm_assist + INT_VIRT_SAVE_SRR_AND_JUMP denorm_common, EXC_HV EXC_VIRT_END(denorm_exception, 0x5500, 0x100) #else EXC_VIRT_NONE(0x5500, 0x100) -- cgit v1.2.3 From 4515c5fa41936088a57efe0b64d1bb46a4943582 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:50 +1000 Subject: powerpc/64s/exception: INT_HANDLER support HDAR/HDSISR and use it in HDSI Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-26-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index d2aa63b6a8a8..476e4bbf1bf5 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -620,11 +620,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) GET_SCRATCH0(r10) std r10,\area\()+EX_R13(r13) .if \dar + .if \hsrr + mfspr r10,SPRN_HDAR + .else mfspr r10,SPRN_DAR + .endif std r10,\area\()+EX_DAR(r13) .endif .if \dsisr + .if \hsrr + mfspr r10,SPRN_HDSISR + .else mfspr r10,SPRN_DSISR + .endif stw r10,\area\()+EX_DSISR(r13) .endif @@ -1563,17 +1571,13 @@ EXC_COMMON(single_step_common, 0xd00, single_step_exception) EXC_REAL_BEGIN(h_data_storage, 0xe00, 0x20) - INT_HANDLER h_data_storage, 0xe00, ool=1, hsrr=EXC_HV, kvm=1 + INT_HANDLER h_data_storage, 0xe00, ool=1, hsrr=EXC_HV, dar=1, dsisr=1, kvm=1 EXC_REAL_END(h_data_storage, 0xe00, 0x20) EXC_VIRT_BEGIN(h_data_storage, 0x4e00, 0x20) - INT_HANDLER h_data_storage, 0xe00, ool=1, virt=1, hsrr=EXC_HV, kvm=1 + INT_HANDLER h_data_storage, 0xe00, ool=1, virt=1, hsrr=EXC_HV, dar=1, dsisr=1, kvm=1 EXC_VIRT_END(h_data_storage, 0x4e00, 0x20) TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0xe00) EXC_COMMON_BEGIN(h_data_storage_common) - mfspr r10,SPRN_HDAR - std r10,PACA_EXGEN+EX_DAR(r13) - mfspr r10,SPRN_HDSISR - stw r10,PACA_EXGEN+EX_DSISR(r13) EXCEPTION_COMMON(PACA_EXGEN, 0xe00) bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) -- cgit v1.2.3 From 141fed2669a93604bd5ce8b793d85f4798626ef5 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:51 +1000 Subject: powerpc/64s/exception: Add INT_KVM_HANDLER gas macro Replace the 4 variants of cpp macros with one gas macro. No generated code change except BUG line number constants. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-27-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 99 +++++++++++++++--------------------- 1 file changed, 40 insertions(+), 59 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 476e4bbf1bf5..cff48d212011 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -651,22 +651,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) .endif .endm - -#define TRAMP_KVM(area, n) \ - TRAMP_KVM_BEGIN(do_kvm_##n); \ - KVM_HANDLER area, EXC_STD, n, 0 - -#define TRAMP_KVM_SKIP(area, n) \ - TRAMP_KVM_BEGIN(do_kvm_##n); \ - KVM_HANDLER area, EXC_STD, n, 1 - -#define TRAMP_KVM_HV(area, n) \ - TRAMP_KVM_BEGIN(do_kvm_H##n); \ - KVM_HANDLER area, EXC_HV, n, 0 - -#define TRAMP_KVM_HV_SKIP(area, n) \ - TRAMP_KVM_BEGIN(do_kvm_H##n); \ - KVM_HANDLER area, EXC_HV, n, 1 +.macro INT_KVM_HANDLER vec, hsrr, area, skip + .if \hsrr + TRAMP_KVM_BEGIN(do_kvm_H\vec\()) + .else + TRAMP_KVM_BEGIN(do_kvm_\vec\()) + .endif + KVM_HANDLER \area, \hsrr, \vec, \skip +.endm #define EXC_COMMON(name, realvec, hdlr) \ EXC_COMMON_BEGIN(name); \ @@ -827,9 +819,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) * be dangerous anyway. */ EXC_REAL_END(system_reset, 0x100, 0x100) - EXC_VIRT_NONE(0x4100, 0x100) -TRAMP_KVM(PACA_EXNMI, 0x100) +INT_KVM_HANDLER 0x100, EXC_STD, PACA_EXNMI, 0 #ifdef CONFIG_PPC_P7_NAP TRAMP_REAL_BEGIN(system_reset_idle_wake) @@ -923,7 +914,7 @@ TRAMP_REAL_BEGIN(machine_check_fwnmi) INT_HANDLER machine_check, 0x200, early=1, area=PACA_EXMC, dar=1, dsisr=1 #endif -TRAMP_KVM_SKIP(PACA_EXMC, 0x200) +INT_KVM_HANDLER 0x200, EXC_STD, PACA_EXMC, 1 #define MACHINE_CHECK_HANDLER_WINDUP \ /* Clear MSR_RI before setting SRR0 and SRR1. */\ @@ -1162,9 +1153,7 @@ EXC_REAL_END(data_access, 0x300, 0x80) EXC_VIRT_BEGIN(data_access, 0x4300, 0x80) INT_HANDLER data_access, 0x300, virt=1, dar=1, dsisr=1 EXC_VIRT_END(data_access, 0x4300, 0x80) - -TRAMP_KVM_SKIP(PACA_EXGEN, 0x300) - +INT_KVM_HANDLER 0x300, EXC_STD, PACA_EXGEN, 1 EXC_COMMON_BEGIN(data_access_common) /* * Here r13 points to the paca, r9 contains the saved CR, @@ -1193,9 +1182,7 @@ EXC_REAL_END(data_access_slb, 0x380, 0x80) EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) INT_HANDLER data_access_slb, 0x380, virt=1, area=PACA_EXSLB, dar=1 EXC_VIRT_END(data_access_slb, 0x4380, 0x80) - -TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) - +INT_KVM_HANDLER 0x380, EXC_STD, PACA_EXSLB, 1 EXC_COMMON_BEGIN(data_access_slb_common) EXCEPTION_COMMON(PACA_EXSLB, 0x380) ld r4,PACA_EXSLB+EX_DAR(r13) @@ -1228,9 +1215,7 @@ EXC_REAL_END(instruction_access, 0x400, 0x80) EXC_VIRT_BEGIN(instruction_access, 0x4400, 0x80) INT_HANDLER instruction_access, 0x400, virt=1 EXC_VIRT_END(instruction_access, 0x4400, 0x80) - -TRAMP_KVM(PACA_EXGEN, 0x400) - +INT_KVM_HANDLER 0x400, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(instruction_access_common) EXCEPTION_COMMON(PACA_EXGEN, 0x400) RECONCILE_IRQ_STATE(r10, r11) @@ -1253,8 +1238,7 @@ EXC_REAL_END(instruction_access_slb, 0x480, 0x80) EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) INT_HANDLER instruction_access_slb, 0x480, virt=1, area=PACA_EXSLB EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) -TRAMP_KVM(PACA_EXSLB, 0x480) - +INT_KVM_HANDLER 0x480, EXC_STD, PACA_EXSLB, 0 EXC_COMMON_BEGIN(instruction_access_slb_common) EXCEPTION_COMMON(PACA_EXSLB, 0x480) ld r4,_NIP(r1) @@ -1285,9 +1269,8 @@ EXC_REAL_END(hardware_interrupt, 0x500, 0x100) EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100) INT_HANDLER hardware_interrupt, 0x500, virt=1, hsrr=EXC_HV_OR_STD, bitmask=IRQS_DISABLED, kvm=1 EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100) - -TRAMP_KVM(PACA_EXGEN, 0x500) -TRAMP_KVM_HV(PACA_EXGEN, 0x500) +INT_KVM_HANDLER 0x500, EXC_STD, PACA_EXGEN, 0 +INT_KVM_HANDLER 0x500, EXC_HV, PACA_EXGEN, 0 EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ) @@ -1297,8 +1280,7 @@ EXC_REAL_END(alignment, 0x600, 0x100) EXC_VIRT_BEGIN(alignment, 0x4600, 0x100) INT_HANDLER alignment, 0x600, virt=1, dar=1, dsisr=1 EXC_VIRT_END(alignment, 0x4600, 0x100) - -TRAMP_KVM(PACA_EXGEN, 0x600) +INT_KVM_HANDLER 0x600, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(alignment_common) EXCEPTION_COMMON(PACA_EXGEN, 0x600) ld r3,PACA_EXGEN+EX_DAR(r13) @@ -1318,7 +1300,7 @@ EXC_REAL_END(program_check, 0x700, 0x100) EXC_VIRT_BEGIN(program_check, 0x4700, 0x100) INT_HANDLER program_check, 0x700, virt=1 EXC_VIRT_END(program_check, 0x4700, 0x100) -TRAMP_KVM(PACA_EXGEN, 0x700) +INT_KVM_HANDLER 0x700, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(program_check_common) /* * It's possible to receive a TM Bad Thing type program check with @@ -1360,7 +1342,7 @@ EXC_REAL_END(fp_unavailable, 0x800, 0x100) EXC_VIRT_BEGIN(fp_unavailable, 0x4800, 0x100) INT_HANDLER fp_unavailable, 0x800, virt=1 EXC_VIRT_END(fp_unavailable, 0x4800, 0x100) -TRAMP_KVM(PACA_EXGEN, 0x800) +INT_KVM_HANDLER 0x800, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(fp_unavailable_common) EXCEPTION_COMMON(PACA_EXGEN, 0x800) bne 1f /* if from user, just load it up */ @@ -1398,7 +1380,7 @@ EXC_REAL_END(decrementer, 0x900, 0x80) EXC_VIRT_BEGIN(decrementer, 0x4900, 0x80) INT_HANDLER decrementer, 0x900, virt=1, bitmask=IRQS_DISABLED EXC_VIRT_END(decrementer, 0x4900, 0x80) -TRAMP_KVM(PACA_EXGEN, 0x900) +INT_KVM_HANDLER 0x900, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_ASYNC(decrementer_common, 0x900, timer_interrupt) @@ -1408,7 +1390,7 @@ EXC_REAL_END(hdecrementer, 0x980, 0x80) EXC_VIRT_BEGIN(hdecrementer, 0x4980, 0x80) INT_HANDLER hdecrementer, 0x980, virt=1, hsrr=EXC_HV, kvm=1 EXC_VIRT_END(hdecrementer, 0x4980, 0x80) -TRAMP_KVM_HV(PACA_EXGEN, 0x980) +INT_KVM_HANDLER 0x980, EXC_HV, PACA_EXGEN, 0 EXC_COMMON(hdecrementer_common, 0x980, hdec_interrupt) @@ -1418,7 +1400,7 @@ EXC_REAL_END(doorbell_super, 0xa00, 0x100) EXC_VIRT_BEGIN(doorbell_super, 0x4a00, 0x100) INT_HANDLER doorbell_super, 0xa00, virt=1, bitmask=IRQS_DISABLED EXC_VIRT_END(doorbell_super, 0x4a00, 0x100) -TRAMP_KVM(PACA_EXGEN, 0xa00) +INT_KVM_HANDLER 0xa00, EXC_STD, PACA_EXGEN, 0 #ifdef CONFIG_PPC_DOORBELL EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, doorbell_exception) #else @@ -1566,7 +1548,7 @@ EXC_REAL_END(single_step, 0xd00, 0x100) EXC_VIRT_BEGIN(single_step, 0x4d00, 0x100) INT_HANDLER single_step, 0xd00, virt=1 EXC_VIRT_END(single_step, 0x4d00, 0x100) -TRAMP_KVM(PACA_EXGEN, 0xd00) +INT_KVM_HANDLER 0xd00, EXC_STD, PACA_EXGEN, 0 EXC_COMMON(single_step_common, 0xd00, single_step_exception) @@ -1576,7 +1558,7 @@ EXC_REAL_END(h_data_storage, 0xe00, 0x20) EXC_VIRT_BEGIN(h_data_storage, 0x4e00, 0x20) INT_HANDLER h_data_storage, 0xe00, ool=1, virt=1, hsrr=EXC_HV, dar=1, dsisr=1, kvm=1 EXC_VIRT_END(h_data_storage, 0x4e00, 0x20) -TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0xe00) +INT_KVM_HANDLER 0xe00, EXC_HV, PACA_EXGEN, 1 EXC_COMMON_BEGIN(h_data_storage_common) EXCEPTION_COMMON(PACA_EXGEN, 0xe00) bl save_nvgprs @@ -1601,7 +1583,7 @@ EXC_REAL_END(h_instr_storage, 0xe20, 0x20) EXC_VIRT_BEGIN(h_instr_storage, 0x4e20, 0x20) INT_HANDLER h_instr_storage, 0xe20, ool=1, virt=1, hsrr=EXC_HV, kvm=1 EXC_VIRT_END(h_instr_storage, 0x4e20, 0x20) -TRAMP_KVM_HV(PACA_EXGEN, 0xe20) +INT_KVM_HANDLER 0xe20, EXC_HV, PACA_EXGEN, 0 EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception) @@ -1611,7 +1593,7 @@ EXC_REAL_END(emulation_assist, 0xe40, 0x20) EXC_VIRT_BEGIN(emulation_assist, 0x4e40, 0x20) INT_HANDLER emulation_assist, 0xe40, ool=1, virt=1, hsrr=EXC_HV, kvm=1 EXC_VIRT_END(emulation_assist, 0x4e40, 0x20) -TRAMP_KVM_HV(PACA_EXGEN, 0xe40) +INT_KVM_HANDLER 0xe40, EXC_HV, PACA_EXGEN, 0 EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt) @@ -1624,8 +1606,7 @@ EXC_REAL_BEGIN(hmi_exception, 0xe60, 0x20) INT_HANDLER hmi_exception, 0xe60, ool=1, early=1, hsrr=EXC_HV, ri=0, kvm=1 EXC_REAL_END(hmi_exception, 0xe60, 0x20) EXC_VIRT_NONE(0x4e60, 0x20) -TRAMP_KVM_HV(PACA_EXGEN, 0xe60) - +INT_KVM_HANDLER 0xe60, EXC_HV, PACA_EXGEN, 0 EXC_COMMON_BEGIN(hmi_exception_early_common) mtctr r10 /* Restore ctr */ mfspr r11,SPRN_HSRR0 /* Save HSRR0 */ @@ -1670,7 +1651,7 @@ EXC_REAL_END(h_doorbell, 0xe80, 0x20) EXC_VIRT_BEGIN(h_doorbell, 0x4e80, 0x20) INT_HANDLER h_doorbell, 0xe80, ool=1, virt=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 EXC_VIRT_END(h_doorbell, 0x4e80, 0x20) -TRAMP_KVM_HV(PACA_EXGEN, 0xe80) +INT_KVM_HANDLER 0xe80, EXC_HV, PACA_EXGEN, 0 #ifdef CONFIG_PPC_DOORBELL EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, doorbell_exception) #else @@ -1684,7 +1665,7 @@ EXC_REAL_END(h_virt_irq, 0xea0, 0x20) EXC_VIRT_BEGIN(h_virt_irq, 0x4ea0, 0x20) INT_HANDLER h_virt_irq, 0xea0, ool=1, virt=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 EXC_VIRT_END(h_virt_irq, 0x4ea0, 0x20) -TRAMP_KVM_HV(PACA_EXGEN, 0xea0) +INT_KVM_HANDLER 0xea0, EXC_HV, PACA_EXGEN, 0 EXC_COMMON_ASYNC(h_virt_irq_common, 0xea0, do_IRQ) @@ -1700,7 +1681,7 @@ EXC_REAL_END(performance_monitor, 0xf00, 0x20) EXC_VIRT_BEGIN(performance_monitor, 0x4f00, 0x20) INT_HANDLER performance_monitor, 0xf00, ool=1, virt=1, bitmask=IRQS_PMI_DISABLED EXC_VIRT_END(performance_monitor, 0x4f00, 0x20) -TRAMP_KVM(PACA_EXGEN, 0xf00) +INT_KVM_HANDLER 0xf00, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_ASYNC(performance_monitor_common, 0xf00, performance_monitor_exception) @@ -1710,7 +1691,7 @@ EXC_REAL_END(altivec_unavailable, 0xf20, 0x20) EXC_VIRT_BEGIN(altivec_unavailable, 0x4f20, 0x20) INT_HANDLER altivec_unavailable, 0xf20, ool=1, virt=1 EXC_VIRT_END(altivec_unavailable, 0x4f20, 0x20) -TRAMP_KVM(PACA_EXGEN, 0xf20) +INT_KVM_HANDLER 0xf20, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(altivec_unavailable_common) EXCEPTION_COMMON(PACA_EXGEN, 0xf20) #ifdef CONFIG_ALTIVEC @@ -1751,7 +1732,7 @@ EXC_REAL_END(vsx_unavailable, 0xf40, 0x20) EXC_VIRT_BEGIN(vsx_unavailable, 0x4f40, 0x20) INT_HANDLER vsx_unavailable, 0xf40, ool=1, virt=1 EXC_VIRT_END(vsx_unavailable, 0x4f40, 0x20) -TRAMP_KVM(PACA_EXGEN, 0xf40) +INT_KVM_HANDLER 0xf40, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(vsx_unavailable_common) EXCEPTION_COMMON(PACA_EXGEN, 0xf40) #ifdef CONFIG_VSX @@ -1791,7 +1772,7 @@ EXC_REAL_END(facility_unavailable, 0xf60, 0x20) EXC_VIRT_BEGIN(facility_unavailable, 0x4f60, 0x20) INT_HANDLER facility_unavailable, 0xf60, ool=1, virt=1 EXC_VIRT_END(facility_unavailable, 0x4f60, 0x20) -TRAMP_KVM(PACA_EXGEN, 0xf60) +INT_KVM_HANDLER 0xf60, EXC_STD, PACA_EXGEN, 0 EXC_COMMON(facility_unavailable_common, 0xf60, facility_unavailable_exception) @@ -1801,7 +1782,7 @@ EXC_REAL_END(h_facility_unavailable, 0xf80, 0x20) EXC_VIRT_BEGIN(h_facility_unavailable, 0x4f80, 0x20) INT_HANDLER h_facility_unavailable, 0xf80, ool=1, virt=1, hsrr=EXC_HV, kvm=1 EXC_VIRT_END(h_facility_unavailable, 0x4f80, 0x20) -TRAMP_KVM_HV(PACA_EXGEN, 0xf80) +INT_KVM_HANDLER 0xf80, EXC_HV, PACA_EXGEN, 0 EXC_COMMON(h_facility_unavailable_common, 0xf80, facility_unavailable_exception) @@ -1822,7 +1803,7 @@ EXC_REAL_BEGIN(cbe_system_error, 0x1200, 0x100) INT_HANDLER cbe_system_error, 0x1200, ool=1, hsrr=EXC_HV, kvm=1 EXC_REAL_END(cbe_system_error, 0x1200, 0x100) EXC_VIRT_NONE(0x5200, 0x100) -TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1200) +INT_KVM_HANDLER 0x1200, EXC_HV, PACA_EXGEN, 1 EXC_COMMON(cbe_system_error_common, 0x1200, cbe_system_error_exception) #else /* CONFIG_CBE_RAS */ EXC_REAL_NONE(0x1200, 0x100) @@ -1836,7 +1817,7 @@ EXC_REAL_END(instruction_breakpoint, 0x1300, 0x100) EXC_VIRT_BEGIN(instruction_breakpoint, 0x5300, 0x100) INT_HANDLER instruction_breakpoint, 0x1300, virt=1 EXC_VIRT_END(instruction_breakpoint, 0x5300, 0x100) -TRAMP_KVM_SKIP(PACA_EXGEN, 0x1300) +INT_KVM_HANDLER 0x1300, EXC_STD, PACA_EXGEN, 1 EXC_COMMON(instruction_breakpoint_common, 0x1300, instruction_breakpoint_exception) @@ -1866,7 +1847,7 @@ EXC_VIRT_END(denorm_exception, 0x5500, 0x100) EXC_VIRT_NONE(0x5500, 0x100) #endif -TRAMP_KVM_HV(PACA_EXGEN, 0x1500) +INT_KVM_HANDLER 0x1500, EXC_HV, PACA_EXGEN, 0 #ifdef CONFIG_PPC_DENORMALISATION TRAMP_REAL_BEGIN(denorm_assist) @@ -1945,7 +1926,7 @@ EXC_REAL_BEGIN(cbe_maintenance, 0x1600, 0x100) INT_HANDLER cbe_maintenance, 0x1600, ool=1, hsrr=EXC_HV, kvm=1 EXC_REAL_END(cbe_maintenance, 0x1600, 0x100) EXC_VIRT_NONE(0x5600, 0x100) -TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1600) +INT_KVM_HANDLER 0x1600, EXC_HV, PACA_EXGEN, 1 EXC_COMMON(cbe_maintenance_common, 0x1600, cbe_maintenance_exception) #else /* CONFIG_CBE_RAS */ EXC_REAL_NONE(0x1600, 0x100) @@ -1959,7 +1940,7 @@ EXC_REAL_END(altivec_assist, 0x1700, 0x100) EXC_VIRT_BEGIN(altivec_assist, 0x5700, 0x100) INT_HANDLER altivec_assist, 0x1700, virt=1 EXC_VIRT_END(altivec_assist, 0x5700, 0x100) -TRAMP_KVM(PACA_EXGEN, 0x1700) +INT_KVM_HANDLER 0x1700, EXC_STD, PACA_EXGEN, 0 #ifdef CONFIG_ALTIVEC EXC_COMMON(altivec_assist_common, 0x1700, altivec_assist_exception) #else @@ -1972,7 +1953,7 @@ EXC_REAL_BEGIN(cbe_thermal, 0x1800, 0x100) INT_HANDLER cbe_thermal, 0x1800, ool=1, hsrr=EXC_HV, kvm=1 EXC_REAL_END(cbe_thermal, 0x1800, 0x100) EXC_VIRT_NONE(0x5800, 0x100) -TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1800) +INT_KVM_HANDLER 0x1800, EXC_HV, PACA_EXGEN, 1 EXC_COMMON(cbe_thermal_common, 0x1800, cbe_thermal_exception) #else /* CONFIG_CBE_RAS */ EXC_REAL_NONE(0x1800, 0x100) -- cgit v1.2.3 From 7027d53d1ab17d28b65913148585d5a331446b8b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:52 +1000 Subject: powerpc/64s/exception: KVM_HANDLER reorder arguments to match other macros Also change argument name (n -> vec) to match others. No generated code change. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-28-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index cff48d212011..d37420dee447 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -316,7 +316,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) .endif .endm -.macro KVM_HANDLER area, hsrr, n, skip +.macro KVM_HANDLER vec, hsrr, area, skip .if \skip cmpwi r10,KVM_GUEST_MODE_SKIP beq 89f @@ -337,14 +337,14 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) /* HSRR variants have the 0x2 bit added to their trap number */ .if \hsrr == EXC_HV_OR_STD BEGIN_FTR_SECTION - ori r12,r12,(\n + 0x2) + ori r12,r12,(\vec + 0x2) FTR_SECTION_ELSE - ori r12,r12,(\n) + ori r12,r12,(\vec) ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) .elseif \hsrr - ori r12,r12,(\n + 0x2) + ori r12,r12,(\vec + 0x2) .else - ori r12,r12,(\n) + ori r12,r12,(\vec) .endif #ifdef CONFIG_RELOCATABLE @@ -386,7 +386,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) #else .macro KVMTEST hsrr, n .endm -.macro KVM_HANDLER area, hsrr, n, skip +.macro KVM_HANDLER vec, hsrr, area, skip .endm #endif @@ -657,7 +657,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) .else TRAMP_KVM_BEGIN(do_kvm_\vec\()) .endif - KVM_HANDLER \area, \hsrr, \vec, \skip + KVM_HANDLER \vec, \hsrr, \area, \skip .endm #define EXC_COMMON(name, realvec, hdlr) \ @@ -1538,7 +1538,7 @@ TRAMP_KVM_BEGIN(do_kvm_0xc00) SET_SCRATCH0(r10) std r9,PACA_EXGEN+EX_R9(r13) mfcr r9 - KVM_HANDLER PACA_EXGEN, EXC_STD, 0xc00, 0 + KVM_HANDLER 0xc00, EXC_STD, PACA_EXGEN, 0 #endif -- cgit v1.2.3 From 9a9c739aa83d031da7468028de8a65608146eccc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:53 +1000 Subject: powerpc/64s/exception: Merge EXCEPTION_PROLOG_COMMON_2/3 Merge EXCEPTION_PROLOG_COMMON_3 into EXCEPTION_PROLOG_COMMON_2. No generated code change except BUG line number constants. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-29-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index d37420dee447..0643ae57badc 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -399,7 +399,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) std r10,GPR1(r1); /* save r1 in stackframe */ \ /* Save original regs values from save area to stack frame. */ -#define EXCEPTION_PROLOG_COMMON_2(area) \ +#define EXCEPTION_PROLOG_COMMON_2(area, trap) \ ld r9,area+EX_R9(r13); /* move r9, r10 to stackframe */ \ ld r10,area+EX_R10(r13); \ std r9,GPR9(r1); \ @@ -415,9 +415,7 @@ BEGIN_FTR_SECTION_NESTED(66); \ std r10,ORIG_GPR3(r1); \ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66); \ GET_CTR(r10, area); \ - std r10,_CTR(r1); - -#define EXCEPTION_PROLOG_COMMON_3(trap) \ + std r10,_CTR(r1); \ std r2,GPR2(r1); /* save r2 in stackframe */ \ SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe */ \ @@ -453,8 +451,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66); \ beq 4f; /* if from kernel mode */ \ ACCOUNT_CPU_USER_ENTRY(r13, r9, r10); \ SAVE_PPR(area, r9); \ -4: EXCEPTION_PROLOG_COMMON_2(area); \ - EXCEPTION_PROLOG_COMMON_3(trap); \ +4: EXCEPTION_PROLOG_COMMON_2(area, trap); \ ACCOUNT_STOLEN_TIME /* @@ -464,8 +461,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66); \ #define EXCEPTION_COMMON_STACK(area, trap) \ EXCEPTION_PROLOG_COMMON_1(); \ kuap_save_amr_and_lock r9, r10, cr1; \ - EXCEPTION_PROLOG_COMMON_2(area); \ - EXCEPTION_PROLOG_COMMON_3(trap) + EXCEPTION_PROLOG_COMMON_2(area, trap) /* * Restore all registers including H/SRR0/1 saved in a stack frame of a @@ -968,8 +964,7 @@ EXC_COMMON_BEGIN(machine_check_early_common) EXCEPTION_PROLOG_COMMON_1() /* We don't touch AMR here, we never go to virtual mode */ - EXCEPTION_PROLOG_COMMON_2(PACA_EXMC) - EXCEPTION_PROLOG_COMMON_3(0x200) + EXCEPTION_PROLOG_COMMON_2(PACA_EXMC, 0x200) ld r3,PACA_EXMC+EX_DAR(r13) lwz r4,PACA_EXMC+EX_DSISR(r13) @@ -1616,8 +1611,7 @@ EXC_COMMON_BEGIN(hmi_exception_early_common) subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ EXCEPTION_PROLOG_COMMON_1() /* We don't touch AMR here, we never go to virtual mode */ - EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN) - EXCEPTION_PROLOG_COMMON_3(0xe60) + EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN, 0xe60) addi r3,r1,STACK_FRAME_OVERHEAD bl hmi_exception_realmode cmpdi cr0,r3,0 -- cgit v1.2.3 From bcbceed40a8c355b48678d90c5c407dfca811f0e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:54 +1000 Subject: powerpc/64s/exception: Add INT_COMMON gas macro to generate common exception code No generated code change except BUG line number constants. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-30-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 52 ++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 0643ae57badc..7829a6ad99aa 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -463,6 +463,18 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66); \ kuap_save_amr_and_lock r9, r10, cr1; \ EXCEPTION_PROLOG_COMMON_2(area, trap) +.macro INT_COMMON vec, area, stack, kaup + .if \stack + EXCEPTION_COMMON(\area, \vec) + .else + EXCEPTION_PROLOG_COMMON_1() + .if \kaup + kuap_save_amr_and_lock r9, r10, cr1 + .endif + EXCEPTION_PROLOG_COMMON_2(\area, \vec) + .endif +.endm + /* * Restore all registers including H/SRR0/1 saved in a stack frame of a * standard exception. @@ -658,7 +670,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) #define EXC_COMMON(name, realvec, hdlr) \ EXC_COMMON_BEGIN(name); \ - EXCEPTION_COMMON(PACA_EXGEN, realvec); \ + INT_COMMON realvec, PACA_EXGEN, 1, 1 ; \ bl save_nvgprs; \ RECONCILE_IRQ_STATE(r10, r11); \ addi r3,r1,STACK_FRAME_OVERHEAD; \ @@ -671,7 +683,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) */ #define EXC_COMMON_ASYNC(name, realvec, hdlr) \ EXC_COMMON_BEGIN(name); \ - EXCEPTION_COMMON(PACA_EXGEN, realvec); \ + INT_COMMON realvec, PACA_EXGEN, 1, 1 ; \ FINISH_NAP; \ RECONCILE_IRQ_STATE(r10, r11); \ RUNLATCH_ON; \ @@ -852,7 +864,7 @@ EXC_COMMON_BEGIN(system_reset_common) mr r10,r1 ld r1,PACA_NMI_EMERG_SP(r13) subi r1,r1,INT_FRAME_SIZE - EXCEPTION_COMMON_STACK(PACA_EXNMI, 0x100) + INT_COMMON 0x100, PACA_EXNMI, 0, 1 bl save_nvgprs /* * Set IRQS_ALL_DISABLED unconditionally so arch_irqs_disabled does @@ -962,9 +974,8 @@ EXC_COMMON_BEGIN(machine_check_early_common) bgt cr1,unrecoverable_mce /* Check if we hit limit of 4 */ subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ - EXCEPTION_PROLOG_COMMON_1() /* We don't touch AMR here, we never go to virtual mode */ - EXCEPTION_PROLOG_COMMON_2(PACA_EXMC, 0x200) + INT_COMMON 0x200, PACA_EXMC, 0, 0 ld r3,PACA_EXMC+EX_DAR(r13) lwz r4,PACA_EXMC+EX_DSISR(r13) @@ -1064,7 +1075,7 @@ EXC_COMMON_BEGIN(machine_check_common) * Machine check is different because we use a different * save area: PACA_EXMC instead of PACA_EXGEN. */ - EXCEPTION_COMMON(PACA_EXMC, 0x200) + INT_COMMON 0x200, PACA_EXMC, 1, 1 FINISH_NAP RECONCILE_IRQ_STATE(r10, r11) ld r3,PACA_EXMC+EX_DAR(r13) @@ -1156,7 +1167,7 @@ EXC_COMMON_BEGIN(data_access_common) * r9 - r13 are saved in paca->exgen. * EX_DAR and EX_DSISR have saved DAR/DSISR */ - EXCEPTION_COMMON(PACA_EXGEN, 0x300) + INT_COMMON 0x300, PACA_EXGEN, 1, 1 RECONCILE_IRQ_STATE(r10, r11) ld r12,_MSR(r1) ld r3,PACA_EXGEN+EX_DAR(r13) @@ -1179,7 +1190,7 @@ EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) EXC_VIRT_END(data_access_slb, 0x4380, 0x80) INT_KVM_HANDLER 0x380, EXC_STD, PACA_EXSLB, 1 EXC_COMMON_BEGIN(data_access_slb_common) - EXCEPTION_COMMON(PACA_EXSLB, 0x380) + INT_COMMON 0x380, PACA_EXSLB, 1, 1 ld r4,PACA_EXSLB+EX_DAR(r13) std r4,_DAR(r1) addi r3,r1,STACK_FRAME_OVERHEAD @@ -1212,7 +1223,7 @@ EXC_VIRT_BEGIN(instruction_access, 0x4400, 0x80) EXC_VIRT_END(instruction_access, 0x4400, 0x80) INT_KVM_HANDLER 0x400, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(instruction_access_common) - EXCEPTION_COMMON(PACA_EXGEN, 0x400) + INT_COMMON 0x400, PACA_EXGEN, 1, 1 RECONCILE_IRQ_STATE(r10, r11) ld r12,_MSR(r1) ld r3,_NIP(r1) @@ -1235,7 +1246,7 @@ EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) INT_KVM_HANDLER 0x480, EXC_STD, PACA_EXSLB, 0 EXC_COMMON_BEGIN(instruction_access_slb_common) - EXCEPTION_COMMON(PACA_EXSLB, 0x480) + INT_COMMON 0x480, PACA_EXSLB, 1, 1 ld r4,_NIP(r1) addi r3,r1,STACK_FRAME_OVERHEAD BEGIN_MMU_FTR_SECTION @@ -1277,7 +1288,7 @@ EXC_VIRT_BEGIN(alignment, 0x4600, 0x100) EXC_VIRT_END(alignment, 0x4600, 0x100) INT_KVM_HANDLER 0x600, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(alignment_common) - EXCEPTION_COMMON(PACA_EXGEN, 0x600) + INT_COMMON 0x600, PACA_EXGEN, 1, 1 ld r3,PACA_EXGEN+EX_DAR(r13) lwz r4,PACA_EXGEN+EX_DSISR(r13) std r3,_DAR(r1) @@ -1323,7 +1334,7 @@ EXC_COMMON_BEGIN(program_check_common) subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ b 3f /* Jump into the macro !! */ 2: - EXCEPTION_COMMON(PACA_EXGEN, 0x700) + INT_COMMON 0x700, PACA_EXGEN, 1, 1 bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD @@ -1339,7 +1350,7 @@ EXC_VIRT_BEGIN(fp_unavailable, 0x4800, 0x100) EXC_VIRT_END(fp_unavailable, 0x4800, 0x100) INT_KVM_HANDLER 0x800, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(fp_unavailable_common) - EXCEPTION_COMMON(PACA_EXGEN, 0x800) + INT_COMMON 0x800, PACA_EXGEN, 1, 1 bne 1f /* if from user, just load it up */ bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) @@ -1555,7 +1566,7 @@ EXC_VIRT_BEGIN(h_data_storage, 0x4e00, 0x20) EXC_VIRT_END(h_data_storage, 0x4e00, 0x20) INT_KVM_HANDLER 0xe00, EXC_HV, PACA_EXGEN, 1 EXC_COMMON_BEGIN(h_data_storage_common) - EXCEPTION_COMMON(PACA_EXGEN, 0xe00) + INT_COMMON 0xe00, PACA_EXGEN, 1, 1 bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD @@ -1609,9 +1620,10 @@ EXC_COMMON_BEGIN(hmi_exception_early_common) mr r10,r1 /* Save r1 */ ld r1,PACAEMERGSP(r13) /* Use emergency stack for realmode */ subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ - EXCEPTION_PROLOG_COMMON_1() + /* We don't touch AMR here, we never go to virtual mode */ - EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN, 0xe60) + INT_COMMON 0xe60, PACA_EXGEN, 0, 0 + addi r3,r1,STACK_FRAME_OVERHEAD bl hmi_exception_realmode cmpdi cr0,r3,0 @@ -1629,7 +1641,7 @@ EXC_COMMON_BEGIN(hmi_exception_early_common) INT_HANDLER hmi_exception, 0xe60, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 EXC_COMMON_BEGIN(hmi_exception_common) - EXCEPTION_COMMON(PACA_EXGEN, 0xe60) + INT_COMMON 0xe60, PACA_EXGEN, 1, 1 FINISH_NAP bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) @@ -1687,7 +1699,7 @@ EXC_VIRT_BEGIN(altivec_unavailable, 0x4f20, 0x20) EXC_VIRT_END(altivec_unavailable, 0x4f20, 0x20) INT_KVM_HANDLER 0xf20, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(altivec_unavailable_common) - EXCEPTION_COMMON(PACA_EXGEN, 0xf20) + INT_COMMON 0xf20, PACA_EXGEN, 1, 1 #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION beq 1f @@ -1728,7 +1740,7 @@ EXC_VIRT_BEGIN(vsx_unavailable, 0x4f40, 0x20) EXC_VIRT_END(vsx_unavailable, 0x4f40, 0x20) INT_KVM_HANDLER 0xf40, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(vsx_unavailable_common) - EXCEPTION_COMMON(PACA_EXGEN, 0xf40) + INT_COMMON 0xf40, PACA_EXGEN, 1, 1 #ifdef CONFIG_VSX BEGIN_FTR_SECTION beq 1f @@ -1979,7 +1991,7 @@ EXC_COMMON_BEGIN(soft_nmi_common) mr r10,r1 ld r1,PACAEMERGSP(r13) subi r1,r1,INT_FRAME_SIZE - EXCEPTION_COMMON_STACK(PACA_EXGEN, 0x900) + INT_COMMON 0x900, PACA_EXGEN, 0, 1 bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD -- cgit v1.2.3 From 5d5e0edfd5fa2a60d0f9b8d6ece1a5ce51aae3b5 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:55 +1000 Subject: powerpc/64s/exception: Expand EXCEPTION_COMMON macro into caller No generated code change except BUG line number constants. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-31-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 54 ++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7829a6ad99aa..492786604b10 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -437,41 +437,41 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66); \ * On entry r13 points to the paca, r9-r13 are saved in the paca, * r9 contains the saved CR, r11 and r12 contain the saved SRR0 and * SRR1, and relocation is on. + * + * If stack=0, then the stack is already set in r1, and r1 is saved in r10. + * PPR save and CPU accounting is not done for the !stack case (XXX why not?) */ -#define EXCEPTION_COMMON(area, trap) \ - andi. r10,r12,MSR_PR; /* See if coming from user */ \ - mr r10,r1; /* Save r1 */ \ - subi r1,r1,INT_FRAME_SIZE; /* alloc frame on kernel stack */ \ - beq- 1f; \ - ld r1,PACAKSAVE(r13); /* kernel stack to use */ \ -1: tdgei r1,-INT_FRAME_SIZE; /* trap if r1 is in userspace */ \ - EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0; \ -3: EXCEPTION_PROLOG_COMMON_1(); \ - kuap_save_amr_and_lock r9, r10, cr1, cr0; \ - beq 4f; /* if from kernel mode */ \ - ACCOUNT_CPU_USER_ENTRY(r13, r9, r10); \ - SAVE_PPR(area, r9); \ -4: EXCEPTION_PROLOG_COMMON_2(area, trap); \ - ACCOUNT_STOLEN_TIME - -/* - * Exception where stack is already set in r1, r1 is saved in r10. - * PPR save and CPU accounting is not done (for some reason). - */ -#define EXCEPTION_COMMON_STACK(area, trap) \ - EXCEPTION_PROLOG_COMMON_1(); \ - kuap_save_amr_and_lock r9, r10, cr1; \ - EXCEPTION_PROLOG_COMMON_2(area, trap) - .macro INT_COMMON vec, area, stack, kaup .if \stack - EXCEPTION_COMMON(\area, \vec) - .else + andi. r10,r12,MSR_PR /* See if coming from user */ + mr r10,r1 /* Save r1 */ + subi r1,r1,INT_FRAME_SIZE /* alloc frame on kernel stack */ + beq- 1f + ld r1,PACAKSAVE(r13) /* kernel stack to use */ +1: tdgei r1,-INT_FRAME_SIZE /* trap if r1 is in userspace */ + EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0 +3: + .endif EXCEPTION_PROLOG_COMMON_1() + + .if \stack + .if \kaup + kuap_save_amr_and_lock r9, r10, cr1, cr0 + .endif + beq 4f /* if from kernel mode */ + ACCOUNT_CPU_USER_ENTRY(r13, r9, r10) + SAVE_PPR(\area, r9) +4: + .else .if \kaup kuap_save_amr_and_lock r9, r10, cr1 .endif + .endif + EXCEPTION_PROLOG_COMMON_2(\area, \vec) + + .if \stack + ACCOUNT_STOLEN_TIME .endif .endm -- cgit v1.2.3 From 8c9fb5d4f3ddf02fb0fa3dec2dffd6b007ac894c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:56 +1000 Subject: powerpc/64s/exception: Expand EXCEPTION_PROLOG_COMMON_1 and 2 into caller No generated code change except BUG line number constants. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-32-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 85 +++++++++++++++++------------------- 1 file changed, 40 insertions(+), 45 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 492786604b10..7a4f215c4c5b 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -390,49 +390,6 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) .endm #endif -#define EXCEPTION_PROLOG_COMMON_1() \ - std r9,_CCR(r1); /* save CR in stackframe */ \ - std r11,_NIP(r1); /* save SRR0 in stackframe */ \ - std r12,_MSR(r1); /* save SRR1 in stackframe */ \ - std r10,0(r1); /* make stack chain pointer */ \ - std r0,GPR0(r1); /* save r0 in stackframe */ \ - std r10,GPR1(r1); /* save r1 in stackframe */ \ - -/* Save original regs values from save area to stack frame. */ -#define EXCEPTION_PROLOG_COMMON_2(area, trap) \ - ld r9,area+EX_R9(r13); /* move r9, r10 to stackframe */ \ - ld r10,area+EX_R10(r13); \ - std r9,GPR9(r1); \ - std r10,GPR10(r1); \ - ld r9,area+EX_R11(r13); /* move r11 - r13 to stackframe */ \ - ld r10,area+EX_R12(r13); \ - ld r11,area+EX_R13(r13); \ - std r9,GPR11(r1); \ - std r10,GPR12(r1); \ - std r11,GPR13(r1); \ -BEGIN_FTR_SECTION_NESTED(66); \ - ld r10,area+EX_CFAR(r13); \ - std r10,ORIG_GPR3(r1); \ -END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66); \ - GET_CTR(r10, area); \ - std r10,_CTR(r1); \ - std r2,GPR2(r1); /* save r2 in stackframe */ \ - SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ - SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe */ \ - mflr r9; /* Get LR, later save to stack */ \ - ld r2,PACATOC(r13); /* get kernel TOC into r2 */ \ - std r9,_LINK(r1); \ - lbz r10,PACAIRQSOFTMASK(r13); \ - mfspr r11,SPRN_XER; /* save XER in stackframe */ \ - std r10,SOFTE(r1); \ - std r11,_XER(r1); \ - li r9,(trap)+1; \ - std r9,_TRAP(r1); /* set trap number */ \ - li r10,0; \ - ld r11,exception_marker@toc(r2); \ - std r10,RESULT(r1); /* clear regs->result */ \ - std r11,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */ - /* * On entry r13 points to the paca, r9-r13 are saved in the paca, * r9 contains the saved CR, r11 and r12 contain the saved SRR0 and @@ -452,7 +409,13 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66); \ EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0 3: .endif - EXCEPTION_PROLOG_COMMON_1() + + std r9,_CCR(r1) /* save CR in stackframe */ + std r11,_NIP(r1) /* save SRR0 in stackframe */ + std r12,_MSR(r1) /* save SRR1 in stackframe */ + std r10,0(r1) /* make stack chain pointer */ + std r0,GPR0(r1) /* save r0 in stackframe */ + std r10,GPR1(r1) /* save r1 in stackframe */ .if \stack .if \kaup @@ -468,7 +431,39 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66); \ .endif .endif - EXCEPTION_PROLOG_COMMON_2(\area, \vec) + /* Save original regs values from save area to stack frame. */ + ld r9,\area+EX_R9(r13) /* move r9, r10 to stackframe */ + ld r10,\area+EX_R10(r13) + std r9,GPR9(r1) + std r10,GPR10(r1) + ld r9,\area+EX_R11(r13) /* move r11 - r13 to stackframe */ + ld r10,\area+EX_R12(r13) + ld r11,\area+EX_R13(r13) + std r9,GPR11(r1) + std r10,GPR12(r1) + std r11,GPR13(r1) +BEGIN_FTR_SECTION_NESTED(66) + ld r10,\area+EX_CFAR(r13) + std r10,ORIG_GPR3(r1) +END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66) + GET_CTR(r10, \area) + std r10,_CTR(r1) + std r2,GPR2(r1) /* save r2 in stackframe */ + SAVE_4GPRS(3, r1) /* save r3 - r6 in stackframe */ + SAVE_2GPRS(7, r1) /* save r7, r8 in stackframe */ + mflr r9 /* Get LR, later save to stack */ + ld r2,PACATOC(r13) /* get kernel TOC into r2 */ + std r9,_LINK(r1) + lbz r10,PACAIRQSOFTMASK(r13) + mfspr r11,SPRN_XER /* save XER in stackframe */ + std r10,SOFTE(r1) + std r11,_XER(r1) + li r9,(\vec)+1 + std r9,_TRAP(r1) /* set trap number */ + li r10,0 + ld r11,exception_marker@toc(r2) + std r10,RESULT(r1) /* clear regs->result */ + std r11,STACK_FRAME_OVERHEAD-16(r1) /* mark the frame */ .if \stack ACCOUNT_STOLEN_TIME -- cgit v1.2.3 From d1a84718888e768296557b83f4c56cb1caef8fdd Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:57 +1000 Subject: powerpc/64s/exception: INT_COMMON add DAR, DSISR, reconcile options Move DAR and DSISR saving to pt_regs into INT_COMMON. Also add an option to expand RECONCILE_IRQ_STATE. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-33-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 111 ++++++++++++++++------------------- 1 file changed, 51 insertions(+), 60 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7a4f215c4c5b..c2235876e397 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -398,7 +398,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) * If stack=0, then the stack is already set in r1, and r1 is saved in r10. * PPR save and CPU accounting is not done for the !stack case (XXX why not?) */ -.macro INT_COMMON vec, area, stack, kaup +.macro INT_COMMON vec, area, stack, kaup, reconcile, dar, dsisr .if \stack andi. r10,r12,MSR_PR /* See if coming from user */ mr r10,r1 /* Save r1 */ @@ -442,6 +442,24 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) std r9,GPR11(r1) std r10,GPR12(r1) std r11,GPR13(r1) + .if \dar + .if \dar == 2 + ld r10,_NIP(r1) + .else + ld r10,\area+EX_DAR(r13) + .endif + std r10,_DAR(r1) + .endif + .if \dsisr + .if \dsisr == 2 + ld r10,_MSR(r1) + lis r11,DSISR_SRR1_MATCH_64S@h + and r10,r10,r11 + .else + lwz r10,\area+EX_DSISR(r13) + .endif + std r10,_DSISR(r1) + .endif BEGIN_FTR_SECTION_NESTED(66) ld r10,\area+EX_CFAR(r13) std r10,ORIG_GPR3(r1) @@ -468,6 +486,10 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66) .if \stack ACCOUNT_STOLEN_TIME .endif + + .if \reconcile + RECONCILE_IRQ_STATE(r10, r11) + .endif .endm /* @@ -665,9 +687,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) #define EXC_COMMON(name, realvec, hdlr) \ EXC_COMMON_BEGIN(name); \ - INT_COMMON realvec, PACA_EXGEN, 1, 1 ; \ + INT_COMMON realvec, PACA_EXGEN, 1, 1, 1, 0, 0 ; \ bl save_nvgprs; \ - RECONCILE_IRQ_STATE(r10, r11); \ addi r3,r1,STACK_FRAME_OVERHEAD; \ bl hdlr; \ b ret_from_except @@ -678,9 +699,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) */ #define EXC_COMMON_ASYNC(name, realvec, hdlr) \ EXC_COMMON_BEGIN(name); \ - INT_COMMON realvec, PACA_EXGEN, 1, 1 ; \ + INT_COMMON realvec, PACA_EXGEN, 1, 1, 1, 0, 0 ; \ FINISH_NAP; \ - RECONCILE_IRQ_STATE(r10, r11); \ RUNLATCH_ON; \ addi r3,r1,STACK_FRAME_OVERHEAD; \ bl hdlr; \ @@ -859,7 +879,7 @@ EXC_COMMON_BEGIN(system_reset_common) mr r10,r1 ld r1,PACA_NMI_EMERG_SP(r13) subi r1,r1,INT_FRAME_SIZE - INT_COMMON 0x100, PACA_EXNMI, 0, 1 + INT_COMMON 0x100, PACA_EXNMI, 0, 1, 0, 0, 0 bl save_nvgprs /* * Set IRQS_ALL_DISABLED unconditionally so arch_irqs_disabled does @@ -970,12 +990,7 @@ EXC_COMMON_BEGIN(machine_check_early_common) subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ /* We don't touch AMR here, we never go to virtual mode */ - INT_COMMON 0x200, PACA_EXMC, 0, 0 - - ld r3,PACA_EXMC+EX_DAR(r13) - lwz r4,PACA_EXMC+EX_DSISR(r13) - std r3,_DAR(r1) - std r4,_DSISR(r1) + INT_COMMON 0x200, PACA_EXMC, 0, 0, 0, 1, 1 BEGIN_FTR_SECTION bl enable_machine_check @@ -1070,16 +1085,11 @@ EXC_COMMON_BEGIN(machine_check_common) * Machine check is different because we use a different * save area: PACA_EXMC instead of PACA_EXGEN. */ - INT_COMMON 0x200, PACA_EXMC, 1, 1 + INT_COMMON 0x200, PACA_EXMC, 1, 1, 1, 1, 1 FINISH_NAP - RECONCILE_IRQ_STATE(r10, r11) - ld r3,PACA_EXMC+EX_DAR(r13) - lwz r4,PACA_EXMC+EX_DSISR(r13) /* Enable MSR_RI when finished with PACA_EXMC */ li r10,MSR_RI mtmsrd r10,1 - std r3,_DAR(r1) - std r4,_DSISR(r1) bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl machine_check_exception @@ -1162,14 +1172,11 @@ EXC_COMMON_BEGIN(data_access_common) * r9 - r13 are saved in paca->exgen. * EX_DAR and EX_DSISR have saved DAR/DSISR */ - INT_COMMON 0x300, PACA_EXGEN, 1, 1 - RECONCILE_IRQ_STATE(r10, r11) + INT_COMMON 0x300, PACA_EXGEN, 1, 1, 1, 1, 1 ld r12,_MSR(r1) - ld r3,PACA_EXGEN+EX_DAR(r13) - lwz r4,PACA_EXGEN+EX_DSISR(r13) + ld r3,_DAR(r1) + ld r4,_DSISR(r1) li r5,0x300 - std r3,_DAR(r1) - std r4,_DSISR(r1) BEGIN_MMU_FTR_SECTION b do_hash_page /* Try to handle as hpte fault */ MMU_FTR_SECTION_ELSE @@ -1185,9 +1192,8 @@ EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) EXC_VIRT_END(data_access_slb, 0x4380, 0x80) INT_KVM_HANDLER 0x380, EXC_STD, PACA_EXSLB, 1 EXC_COMMON_BEGIN(data_access_slb_common) - INT_COMMON 0x380, PACA_EXSLB, 1, 1 - ld r4,PACA_EXSLB+EX_DAR(r13) - std r4,_DAR(r1) + INT_COMMON 0x380, PACA_EXSLB, 1, 1, 0, 1, 0 + ld r4,_DAR(r1) addi r3,r1,STACK_FRAME_OVERHEAD BEGIN_MMU_FTR_SECTION /* HPT case, do SLB fault */ @@ -1218,14 +1224,11 @@ EXC_VIRT_BEGIN(instruction_access, 0x4400, 0x80) EXC_VIRT_END(instruction_access, 0x4400, 0x80) INT_KVM_HANDLER 0x400, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(instruction_access_common) - INT_COMMON 0x400, PACA_EXGEN, 1, 1 - RECONCILE_IRQ_STATE(r10, r11) - ld r12,_MSR(r1) - ld r3,_NIP(r1) - andis. r4,r12,DSISR_SRR1_MATCH_64S@h + INT_COMMON 0x400, PACA_EXGEN, 1, 1, 1, 2, 2 + ld r12,_MSR(r1) + ld r3,_DAR(r1) + ld r4,_DSISR(r1) li r5,0x400 - std r3,_DAR(r1) - std r4,_DSISR(r1) BEGIN_MMU_FTR_SECTION b do_hash_page /* Try to handle as hpte fault */ MMU_FTR_SECTION_ELSE @@ -1241,8 +1244,8 @@ EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) INT_KVM_HANDLER 0x480, EXC_STD, PACA_EXSLB, 0 EXC_COMMON_BEGIN(instruction_access_slb_common) - INT_COMMON 0x480, PACA_EXSLB, 1, 1 - ld r4,_NIP(r1) + INT_COMMON 0x480, PACA_EXSLB, 1, 1, 0, 2, 0 + ld r4,_DAR(r1) addi r3,r1,STACK_FRAME_OVERHEAD BEGIN_MMU_FTR_SECTION /* HPT case, do SLB fault */ @@ -1258,7 +1261,7 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) std r3,RESULT(r1) bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) - ld r4,_NIP(r1) + ld r4,_DAR(r1) ld r5,RESULT(r1) addi r3,r1,STACK_FRAME_OVERHEAD bl do_bad_slb_fault @@ -1283,13 +1286,8 @@ EXC_VIRT_BEGIN(alignment, 0x4600, 0x100) EXC_VIRT_END(alignment, 0x4600, 0x100) INT_KVM_HANDLER 0x600, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(alignment_common) - INT_COMMON 0x600, PACA_EXGEN, 1, 1 - ld r3,PACA_EXGEN+EX_DAR(r13) - lwz r4,PACA_EXGEN+EX_DSISR(r13) - std r3,_DAR(r1) - std r4,_DSISR(r1) + INT_COMMON 0x600, PACA_EXGEN, 1, 1, 1, 1, 1 bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD bl alignment_exception b ret_from_except @@ -1329,9 +1327,8 @@ EXC_COMMON_BEGIN(program_check_common) subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ b 3f /* Jump into the macro !! */ 2: - INT_COMMON 0x700, PACA_EXGEN, 1, 1 + INT_COMMON 0x700, PACA_EXGEN, 1, 1, 1, 0, 0 bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD bl program_check_exception b ret_from_except @@ -1345,7 +1342,7 @@ EXC_VIRT_BEGIN(fp_unavailable, 0x4800, 0x100) EXC_VIRT_END(fp_unavailable, 0x4800, 0x100) INT_KVM_HANDLER 0x800, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(fp_unavailable_common) - INT_COMMON 0x800, PACA_EXGEN, 1, 1 + INT_COMMON 0x800, PACA_EXGEN, 1, 1, 0, 0, 0 bne 1f /* if from user, just load it up */ bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) @@ -1561,15 +1558,11 @@ EXC_VIRT_BEGIN(h_data_storage, 0x4e00, 0x20) EXC_VIRT_END(h_data_storage, 0x4e00, 0x20) INT_KVM_HANDLER 0xe00, EXC_HV, PACA_EXGEN, 1 EXC_COMMON_BEGIN(h_data_storage_common) - INT_COMMON 0xe00, PACA_EXGEN, 1, 1 + INT_COMMON 0xe00, PACA_EXGEN, 1, 1, 1, 1, 1 bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD BEGIN_MMU_FTR_SECTION - ld r4,PACA_EXGEN+EX_DAR(r13) - lwz r5,PACA_EXGEN+EX_DSISR(r13) - std r4,_DAR(r1) - std r5,_DSISR(r1) + ld r4,_DAR(r1) li r5,SIGSEGV bl bad_page_fault MMU_FTR_SECTION_ELSE @@ -1617,7 +1610,7 @@ EXC_COMMON_BEGIN(hmi_exception_early_common) subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ /* We don't touch AMR here, we never go to virtual mode */ - INT_COMMON 0xe60, PACA_EXGEN, 0, 0 + INT_COMMON 0xe60, PACA_EXGEN, 0, 0, 0, 0, 0 addi r3,r1,STACK_FRAME_OVERHEAD bl hmi_exception_realmode @@ -1636,11 +1629,10 @@ EXC_COMMON_BEGIN(hmi_exception_early_common) INT_HANDLER hmi_exception, 0xe60, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 EXC_COMMON_BEGIN(hmi_exception_common) - INT_COMMON 0xe60, PACA_EXGEN, 1, 1 + INT_COMMON 0xe60, PACA_EXGEN, 1, 1, 1, 0, 0 FINISH_NAP - bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) RUNLATCH_ON + bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl handle_hmi_exception b ret_from_except @@ -1694,7 +1686,7 @@ EXC_VIRT_BEGIN(altivec_unavailable, 0x4f20, 0x20) EXC_VIRT_END(altivec_unavailable, 0x4f20, 0x20) INT_KVM_HANDLER 0xf20, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(altivec_unavailable_common) - INT_COMMON 0xf20, PACA_EXGEN, 1, 1 + INT_COMMON 0xf20, PACA_EXGEN, 1, 1, 0, 0, 0 #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION beq 1f @@ -1735,7 +1727,7 @@ EXC_VIRT_BEGIN(vsx_unavailable, 0x4f40, 0x20) EXC_VIRT_END(vsx_unavailable, 0x4f40, 0x20) INT_KVM_HANDLER 0xf40, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(vsx_unavailable_common) - INT_COMMON 0xf40, PACA_EXGEN, 1, 1 + INT_COMMON 0xf40, PACA_EXGEN, 1, 1, 0, 0, 0 #ifdef CONFIG_VSX BEGIN_FTR_SECTION beq 1f @@ -1986,9 +1978,8 @@ EXC_COMMON_BEGIN(soft_nmi_common) mr r10,r1 ld r1,PACAEMERGSP(r13) subi r1,r1,INT_FRAME_SIZE - INT_COMMON 0x900, PACA_EXGEN, 0, 1 + INT_COMMON 0x900, PACA_EXGEN, 0, 1, 1, 0, 0 bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD bl soft_nmi_interrupt b ret_from_except -- cgit v1.2.3 From c7c5cbb42d6e207a059c64740b1654376619345e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:58 +1000 Subject: powerpc/64s/exception: move interrupt entry code above the common handler This better reflects the order in which the code is executed. No generated code change except BUG line number constants. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-34-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 450 +++++++++++++++++------------------ 1 file changed, 225 insertions(+), 225 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index c2235876e397..aabd84e83615 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -180,101 +180,6 @@ BEGIN_FTR_SECTION_NESTED(943) \ std ra,offset(r13); \ END_FTR_SECTION_NESTED(ftr,ftr,943) -.macro INT_SAVE_SRR_AND_JUMP label, hsrr, set_ri - ld r10,PACAKMSR(r13) /* get MSR value for kernel */ - .if ! \set_ri - xori r10,r10,MSR_RI /* Clear MSR_RI */ - .endif - .if \hsrr == EXC_HV_OR_STD - BEGIN_FTR_SECTION - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - mtspr SPRN_HSRR1,r10 - FTR_SECTION_ELSE - mfspr r11,SPRN_SRR0 /* save SRR0 */ - mfspr r12,SPRN_SRR1 /* and SRR1 */ - mtspr SPRN_SRR1,r10 - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - mtspr SPRN_HSRR1,r10 - .else - mfspr r11,SPRN_SRR0 /* save SRR0 */ - mfspr r12,SPRN_SRR1 /* and SRR1 */ - mtspr SPRN_SRR1,r10 - .endif - LOAD_HANDLER(r10, \label\()) - .if \hsrr == EXC_HV_OR_STD - BEGIN_FTR_SECTION - mtspr SPRN_HSRR0,r10 - HRFI_TO_KERNEL - FTR_SECTION_ELSE - mtspr SPRN_SRR0,r10 - RFI_TO_KERNEL - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - mtspr SPRN_HSRR0,r10 - HRFI_TO_KERNEL - .else - mtspr SPRN_SRR0,r10 - RFI_TO_KERNEL - .endif - b . /* prevent speculative execution */ -.endm - -/* INT_SAVE_SRR_AND_JUMP works for real or virt, this is faster but virt only */ -.macro INT_VIRT_SAVE_SRR_AND_JUMP label, hsrr -#ifdef CONFIG_RELOCATABLE - .if \hsrr == EXC_HV_OR_STD - BEGIN_FTR_SECTION - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ - FTR_SECTION_ELSE - mfspr r11,SPRN_SRR0 /* save SRR0 */ - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ - .else - mfspr r11,SPRN_SRR0 /* save SRR0 */ - .endif - LOAD_HANDLER(r12, \label\()) - mtctr r12 - .if \hsrr == EXC_HV_OR_STD - BEGIN_FTR_SECTION - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - FTR_SECTION_ELSE - mfspr r12,SPRN_SRR1 /* and HSRR1 */ - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - .else - mfspr r12,SPRN_SRR1 /* and HSRR1 */ - .endif - li r10,MSR_RI - mtmsrd r10,1 /* Set RI (EE=0) */ - bctr -#else - .if \hsrr == EXC_HV_OR_STD - BEGIN_FTR_SECTION - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - FTR_SECTION_ELSE - mfspr r11,SPRN_SRR0 /* save SRR0 */ - mfspr r12,SPRN_SRR1 /* and SRR1 */ - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - .else - mfspr r11,SPRN_SRR0 /* save SRR0 */ - mfspr r12,SPRN_SRR1 /* and SRR1 */ - .endif - li r10,MSR_RI - mtmsrd r10,1 /* Set RI (EE=0) */ - b \label -#endif -.endm - /* * Branch to label using its 0xC000 address. This results in instruction * address suitable for MSR[IR]=0 or 1, which allows relocation to be turned @@ -288,6 +193,15 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) mtctr reg; \ bctr +.macro INT_KVM_HANDLER vec, hsrr, area, skip + .if \hsrr + TRAMP_KVM_BEGIN(do_kvm_H\vec\()) + .else + TRAMP_KVM_BEGIN(do_kvm_\vec\()) + .endif + KVM_HANDLER \vec, \hsrr, \area, \skip +.endm + #ifdef CONFIG_KVM_BOOK3S_64_HANDLER #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE /* @@ -390,6 +304,222 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) .endm #endif +.macro INT_SAVE_SRR_AND_JUMP label, hsrr, set_ri + ld r10,PACAKMSR(r13) /* get MSR value for kernel */ + .if ! \set_ri + xori r10,r10,MSR_RI /* Clear MSR_RI */ + .endif + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + mtspr SPRN_HSRR1,r10 + FTR_SECTION_ELSE + mfspr r11,SPRN_SRR0 /* save SRR0 */ + mfspr r12,SPRN_SRR1 /* and SRR1 */ + mtspr SPRN_SRR1,r10 + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + mtspr SPRN_HSRR1,r10 + .else + mfspr r11,SPRN_SRR0 /* save SRR0 */ + mfspr r12,SPRN_SRR1 /* and SRR1 */ + mtspr SPRN_SRR1,r10 + .endif + LOAD_HANDLER(r10, \label\()) + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + mtspr SPRN_HSRR0,r10 + HRFI_TO_KERNEL + FTR_SECTION_ELSE + mtspr SPRN_SRR0,r10 + RFI_TO_KERNEL + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr + mtspr SPRN_HSRR0,r10 + HRFI_TO_KERNEL + .else + mtspr SPRN_SRR0,r10 + RFI_TO_KERNEL + .endif + b . /* prevent speculative execution */ +.endm + +/* INT_SAVE_SRR_AND_JUMP works for real or virt, this is faster but virt only */ +.macro INT_VIRT_SAVE_SRR_AND_JUMP label, hsrr +#ifdef CONFIG_RELOCATABLE + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + FTR_SECTION_ELSE + mfspr r11,SPRN_SRR0 /* save SRR0 */ + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + .else + mfspr r11,SPRN_SRR0 /* save SRR0 */ + .endif + LOAD_HANDLER(r12, \label\()) + mtctr r12 + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + FTR_SECTION_ELSE + mfspr r12,SPRN_SRR1 /* and HSRR1 */ + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + .else + mfspr r12,SPRN_SRR1 /* and HSRR1 */ + .endif + li r10,MSR_RI + mtmsrd r10,1 /* Set RI (EE=0) */ + bctr +#else + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + FTR_SECTION_ELSE + mfspr r11,SPRN_SRR0 /* save SRR0 */ + mfspr r12,SPRN_SRR1 /* and SRR1 */ + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + .else + mfspr r11,SPRN_SRR0 /* save SRR0 */ + mfspr r12,SPRN_SRR1 /* and SRR1 */ + .endif + li r10,MSR_RI + mtmsrd r10,1 /* Set RI (EE=0) */ + b \label +#endif +.endm + +/* + * This is the BOOK3S interrupt entry code macro. + * + * This can result in one of several things happening: + * - Branch to the _common handler, relocated, in virtual mode. + * These are normal interrupts (synchronous and asynchronous) handled by + * the kernel. + * - Branch to KVM, relocated but real mode interrupts remain in real mode. + * These occur when HSTATE_IN_GUEST is set. The interrupt may be caused by + * / intended for host or guest kernel, but KVM must always be involved + * because the machine state is set for guest execution. + * - Branch to the masked handler, unrelocated. + * These occur when maskable asynchronous interrupts are taken with the + * irq_soft_mask set. + * - Branch to an "early" handler in real mode but relocated. + * This is done if early=1. MCE and HMI use these to handle errors in real + * mode. + * - Fall through and continue executing in real, unrelocated mode. + * This is done if early=2. + */ +.macro INT_HANDLER name, vec, ool=0, early=0, virt=0, hsrr=0, area=PACA_EXGEN, ri=1, dar=0, dsisr=0, bitmask=0, kvm=0 + SET_SCRATCH0(r13) /* save r13 */ + GET_PACA(r13) + std r9,\area\()+EX_R9(r13) /* save r9 */ + OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR) + HMT_MEDIUM + std r10,\area\()+EX_R10(r13) /* save r10 - r12 */ + OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR) + .if \ool + .if !\virt + b tramp_real_\name + .pushsection .text + TRAMP_REAL_BEGIN(tramp_real_\name) + .else + b tramp_virt_\name + .pushsection .text + TRAMP_VIRT_BEGIN(tramp_virt_\name) + .endif + .endif + + OPT_SAVE_REG_TO_PACA(\area\()+EX_PPR, r9, CPU_FTR_HAS_PPR) + OPT_SAVE_REG_TO_PACA(\area\()+EX_CFAR, r10, CPU_FTR_CFAR) + INTERRUPT_TO_KERNEL + SAVE_CTR(r10, \area\()) + mfcr r9 + .if \kvm + KVMTEST \hsrr \vec + .endif + .if \bitmask + lbz r10,PACAIRQSOFTMASK(r13) + andi. r10,r10,\bitmask + /* Associate vector numbers with bits in paca->irq_happened */ + .if \vec == 0x500 || \vec == 0xea0 + li r10,PACA_IRQ_EE + .elseif \vec == 0x900 + li r10,PACA_IRQ_DEC + .elseif \vec == 0xa00 || \vec == 0xe80 + li r10,PACA_IRQ_DBELL + .elseif \vec == 0xe60 + li r10,PACA_IRQ_HMI + .elseif \vec == 0xf00 + li r10,PACA_IRQ_PMI + .else + .abort "Bad maskable vector" + .endif + + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + bne masked_Hinterrupt + FTR_SECTION_ELSE + bne masked_interrupt + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr + bne masked_Hinterrupt + .else + bne masked_interrupt + .endif + .endif + + std r11,\area\()+EX_R11(r13) + std r12,\area\()+EX_R12(r13) + + /* + * DAR/DSISR, SCRATCH0 must be read before setting MSR[RI], + * because a d-side MCE will clobber those registers so is + * not recoverable if they are live. + */ + GET_SCRATCH0(r10) + std r10,\area\()+EX_R13(r13) + .if \dar + .if \hsrr + mfspr r10,SPRN_HDAR + .else + mfspr r10,SPRN_DAR + .endif + std r10,\area\()+EX_DAR(r13) + .endif + .if \dsisr + .if \hsrr + mfspr r10,SPRN_HDSISR + .else + mfspr r10,SPRN_DSISR + .endif + stw r10,\area\()+EX_DSISR(r13) + .endif + + .if \early == 2 + /* nothing more */ + .elseif \early + mfctr r10 /* save ctr, even for !RELOCATABLE */ + BRANCH_TO_C000(r11, \name\()_early_common) + .elseif !\virt + INT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr, \ri + .else + INT_VIRT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr + .endif + .if \ool + .popsection + .endif +.endm + /* * On entry r13 points to the paca, r9-r13 are saved in the paca, * r9 contains the saved CR, r11 and r12 contain the saved SRR0 and @@ -555,136 +685,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) #define FINISH_NAP #endif -/* - * This is the BOOK3S interrupt entry code macro. - * - * This can result in one of several things happening: - * - Branch to the _common handler, relocated, in virtual mode. - * These are normal interrupts (synchronous and asynchronous) handled by - * the kernel. - * - Branch to KVM, relocated but real mode interrupts remain in real mode. - * These occur when HSTATE_IN_GUEST is set. The interrupt may be caused by - * / intended for host or guest kernel, but KVM must always be involved - * because the machine state is set for guest execution. - * - Branch to the masked handler, unrelocated. - * These occur when maskable asynchronous interrupts are taken with the - * irq_soft_mask set. - * - Branch to an "early" handler in real mode but relocated. - * This is done if early=1. MCE and HMI use these to handle errors in real - * mode. - * - Fall through and continue executing in real, unrelocated mode. - * This is done if early=2. - */ -.macro INT_HANDLER name, vec, ool=0, early=0, virt=0, hsrr=0, area=PACA_EXGEN, ri=1, dar=0, dsisr=0, bitmask=0, kvm=0 - SET_SCRATCH0(r13) /* save r13 */ - GET_PACA(r13) - std r9,\area\()+EX_R9(r13) /* save r9 */ - OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR) - HMT_MEDIUM - std r10,\area\()+EX_R10(r13) /* save r10 - r12 */ - OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR) - .if \ool - .if !\virt - b tramp_real_\name - .pushsection .text - TRAMP_REAL_BEGIN(tramp_real_\name) - .else - b tramp_virt_\name - .pushsection .text - TRAMP_VIRT_BEGIN(tramp_virt_\name) - .endif - .endif - - OPT_SAVE_REG_TO_PACA(\area\()+EX_PPR, r9, CPU_FTR_HAS_PPR) - OPT_SAVE_REG_TO_PACA(\area\()+EX_CFAR, r10, CPU_FTR_CFAR) - INTERRUPT_TO_KERNEL - SAVE_CTR(r10, \area\()) - mfcr r9 - .if \kvm - KVMTEST \hsrr \vec - .endif - .if \bitmask - lbz r10,PACAIRQSOFTMASK(r13) - andi. r10,r10,\bitmask - /* Associate vector numbers with bits in paca->irq_happened */ - .if \vec == 0x500 || \vec == 0xea0 - li r10,PACA_IRQ_EE - .elseif \vec == 0x900 - li r10,PACA_IRQ_DEC - .elseif \vec == 0xa00 || \vec == 0xe80 - li r10,PACA_IRQ_DBELL - .elseif \vec == 0xe60 - li r10,PACA_IRQ_HMI - .elseif \vec == 0xf00 - li r10,PACA_IRQ_PMI - .else - .abort "Bad maskable vector" - .endif - - .if \hsrr == EXC_HV_OR_STD - BEGIN_FTR_SECTION - bne masked_Hinterrupt - FTR_SECTION_ELSE - bne masked_interrupt - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - bne masked_Hinterrupt - .else - bne masked_interrupt - .endif - .endif - - std r11,\area\()+EX_R11(r13) - std r12,\area\()+EX_R12(r13) - - /* - * DAR/DSISR, SCRATCH0 must be read before setting MSR[RI], - * because a d-side MCE will clobber those registers so is - * not recoverable if they are live. - */ - GET_SCRATCH0(r10) - std r10,\area\()+EX_R13(r13) - .if \dar - .if \hsrr - mfspr r10,SPRN_HDAR - .else - mfspr r10,SPRN_DAR - .endif - std r10,\area\()+EX_DAR(r13) - .endif - .if \dsisr - .if \hsrr - mfspr r10,SPRN_HDSISR - .else - mfspr r10,SPRN_DSISR - .endif - stw r10,\area\()+EX_DSISR(r13) - .endif - - .if \early == 2 - /* nothing more */ - .elseif \early - mfctr r10 /* save ctr, even for !RELOCATABLE */ - BRANCH_TO_C000(r11, \name\()_early_common) - .elseif !\virt - INT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr, \ri - .else - INT_VIRT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr - .endif - .if \ool - .popsection - .endif -.endm - -.macro INT_KVM_HANDLER vec, hsrr, area, skip - .if \hsrr - TRAMP_KVM_BEGIN(do_kvm_H\vec\()) - .else - TRAMP_KVM_BEGIN(do_kvm_\vec\()) - .endif - KVM_HANDLER \vec, \hsrr, \area, \skip -.endm - #define EXC_COMMON(name, realvec, hdlr) \ EXC_COMMON_BEGIN(name); \ INT_COMMON realvec, PACA_EXGEN, 1, 1, 1, 0, 0 ; \ -- cgit v1.2.3 From 1b3599829a2560fe6c9a5a4a6bb6b2bc4e5bdaee Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:56:59 +1000 Subject: powerpc/64s/exception: program check handler do not branch into a macro It is clever, but the small code saving is not worth the spaghetti of jumping to a label in an expanded macro, particularly when the label is just a number rather than a descriptive name. So expand the INT_COMMON macro twice, once for the stack and no stack cases, and branch to those. The slight code size increase is worth the improved clarity of branches for this non-performance critical code. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-35-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index aabd84e83615..b0649d56bb15 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -533,11 +533,10 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) andi. r10,r12,MSR_PR /* See if coming from user */ mr r10,r1 /* Save r1 */ subi r1,r1,INT_FRAME_SIZE /* alloc frame on kernel stack */ - beq- 1f + beq- 100f ld r1,PACAKSAVE(r13) /* kernel stack to use */ -1: tdgei r1,-INT_FRAME_SIZE /* trap if r1 is in userspace */ - EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0 -3: +100: tdgei r1,-INT_FRAME_SIZE /* trap if r1 is in userspace */ + EMIT_BUG_ENTRY 100b,__FILE__,__LINE__,0 .endif std r9,_CCR(r1) /* save CR in stackframe */ @@ -551,10 +550,10 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) .if \kaup kuap_save_amr_and_lock r9, r10, cr1, cr0 .endif - beq 4f /* if from kernel mode */ + beq 101f /* if from kernel mode */ ACCOUNT_CPU_USER_ENTRY(r13, r9, r10) SAVE_PPR(\area, r9) -4: +101: .else .if \kaup kuap_save_amr_and_lock r9, r10, cr1 @@ -1325,9 +1324,11 @@ EXC_COMMON_BEGIN(program_check_common) mr r10,r1 /* Save r1 */ ld r1,PACAEMERGSP(r13) /* Use emergency stack */ subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ - b 3f /* Jump into the macro !! */ + INT_COMMON 0x700, PACA_EXGEN, 0, 1, 1, 0, 0 + b 3f 2: INT_COMMON 0x700, PACA_EXGEN, 1, 1, 1, 0, 0 +3: bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl program_check_exception -- cgit v1.2.3 From 05f97d94dd0e2883bc7b2e6b7b5e4c088e0d1437 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:57:00 +1000 Subject: powerpc/64s/exception: Remove pointless KVM handler name bifurcation Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-36-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 102 +++++++++++++++-------------------- 1 file changed, 44 insertions(+), 58 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index b0649d56bb15..f54a38417904 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -193,12 +193,8 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) mtctr reg; \ bctr -.macro INT_KVM_HANDLER vec, hsrr, area, skip - .if \hsrr - TRAMP_KVM_BEGIN(do_kvm_H\vec\()) - .else - TRAMP_KVM_BEGIN(do_kvm_\vec\()) - .endif +.macro INT_KVM_HANDLER name, vec, hsrr, area, skip + TRAMP_KVM_BEGIN(\name\()_kvm) KVM_HANDLER \vec, \hsrr, \area, \skip .endm @@ -214,20 +210,10 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #define kvmppc_interrupt kvmppc_interrupt_pr #endif -.macro KVMTEST hsrr, n +.macro KVMTEST name, hsrr, n lbz r10,HSTATE_IN_GUEST(r13) cmpwi r10,0 - .if \hsrr == EXC_HV_OR_STD - BEGIN_FTR_SECTION - bne do_kvm_H\n - FTR_SECTION_ELSE - bne do_kvm_\n - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - bne do_kvm_H\n - .else - bne do_kvm_\n - .endif + bne \name\()_kvm .endm .macro KVM_HANDLER vec, hsrr, area, skip @@ -298,9 +284,9 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) .endm #else -.macro KVMTEST hsrr, n +.macro KVMTEST name, hsrr, n .endm -.macro KVM_HANDLER vec, hsrr, area, skip +.macro KVM_HANDLER name, vec, hsrr, area, skip .endm #endif @@ -445,7 +431,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) SAVE_CTR(r10, \area\()) mfcr r9 .if \kvm - KVMTEST \hsrr \vec + KVMTEST \name \hsrr \vec .endif .if \bitmask lbz r10,PACAIRQSOFTMASK(r13) @@ -842,7 +828,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) */ EXC_REAL_END(system_reset, 0x100, 0x100) EXC_VIRT_NONE(0x4100, 0x100) -INT_KVM_HANDLER 0x100, EXC_STD, PACA_EXNMI, 0 +INT_KVM_HANDLER system_reset 0x100, EXC_STD, PACA_EXNMI, 0 #ifdef CONFIG_PPC_P7_NAP TRAMP_REAL_BEGIN(system_reset_idle_wake) @@ -936,7 +922,7 @@ TRAMP_REAL_BEGIN(machine_check_fwnmi) INT_HANDLER machine_check, 0x200, early=1, area=PACA_EXMC, dar=1, dsisr=1 #endif -INT_KVM_HANDLER 0x200, EXC_STD, PACA_EXMC, 1 +INT_KVM_HANDLER machine_check 0x200, EXC_STD, PACA_EXMC, 1 #define MACHINE_CHECK_HANDLER_WINDUP \ /* Clear MSR_RI before setting SRR0 and SRR1. */\ @@ -1022,8 +1008,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #ifdef CONFIG_KVM_BOOK3S_64_HANDLER /* * Check if we are coming from guest. If yes, then run the normal - * exception handler which will take the do_kvm_200->kvmppc_interrupt - * branch to deliver the MC event to guest. + * exception handler which will take the + * machine_check_kvm->kvmppc_interrupt branch to deliver the MC event + * to guest. */ lbz r11,HSTATE_IN_GUEST(r13) cmpwi r11,0 /* Check if coming from guest */ @@ -1163,7 +1150,7 @@ EXC_REAL_END(data_access, 0x300, 0x80) EXC_VIRT_BEGIN(data_access, 0x4300, 0x80) INT_HANDLER data_access, 0x300, virt=1, dar=1, dsisr=1 EXC_VIRT_END(data_access, 0x4300, 0x80) -INT_KVM_HANDLER 0x300, EXC_STD, PACA_EXGEN, 1 +INT_KVM_HANDLER data_access, 0x300, EXC_STD, PACA_EXGEN, 1 EXC_COMMON_BEGIN(data_access_common) /* * Here r13 points to the paca, r9 contains the saved CR, @@ -1189,7 +1176,7 @@ EXC_REAL_END(data_access_slb, 0x380, 0x80) EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) INT_HANDLER data_access_slb, 0x380, virt=1, area=PACA_EXSLB, dar=1 EXC_VIRT_END(data_access_slb, 0x4380, 0x80) -INT_KVM_HANDLER 0x380, EXC_STD, PACA_EXSLB, 1 +INT_KVM_HANDLER data_access_slb, 0x380, EXC_STD, PACA_EXSLB, 1 EXC_COMMON_BEGIN(data_access_slb_common) INT_COMMON 0x380, PACA_EXSLB, 1, 1, 0, 1, 0 ld r4,_DAR(r1) @@ -1221,7 +1208,7 @@ EXC_REAL_END(instruction_access, 0x400, 0x80) EXC_VIRT_BEGIN(instruction_access, 0x4400, 0x80) INT_HANDLER instruction_access, 0x400, virt=1 EXC_VIRT_END(instruction_access, 0x4400, 0x80) -INT_KVM_HANDLER 0x400, EXC_STD, PACA_EXGEN, 0 +INT_KVM_HANDLER instruction_access, 0x400, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(instruction_access_common) INT_COMMON 0x400, PACA_EXGEN, 1, 1, 1, 2, 2 ld r12,_MSR(r1) @@ -1241,7 +1228,7 @@ EXC_REAL_END(instruction_access_slb, 0x480, 0x80) EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) INT_HANDLER instruction_access_slb, 0x480, virt=1, area=PACA_EXSLB EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) -INT_KVM_HANDLER 0x480, EXC_STD, PACA_EXSLB, 0 +INT_KVM_HANDLER instruction_access_slb, 0x480, EXC_STD, PACA_EXSLB, 0 EXC_COMMON_BEGIN(instruction_access_slb_common) INT_COMMON 0x480, PACA_EXSLB, 1, 1, 0, 2, 0 ld r4,_DAR(r1) @@ -1272,8 +1259,7 @@ EXC_REAL_END(hardware_interrupt, 0x500, 0x100) EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100) INT_HANDLER hardware_interrupt, 0x500, virt=1, hsrr=EXC_HV_OR_STD, bitmask=IRQS_DISABLED, kvm=1 EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100) -INT_KVM_HANDLER 0x500, EXC_STD, PACA_EXGEN, 0 -INT_KVM_HANDLER 0x500, EXC_HV, PACA_EXGEN, 0 +INT_KVM_HANDLER hardware_interrupt, 0x500, EXC_HV_OR_STD, PACA_EXGEN, 0 EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ) @@ -1283,7 +1269,7 @@ EXC_REAL_END(alignment, 0x600, 0x100) EXC_VIRT_BEGIN(alignment, 0x4600, 0x100) INT_HANDLER alignment, 0x600, virt=1, dar=1, dsisr=1 EXC_VIRT_END(alignment, 0x4600, 0x100) -INT_KVM_HANDLER 0x600, EXC_STD, PACA_EXGEN, 0 +INT_KVM_HANDLER alignment, 0x600, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(alignment_common) INT_COMMON 0x600, PACA_EXGEN, 1, 1, 1, 1, 1 bl save_nvgprs @@ -1298,7 +1284,7 @@ EXC_REAL_END(program_check, 0x700, 0x100) EXC_VIRT_BEGIN(program_check, 0x4700, 0x100) INT_HANDLER program_check, 0x700, virt=1 EXC_VIRT_END(program_check, 0x4700, 0x100) -INT_KVM_HANDLER 0x700, EXC_STD, PACA_EXGEN, 0 +INT_KVM_HANDLER program_check, 0x700, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(program_check_common) /* * It's possible to receive a TM Bad Thing type program check with @@ -1341,7 +1327,7 @@ EXC_REAL_END(fp_unavailable, 0x800, 0x100) EXC_VIRT_BEGIN(fp_unavailable, 0x4800, 0x100) INT_HANDLER fp_unavailable, 0x800, virt=1 EXC_VIRT_END(fp_unavailable, 0x4800, 0x100) -INT_KVM_HANDLER 0x800, EXC_STD, PACA_EXGEN, 0 +INT_KVM_HANDLER fp_unavailable, 0x800, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(fp_unavailable_common) INT_COMMON 0x800, PACA_EXGEN, 1, 1, 0, 0, 0 bne 1f /* if from user, just load it up */ @@ -1379,7 +1365,7 @@ EXC_REAL_END(decrementer, 0x900, 0x80) EXC_VIRT_BEGIN(decrementer, 0x4900, 0x80) INT_HANDLER decrementer, 0x900, virt=1, bitmask=IRQS_DISABLED EXC_VIRT_END(decrementer, 0x4900, 0x80) -INT_KVM_HANDLER 0x900, EXC_STD, PACA_EXGEN, 0 +INT_KVM_HANDLER decrementer, 0x900, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_ASYNC(decrementer_common, 0x900, timer_interrupt) @@ -1389,7 +1375,7 @@ EXC_REAL_END(hdecrementer, 0x980, 0x80) EXC_VIRT_BEGIN(hdecrementer, 0x4980, 0x80) INT_HANDLER hdecrementer, 0x980, virt=1, hsrr=EXC_HV, kvm=1 EXC_VIRT_END(hdecrementer, 0x4980, 0x80) -INT_KVM_HANDLER 0x980, EXC_HV, PACA_EXGEN, 0 +INT_KVM_HANDLER hdecrementer, 0x980, EXC_HV, PACA_EXGEN, 0 EXC_COMMON(hdecrementer_common, 0x980, hdec_interrupt) @@ -1399,7 +1385,7 @@ EXC_REAL_END(doorbell_super, 0xa00, 0x100) EXC_VIRT_BEGIN(doorbell_super, 0x4a00, 0x100) INT_HANDLER doorbell_super, 0xa00, virt=1, bitmask=IRQS_DISABLED EXC_VIRT_END(doorbell_super, 0x4a00, 0x100) -INT_KVM_HANDLER 0xa00, EXC_STD, PACA_EXGEN, 0 +INT_KVM_HANDLER doorbell_super, 0xa00, EXC_STD, PACA_EXGEN, 0 #ifdef CONFIG_PPC_DOORBELL EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, doorbell_exception) #else @@ -1458,7 +1444,7 @@ EXC_VIRT_NONE(0x4b00, 0x100) GET_PACA(r13) std r10,PACA_EXGEN+EX_R10(r13) INTERRUPT_TO_KERNEL - KVMTEST EXC_STD 0xc00 /* uses r10, branch to do_kvm_0xc00_system_call */ + KVMTEST system_call EXC_STD 0xc00 /* uses r10, branch to system_call_kvm */ mfctr r9 #else mr r9,r13 @@ -1524,7 +1510,7 @@ EXC_VIRT_END(system_call, 0x4c00, 0x100) * ctr = orig r13 * orig r10 saved in PACA */ -TRAMP_KVM_BEGIN(do_kvm_0xc00) +TRAMP_KVM_BEGIN(system_call_kvm) /* * Save the PPR (on systems that support it) before changing to * HMT_MEDIUM. That allows the KVM code to save that value into the @@ -1547,7 +1533,7 @@ EXC_REAL_END(single_step, 0xd00, 0x100) EXC_VIRT_BEGIN(single_step, 0x4d00, 0x100) INT_HANDLER single_step, 0xd00, virt=1 EXC_VIRT_END(single_step, 0x4d00, 0x100) -INT_KVM_HANDLER 0xd00, EXC_STD, PACA_EXGEN, 0 +INT_KVM_HANDLER single_step, 0xd00, EXC_STD, PACA_EXGEN, 0 EXC_COMMON(single_step_common, 0xd00, single_step_exception) @@ -1557,7 +1543,7 @@ EXC_REAL_END(h_data_storage, 0xe00, 0x20) EXC_VIRT_BEGIN(h_data_storage, 0x4e00, 0x20) INT_HANDLER h_data_storage, 0xe00, ool=1, virt=1, hsrr=EXC_HV, dar=1, dsisr=1, kvm=1 EXC_VIRT_END(h_data_storage, 0x4e00, 0x20) -INT_KVM_HANDLER 0xe00, EXC_HV, PACA_EXGEN, 1 +INT_KVM_HANDLER h_data_storage, 0xe00, EXC_HV, PACA_EXGEN, 1 EXC_COMMON_BEGIN(h_data_storage_common) INT_COMMON 0xe00, PACA_EXGEN, 1, 1, 1, 1, 1 bl save_nvgprs @@ -1578,7 +1564,7 @@ EXC_REAL_END(h_instr_storage, 0xe20, 0x20) EXC_VIRT_BEGIN(h_instr_storage, 0x4e20, 0x20) INT_HANDLER h_instr_storage, 0xe20, ool=1, virt=1, hsrr=EXC_HV, kvm=1 EXC_VIRT_END(h_instr_storage, 0x4e20, 0x20) -INT_KVM_HANDLER 0xe20, EXC_HV, PACA_EXGEN, 0 +INT_KVM_HANDLER h_instr_storage, 0xe20, EXC_HV, PACA_EXGEN, 0 EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception) @@ -1588,7 +1574,7 @@ EXC_REAL_END(emulation_assist, 0xe40, 0x20) EXC_VIRT_BEGIN(emulation_assist, 0x4e40, 0x20) INT_HANDLER emulation_assist, 0xe40, ool=1, virt=1, hsrr=EXC_HV, kvm=1 EXC_VIRT_END(emulation_assist, 0x4e40, 0x20) -INT_KVM_HANDLER 0xe40, EXC_HV, PACA_EXGEN, 0 +INT_KVM_HANDLER emulation_assist, 0xe40, EXC_HV, PACA_EXGEN, 0 EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt) @@ -1601,7 +1587,7 @@ EXC_REAL_BEGIN(hmi_exception, 0xe60, 0x20) INT_HANDLER hmi_exception, 0xe60, ool=1, early=1, hsrr=EXC_HV, ri=0, kvm=1 EXC_REAL_END(hmi_exception, 0xe60, 0x20) EXC_VIRT_NONE(0x4e60, 0x20) -INT_KVM_HANDLER 0xe60, EXC_HV, PACA_EXGEN, 0 +INT_KVM_HANDLER hmi_exception, 0xe60, EXC_HV, PACA_EXGEN, 0 EXC_COMMON_BEGIN(hmi_exception_early_common) mtctr r10 /* Restore ctr */ mfspr r11,SPRN_HSRR0 /* Save HSRR0 */ @@ -1645,7 +1631,7 @@ EXC_REAL_END(h_doorbell, 0xe80, 0x20) EXC_VIRT_BEGIN(h_doorbell, 0x4e80, 0x20) INT_HANDLER h_doorbell, 0xe80, ool=1, virt=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 EXC_VIRT_END(h_doorbell, 0x4e80, 0x20) -INT_KVM_HANDLER 0xe80, EXC_HV, PACA_EXGEN, 0 +INT_KVM_HANDLER h_doorbell, 0xe80, EXC_HV, PACA_EXGEN, 0 #ifdef CONFIG_PPC_DOORBELL EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, doorbell_exception) #else @@ -1659,7 +1645,7 @@ EXC_REAL_END(h_virt_irq, 0xea0, 0x20) EXC_VIRT_BEGIN(h_virt_irq, 0x4ea0, 0x20) INT_HANDLER h_virt_irq, 0xea0, ool=1, virt=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 EXC_VIRT_END(h_virt_irq, 0x4ea0, 0x20) -INT_KVM_HANDLER 0xea0, EXC_HV, PACA_EXGEN, 0 +INT_KVM_HANDLER h_virt_irq, 0xea0, EXC_HV, PACA_EXGEN, 0 EXC_COMMON_ASYNC(h_virt_irq_common, 0xea0, do_IRQ) @@ -1675,7 +1661,7 @@ EXC_REAL_END(performance_monitor, 0xf00, 0x20) EXC_VIRT_BEGIN(performance_monitor, 0x4f00, 0x20) INT_HANDLER performance_monitor, 0xf00, ool=1, virt=1, bitmask=IRQS_PMI_DISABLED EXC_VIRT_END(performance_monitor, 0x4f00, 0x20) -INT_KVM_HANDLER 0xf00, EXC_STD, PACA_EXGEN, 0 +INT_KVM_HANDLER performance_monitor, 0xf00, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_ASYNC(performance_monitor_common, 0xf00, performance_monitor_exception) @@ -1685,7 +1671,7 @@ EXC_REAL_END(altivec_unavailable, 0xf20, 0x20) EXC_VIRT_BEGIN(altivec_unavailable, 0x4f20, 0x20) INT_HANDLER altivec_unavailable, 0xf20, ool=1, virt=1 EXC_VIRT_END(altivec_unavailable, 0x4f20, 0x20) -INT_KVM_HANDLER 0xf20, EXC_STD, PACA_EXGEN, 0 +INT_KVM_HANDLER altivec_unavailable, 0xf20, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(altivec_unavailable_common) INT_COMMON 0xf20, PACA_EXGEN, 1, 1, 0, 0, 0 #ifdef CONFIG_ALTIVEC @@ -1726,7 +1712,7 @@ EXC_REAL_END(vsx_unavailable, 0xf40, 0x20) EXC_VIRT_BEGIN(vsx_unavailable, 0x4f40, 0x20) INT_HANDLER vsx_unavailable, 0xf40, ool=1, virt=1 EXC_VIRT_END(vsx_unavailable, 0x4f40, 0x20) -INT_KVM_HANDLER 0xf40, EXC_STD, PACA_EXGEN, 0 +INT_KVM_HANDLER vsx_unavailable, 0xf40, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(vsx_unavailable_common) INT_COMMON 0xf40, PACA_EXGEN, 1, 1, 0, 0, 0 #ifdef CONFIG_VSX @@ -1766,7 +1752,7 @@ EXC_REAL_END(facility_unavailable, 0xf60, 0x20) EXC_VIRT_BEGIN(facility_unavailable, 0x4f60, 0x20) INT_HANDLER facility_unavailable, 0xf60, ool=1, virt=1 EXC_VIRT_END(facility_unavailable, 0x4f60, 0x20) -INT_KVM_HANDLER 0xf60, EXC_STD, PACA_EXGEN, 0 +INT_KVM_HANDLER facility_unavailable, 0xf60, EXC_STD, PACA_EXGEN, 0 EXC_COMMON(facility_unavailable_common, 0xf60, facility_unavailable_exception) @@ -1776,7 +1762,7 @@ EXC_REAL_END(h_facility_unavailable, 0xf80, 0x20) EXC_VIRT_BEGIN(h_facility_unavailable, 0x4f80, 0x20) INT_HANDLER h_facility_unavailable, 0xf80, ool=1, virt=1, hsrr=EXC_HV, kvm=1 EXC_VIRT_END(h_facility_unavailable, 0x4f80, 0x20) -INT_KVM_HANDLER 0xf80, EXC_HV, PACA_EXGEN, 0 +INT_KVM_HANDLER h_facility_unavailable, 0xf80, EXC_HV, PACA_EXGEN, 0 EXC_COMMON(h_facility_unavailable_common, 0xf80, facility_unavailable_exception) @@ -1797,7 +1783,7 @@ EXC_REAL_BEGIN(cbe_system_error, 0x1200, 0x100) INT_HANDLER cbe_system_error, 0x1200, ool=1, hsrr=EXC_HV, kvm=1 EXC_REAL_END(cbe_system_error, 0x1200, 0x100) EXC_VIRT_NONE(0x5200, 0x100) -INT_KVM_HANDLER 0x1200, EXC_HV, PACA_EXGEN, 1 +INT_KVM_HANDLER cbe_system_error, 0x1200, EXC_HV, PACA_EXGEN, 1 EXC_COMMON(cbe_system_error_common, 0x1200, cbe_system_error_exception) #else /* CONFIG_CBE_RAS */ EXC_REAL_NONE(0x1200, 0x100) @@ -1811,7 +1797,7 @@ EXC_REAL_END(instruction_breakpoint, 0x1300, 0x100) EXC_VIRT_BEGIN(instruction_breakpoint, 0x5300, 0x100) INT_HANDLER instruction_breakpoint, 0x1300, virt=1 EXC_VIRT_END(instruction_breakpoint, 0x5300, 0x100) -INT_KVM_HANDLER 0x1300, EXC_STD, PACA_EXGEN, 1 +INT_KVM_HANDLER instruction_breakpoint, 0x1300, EXC_STD, PACA_EXGEN, 1 EXC_COMMON(instruction_breakpoint_common, 0x1300, instruction_breakpoint_exception) @@ -1825,7 +1811,7 @@ EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x100) andis. r10,r10,(HSRR1_DENORM)@h /* denorm? */ bne+ denorm_assist #endif - KVMTEST EXC_HV 0x1500 + KVMTEST denorm_exception_hv, EXC_HV 0x1500 INT_SAVE_SRR_AND_JUMP denorm_common, EXC_HV, 1 EXC_REAL_END(denorm_exception_hv, 0x1500, 0x100) @@ -1841,7 +1827,7 @@ EXC_VIRT_END(denorm_exception, 0x5500, 0x100) EXC_VIRT_NONE(0x5500, 0x100) #endif -INT_KVM_HANDLER 0x1500, EXC_HV, PACA_EXGEN, 0 +INT_KVM_HANDLER denorm_exception_hv, 0x1500, EXC_HV, PACA_EXGEN, 0 #ifdef CONFIG_PPC_DENORMALISATION TRAMP_REAL_BEGIN(denorm_assist) @@ -1920,7 +1906,7 @@ EXC_REAL_BEGIN(cbe_maintenance, 0x1600, 0x100) INT_HANDLER cbe_maintenance, 0x1600, ool=1, hsrr=EXC_HV, kvm=1 EXC_REAL_END(cbe_maintenance, 0x1600, 0x100) EXC_VIRT_NONE(0x5600, 0x100) -INT_KVM_HANDLER 0x1600, EXC_HV, PACA_EXGEN, 1 +INT_KVM_HANDLER cbe_maintenance, 0x1600, EXC_HV, PACA_EXGEN, 1 EXC_COMMON(cbe_maintenance_common, 0x1600, cbe_maintenance_exception) #else /* CONFIG_CBE_RAS */ EXC_REAL_NONE(0x1600, 0x100) @@ -1934,7 +1920,7 @@ EXC_REAL_END(altivec_assist, 0x1700, 0x100) EXC_VIRT_BEGIN(altivec_assist, 0x5700, 0x100) INT_HANDLER altivec_assist, 0x1700, virt=1 EXC_VIRT_END(altivec_assist, 0x5700, 0x100) -INT_KVM_HANDLER 0x1700, EXC_STD, PACA_EXGEN, 0 +INT_KVM_HANDLER altivec_assist, 0x1700, EXC_STD, PACA_EXGEN, 0 #ifdef CONFIG_ALTIVEC EXC_COMMON(altivec_assist_common, 0x1700, altivec_assist_exception) #else @@ -1947,7 +1933,7 @@ EXC_REAL_BEGIN(cbe_thermal, 0x1800, 0x100) INT_HANDLER cbe_thermal, 0x1800, ool=1, hsrr=EXC_HV, kvm=1 EXC_REAL_END(cbe_thermal, 0x1800, 0x100) EXC_VIRT_NONE(0x5800, 0x100) -INT_KVM_HANDLER 0x1800, EXC_HV, PACA_EXGEN, 1 +INT_KVM_HANDLER cbe_thermal, 0x1800, EXC_HV, PACA_EXGEN, 1 EXC_COMMON(cbe_thermal_common, 0x1800, cbe_thermal_exception) #else /* CONFIG_CBE_RAS */ EXC_REAL_NONE(0x1800, 0x100) -- cgit v1.2.3 From 9b123d1ea23701bc00ebf712f1e03b25b8195eeb Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 2 Aug 2019 20:57:01 +1000 Subject: powerpc/64s/exception: reduce page fault unnecessary loads This avoids 3 loads in the radix page fault case, 1 load in the hash fault case, and 2 loads in the hash miss page fault case. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190802105709.27696-37-npiggin@gmail.com --- arch/powerpc/kernel/exceptions-64s.S | 38 ++++++++++++++++------------------- arch/powerpc/mm/book3s64/hash_utils.c | 4 ++-- 2 files changed, 19 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index f54a38417904..d0018dd17e0a 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1159,11 +1159,11 @@ EXC_COMMON_BEGIN(data_access_common) * EX_DAR and EX_DSISR have saved DAR/DSISR */ INT_COMMON 0x300, PACA_EXGEN, 1, 1, 1, 1, 1 - ld r12,_MSR(r1) - ld r3,_DAR(r1) - ld r4,_DSISR(r1) - li r5,0x300 + ld r4,_DAR(r1) + ld r5,_DSISR(r1) BEGIN_MMU_FTR_SECTION + ld r6,_MSR(r1) + li r3,0x300 b do_hash_page /* Try to handle as hpte fault */ MMU_FTR_SECTION_ELSE b handle_page_fault @@ -1211,11 +1211,11 @@ EXC_VIRT_END(instruction_access, 0x4400, 0x80) INT_KVM_HANDLER instruction_access, 0x400, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(instruction_access_common) INT_COMMON 0x400, PACA_EXGEN, 1, 1, 1, 2, 2 - ld r12,_MSR(r1) - ld r3,_DAR(r1) - ld r4,_DSISR(r1) - li r5,0x400 + ld r4,_DAR(r1) + ld r5,_DSISR(r1) BEGIN_MMU_FTR_SECTION + ld r6,_MSR(r1) + li r3,0x400 b do_hash_page /* Try to handle as hpte fault */ MMU_FTR_SECTION_ELSE b handle_page_fault @@ -2260,7 +2260,7 @@ do_hash_page: #ifdef CONFIG_PPC_BOOK3S_64 lis r0,(DSISR_BAD_FAULT_64S | DSISR_DABRMATCH | DSISR_KEYFAULT)@h ori r0,r0,DSISR_BAD_FAULT_64S@l - and. r0,r4,r0 /* weird error? */ + and. r0,r5,r0 /* weird error? */ bne- handle_page_fault /* if not, try to insert a HPTE */ ld r11, PACA_THREAD_INFO(r13) lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */ @@ -2268,15 +2268,13 @@ do_hash_page: bne 77f /* then don't call hash_page now */ /* - * r3 contains the faulting address - * r4 msr - * r5 contains the trap number - * r6 contains dsisr + * r3 contains the trap number + * r4 contains the faulting address + * r5 contains dsisr + * r6 msr * * at return r3 = 0 for success, 1 for page fault, negative for error */ - mr r4,r12 - ld r6,_DSISR(r1) bl __hash_page /* build HPTE if possible */ cmpdi r3,0 /* see if __hash_page succeeded */ @@ -2286,16 +2284,15 @@ do_hash_page: /* Error */ blt- 13f - /* Reload DSISR into r4 for the DABR check below */ - ld r4,_DSISR(r1) + /* Reload DAR/DSISR into r4/r5 for the DABR check below */ + ld r4,_DAR(r1) + ld r5,_DSISR(r1) #endif /* CONFIG_PPC_BOOK3S_64 */ /* Here we have a page fault that hash_page can't handle. */ handle_page_fault: -11: andis. r0,r4,DSISR_DABRMATCH@h +11: andis. r0,r5,DSISR_DABRMATCH@h bne- handle_dabr_fault - ld r4,_DAR(r1) - ld r5,_DSISR(r1) addi r3,r1,STACK_FRAME_OVERHEAD bl do_page_fault cmpdi r3,0 @@ -2342,7 +2339,6 @@ handle_dabr_fault: * the access, or panic if there isn't a handler. */ 77: bl save_nvgprs - mr r4,r3 addi r3,r1,STACK_FRAME_OVERHEAD li r5,SIGSEGV bl bad_page_fault diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 4d0bb194ed70..fe99bba39b69 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1462,8 +1462,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap, } EXPORT_SYMBOL_GPL(hash_page); -int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, - unsigned long dsisr) +int __hash_page(unsigned long trap, unsigned long ea, unsigned long dsisr, + unsigned long msr) { unsigned long access = _PAGE_PRESENT | _PAGE_READ; unsigned long flags = 0; -- cgit v1.2.3 From 799abe283e5103d48e079149579b4f167c95ea0e Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Tue, 3 Sep 2019 20:15:52 +1000 Subject: powerpc/eeh: Clean up EEH PEs after recovery finishes When the last device in an eeh_pe is removed the eeh_pe structure itself (and any empty parents) are freed since they are no longer needed. This results in a crash when a hotplug driver is involved since the following may occur: 1. Device is suprise removed. 2. Driver performs an MMIO, which fails and queues and eeh_event. 3. Hotplug driver receives a hotplug interrupt and removes any pci_devs that were under the slot. 4. pci_dev is torn down and the eeh_pe is freed. 5. The EEH event handler thread processes the eeh_event and crashes since the eeh_pe pointer in the eeh_event structure is no longer valid. Crashing is generally considered poor form. Instead of doing that use the fact PEs are marked as EEH_PE_INVALID to keep them around until the end of the recovery cycle, at which point we can safely prune any empty PEs. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190903101605.2890-2-oohall@gmail.com --- arch/powerpc/kernel/eeh_driver.c | 36 ++++++++++++++++++++++++++++++++++-- arch/powerpc/kernel/eeh_event.c | 8 ++++++++ arch/powerpc/kernel/eeh_pe.c | 23 ++++++++++++++++++++++- 3 files changed, 64 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index a31cd32c4ce9..75266156943f 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -734,6 +734,33 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, */ #define MAX_WAIT_FOR_RECOVERY 300 + +/* Walks the PE tree after processing an event to remove any stale PEs. + * + * NB: This needs to be recursive to ensure the leaf PEs get removed + * before their parents do. Although this is possible to do recursively + * we don't since this is easier to read and we need to garantee + * the leaf nodes will be handled first. + */ +static void eeh_pe_cleanup(struct eeh_pe *pe) +{ + struct eeh_pe *child_pe, *tmp; + + list_for_each_entry_safe(child_pe, tmp, &pe->child_list, child) + eeh_pe_cleanup(child_pe); + + if (pe->state & EEH_PE_KEEP) + return; + + if (!(pe->state & EEH_PE_INVALID)) + return; + + if (list_empty(&pe->edevs) && list_empty(&pe->child_list)) { + list_del(&pe->child); + kfree(pe); + } +} + /** * eeh_handle_normal_event - Handle EEH events on a specific PE * @pe: EEH PE - which should not be used after we return, as it may @@ -772,8 +799,6 @@ void eeh_handle_normal_event(struct eeh_pe *pe) return; } - eeh_pe_state_mark(pe, EEH_PE_RECOVERING); - eeh_pe_update_time_stamp(pe); pe->freeze_count++; if (pe->freeze_count > eeh_max_freezes) { @@ -963,6 +988,12 @@ void eeh_handle_normal_event(struct eeh_pe *pe) return; } } + + /* + * Clean up any PEs without devices. While marked as EEH_PE_RECOVERYING + * we don't want to modify the PE tree structure so we do it here. + */ + eeh_pe_cleanup(pe); eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); } @@ -1035,6 +1066,7 @@ void eeh_handle_special_event(void) */ if (rc == EEH_NEXT_ERR_FROZEN_PE || rc == EEH_NEXT_ERR_FENCED_PHB) { + eeh_pe_state_mark(pe, EEH_PE_RECOVERING); eeh_handle_normal_event(pe); } else { pci_lock_rescan_remove(); diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c index 64cfbe41174b..e36653e5f76b 100644 --- a/arch/powerpc/kernel/eeh_event.c +++ b/arch/powerpc/kernel/eeh_event.c @@ -121,6 +121,14 @@ int __eeh_send_failure_event(struct eeh_pe *pe) } event->pe = pe; + /* + * Mark the PE as recovering before inserting it in the queue. + * This prevents the PE from being free()ed by a hotplug driver + * while the PE is sitting in the event queue. + */ + if (pe) + eeh_pe_state_mark(pe, EEH_PE_RECOVERING); + /* We may or may not be called in an interrupt context */ spin_lock_irqsave(&eeh_eventlist_lock, flags); list_add(&event->list, &eeh_eventlist); diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 1a6254bcf056..177852e39a25 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -470,6 +470,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) int eeh_rmv_from_parent_pe(struct eeh_dev *edev) { struct eeh_pe *pe, *parent, *child; + bool keep, recover; int cnt; pe = eeh_dev_to_pe(edev); @@ -490,10 +491,21 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev) */ while (1) { parent = pe->parent; + + /* PHB PEs should never be removed */ if (pe->type & EEH_PE_PHB) break; - if (!(pe->state & EEH_PE_KEEP)) { + /* + * XXX: KEEP is set while resetting a PE. I don't think it's + * ever set without RECOVERING also being set. I could + * be wrong though so catch that with a WARN. + */ + keep = !!(pe->state & EEH_PE_KEEP); + recover = !!(pe->state & EEH_PE_RECOVERING); + WARN_ON(keep && !recover); + + if (!keep && !recover) { if (list_empty(&pe->edevs) && list_empty(&pe->child_list)) { list_del(&pe->child); @@ -502,6 +514,15 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev) break; } } else { + /* + * Mark the PE as invalid. At the end of the recovery + * process any invalid PEs will be garbage collected. + * + * We need to delay the free()ing of them since we can + * remove edev's while traversing the PE tree which + * might trigger the removal of a PE and we can't + * deal with that (yet). + */ if (list_empty(&pe->edevs)) { cnt = 0; list_for_each_entry(child, &pe->child_list, child) { -- cgit v1.2.3 From 5ef753ae435a5cea8af5c84a65fc5dd30b773040 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Tue, 3 Sep 2019 20:15:53 +1000 Subject: powerpc/eeh: Fix race when freeing PDNs When hot-adding devices we rely on the hotplug driver to create pci_dn's for the devices under the hotplug slot. Converse, when hot-removing the driver will remove the pci_dn's that it created. This is a problem because the pci_dev is still live until it's refcount drops to zero. This can happen if the driver is slow to tear down it's internal state. Ideally, the driver would not attempt to perform any config accesses to the device once it's been marked as removed, but sometimes it happens. As a result, we might attempt to access the pci_dn for a device that has been torn down and the kernel may crash as a result. To fix this, don't free the pci_dn unless the corresponding pci_dev has been released. If the pci_dev is still live, then we mark the pci_dn with a flag that indicates the pci_dev's release function should free it. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190903101605.2890-3-oohall@gmail.com --- arch/powerpc/include/asm/pci-bridge.h | 1 + arch/powerpc/kernel/pci-hotplug.c | 7 +++++++ arch/powerpc/kernel/pci_dn.c | 21 +++++++++++++++++++-- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 8dad1fdf4bd2..ea6ec65970ef 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -183,6 +183,7 @@ struct iommu_table; struct pci_dn { int flags; #define PCI_DN_FLAG_IOV_VF 0x01 +#define PCI_DN_FLAG_DEAD 0x02 /* Device has been hot-removed */ int busno; /* pci bus number */ int devfn; /* pci device and function number */ diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c index 0b0cf8168b47..fc62c4bc47b1 100644 --- a/arch/powerpc/kernel/pci-hotplug.c +++ b/arch/powerpc/kernel/pci-hotplug.c @@ -55,11 +55,18 @@ EXPORT_SYMBOL_GPL(pci_find_bus_by_node); void pcibios_release_device(struct pci_dev *dev) { struct pci_controller *phb = pci_bus_to_host(dev->bus); + struct pci_dn *pdn = pci_get_pdn(dev); eeh_remove_device(dev); if (phb->controller_ops.release_device) phb->controller_ops.release_device(dev); + + /* free()ing the pci_dn has been deferred to us, do it now */ + if (pdn && (pdn->flags & PCI_DN_FLAG_DEAD)) { + pci_dbg(dev, "freeing dead pdn\n"); + kfree(pdn); + } } /** diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index c4c8c237a106..9524009ca1ae 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -323,6 +323,7 @@ void pci_remove_device_node_info(struct device_node *dn) { struct pci_dn *pdn = dn ? PCI_DN(dn) : NULL; struct device_node *parent; + struct pci_dev *pdev; #ifdef CONFIG_EEH struct eeh_dev *edev = pdn_to_eeh_dev(pdn); @@ -336,12 +337,28 @@ void pci_remove_device_node_info(struct device_node *dn) WARN_ON(!list_empty(&pdn->child_list)); list_del(&pdn->list); + /* Drop the parent pci_dn's ref to our backing dt node */ parent = of_get_parent(dn); if (parent) of_node_put(parent); - dn->data = NULL; - kfree(pdn); + /* + * At this point we *might* still have a pci_dev that was + * instantiated from this pci_dn. So defer free()ing it until + * the pci_dev's release function is called. + */ + pdev = pci_get_domain_bus_and_slot(pdn->phb->global_number, + pdn->busno, pdn->devfn); + if (pdev) { + /* NB: pdev has a ref to dn */ + pci_dbg(pdev, "marked pdn (from %pOF) as dead\n", dn); + pdn->flags |= PCI_DN_FLAG_DEAD; + } else { + dn->data = NULL; + kfree(pdn); + } + + pci_dev_put(pdev); } EXPORT_SYMBOL_GPL(pci_remove_device_node_info); -- cgit v1.2.3 From 38ddc011478e573c9ab4e3e9bc54cc5bfc542351 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Tue, 3 Sep 2019 20:15:54 +1000 Subject: powerpc/eeh: Make permanently failed devices non-actionable If a device is torn down by a hotplug slot driver it's marked as removed and marked as permaantly failed. There's no point in trying to recover a permernantly failed device so it should be considered un-actionable. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190903101605.2890-4-oohall@gmail.com --- arch/powerpc/kernel/eeh_driver.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 75266156943f..18a69fac4d80 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -96,8 +96,16 @@ static bool eeh_dev_removed(struct eeh_dev *edev) static bool eeh_edev_actionable(struct eeh_dev *edev) { - return (edev->pdev && !eeh_dev_removed(edev) && - !eeh_pe_passed(edev->pe)); + if (!edev->pdev) + return false; + if (edev->pdev->error_state == pci_channel_io_perm_failure) + return false; + if (eeh_dev_removed(edev)) + return false; + if (eeh_pe_passed(edev->pe)) + return false; + + return true; } /** -- cgit v1.2.3 From b104af5a7687060792ca398bb86b033057afce75 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Tue, 3 Sep 2019 20:15:55 +1000 Subject: powerpc/eeh: Check slot presence state in eeh_handle_normal_event() When a device is surprise removed while undergoing IO we will probably get an EEH PE freeze due to MMIO timeouts and other errors. When a freeze is detected we send a recovery event to the EEH worker thread which will notify drivers, and perform recovery as needed. In the event of a hot-remove we don't want recovery to occur since there isn't a device to recover. The recovery process is fairly long due to the number of wait states (required by PCIe) which causes problems when devices are removed and replaced (e.g. hot swapping of U.2 NVMe drives). To determine if we need to skip the recovery process we can use the get_adapter_state() operation of the hotplug_slot to determine if the slot contains a device or not, and if the slot is empty we can skip recovery entirely. One thing to note is that the slot being EEH frozen does not prevent the hotplug driver from working. We don't have the EEH recovery thread remove any of the devices since it's assumed that the hotplug driver will handle tearing down the slot state. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190903101605.2890-5-oohall@gmail.com --- arch/powerpc/kernel/eeh_driver.c | 60 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 18a69fac4d80..52ce7584af43 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -769,6 +770,46 @@ static void eeh_pe_cleanup(struct eeh_pe *pe) } } +/** + * eeh_check_slot_presence - Check if a device is still present in a slot + * @pdev: pci_dev to check + * + * This function may return a false positive if we can't determine the slot's + * presence state. This might happen for for PCIe slots if the PE containing + * the upstream bridge is also frozen, or the bridge is part of the same PE + * as the device. + * + * This shouldn't happen often, but you might see it if you hotplug a PCIe + * switch. + */ +static bool eeh_slot_presence_check(struct pci_dev *pdev) +{ + const struct hotplug_slot_ops *ops; + struct pci_slot *slot; + u8 state; + int rc; + + if (!pdev) + return false; + + if (pdev->error_state == pci_channel_io_perm_failure) + return false; + + slot = pdev->slot; + if (!slot || !slot->hotplug) + return true; + + ops = slot->hotplug->ops; + if (!ops || !ops->get_adapter_status) + return true; + + rc = ops->get_adapter_status(slot->hotplug, &state); + if (rc) + return true; + + return !!state; +} + /** * eeh_handle_normal_event - Handle EEH events on a specific PE * @pe: EEH PE - which should not be used after we return, as it may @@ -799,6 +840,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) enum pci_ers_result result = PCI_ERS_RESULT_NONE; struct eeh_rmv_data rmv_data = {LIST_HEAD_INIT(rmv_data.removed_vf_list), 0}; + int devices = 0; bus = eeh_pe_bus_get(pe); if (!bus) { @@ -807,6 +849,23 @@ void eeh_handle_normal_event(struct eeh_pe *pe) return; } + /* + * When devices are hot-removed we might get an EEH due to + * a driver attempting to touch the MMIO space of a removed + * device. In this case we don't have a device to recover + * so suppress the event if we can't find any present devices. + * + * The hotplug driver should take care of tearing down the + * device itself. + */ + eeh_for_each_pe(pe, tmp_pe) + eeh_pe_for_each_dev(tmp_pe, edev, tmp) + if (eeh_slot_presence_check(edev->pdev)) + devices++; + + if (!devices) + goto out; /* nothing to recover */ + eeh_pe_update_time_stamp(pe); pe->freeze_count++; if (pe->freeze_count > eeh_max_freezes) { @@ -997,6 +1056,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) } } +out: /* * Clean up any PEs without devices. While marked as EEH_PE_RECOVERYING * we don't want to modify the PE tree structure so we do it here. -- cgit v1.2.3 From 25baf3d81614b0b8ca8958f4d6f111ccaaaad578 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Tue, 3 Sep 2019 20:15:56 +1000 Subject: powerpc/eeh: Defer printing stack trace Currently we print a stack trace in the event handler to help with debugging EEH issues. In the case of suprise hot-unplug this is unneeded, so we want to prevent printing the stack trace unless we know it's due to an actual device error. To accomplish this, we can save a stack trace at the point of detection and only print it once the EEH recovery handler has determined the freeze was due to an actual error. Since the whole point of this is to prevent spurious EEH output we also move a few prints out of the detection thread, or mark them as pr_debug so anyone interested can get output from the eeh_check_dev_failure() if they want. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190903101605.2890-6-oohall@gmail.com --- arch/powerpc/include/asm/eeh.h | 11 +++++++++++ arch/powerpc/kernel/eeh.c | 15 ++++----------- arch/powerpc/kernel/eeh_driver.c | 38 +++++++++++++++++++++++++++++++++++++- arch/powerpc/kernel/eeh_event.c | 26 ++++++++++++-------------- 4 files changed, 64 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index c13119a5e69b..9d0e1694a94d 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -88,6 +88,17 @@ struct eeh_pe { struct list_head child_list; /* List of PEs below this PE */ struct list_head child; /* Memb. child_list/eeh_phb_pe */ struct list_head edevs; /* List of eeh_dev in this PE */ + + /* + * Saved stack trace. When we find a PE freeze in eeh_dev_check_failure + * the stack trace is saved here so we can print it in the recovery + * thread if it turns out to due to a real problem rather than + * a hot-remove. + * + * A max of 64 entries might be overkill, but it also might not be. + */ + unsigned long stack_trace[64]; + int trace_entries; }; #define eeh_pe_for_each_dev(pe, edev, tmp) \ diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 7b2755f5c6fd..398def61f8a6 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -420,11 +420,9 @@ static int eeh_phb_check_failure(struct eeh_pe *pe) eeh_pe_mark_isolated(phb_pe); eeh_serialize_unlock(flags); - pr_err("EEH: PHB#%x failure detected, location: %s\n", + pr_debug("EEH: PHB#%x failure detected, location: %s\n", phb_pe->phb->global_number, eeh_pe_loc_get(phb_pe)); - dump_stack(); eeh_send_failure_event(phb_pe); - return 1; out: eeh_serialize_unlock(flags); @@ -451,7 +449,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev) unsigned long flags; struct device_node *dn; struct pci_dev *dev; - struct eeh_pe *pe, *parent_pe, *phb_pe; + struct eeh_pe *pe, *parent_pe; int rc = 0; const char *location = NULL; @@ -581,13 +579,8 @@ int eeh_dev_check_failure(struct eeh_dev *edev) * a stack trace will help the device-driver authors figure * out what happened. So print that out. */ - phb_pe = eeh_phb_pe_get(pe->phb); - pr_err("EEH: Frozen PHB#%x-PE#%x detected\n", - pe->phb->global_number, pe->addr); - pr_err("EEH: PE location: %s, PHB location: %s\n", - eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe)); - dump_stack(); - + pr_debug("EEH: %s: Frozen PHB#%x-PE#%x detected\n", + __func__, pe->phb->global_number, pe->addr); eeh_send_failure_event(pe); return 1; diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 52ce7584af43..0d34cc12c529 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -863,8 +863,44 @@ void eeh_handle_normal_event(struct eeh_pe *pe) if (eeh_slot_presence_check(edev->pdev)) devices++; - if (!devices) + if (!devices) { + pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n", + pe->phb->global_number, pe->addr); goto out; /* nothing to recover */ + } + + /* Log the event */ + if (pe->type & EEH_PE_PHB) { + pr_err("EEH: PHB#%x failure detected, location: %s\n", + pe->phb->global_number, eeh_pe_loc_get(pe)); + } else { + struct eeh_pe *phb_pe = eeh_phb_pe_get(pe->phb); + + pr_err("EEH: Frozen PHB#%x-PE#%x detected\n", + pe->phb->global_number, pe->addr); + pr_err("EEH: PE location: %s, PHB location: %s\n", + eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe)); + } + + /* + * Print the saved stack trace now that we've verified there's + * something to recover. + */ + if (pe->trace_entries) { + void **ptrs = (void **) pe->stack_trace; + int i; + + pr_err("EEH: Frozen PHB#%x-PE#%x detected\n", + pe->phb->global_number, pe->addr); + + /* FIXME: Use the same format as dump_stack() */ + pr_err("EEH: Call Trace:\n"); + for (i = 0; i < pe->trace_entries; i++) + pr_err("EEH: [%pK] %pS\n", ptrs[i], ptrs[i]); + + pe->trace_entries = 0; + } + eeh_pe_update_time_stamp(pe); pe->freeze_count++; diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c index e36653e5f76b..1d55486adb0f 100644 --- a/arch/powerpc/kernel/eeh_event.c +++ b/arch/powerpc/kernel/eeh_event.c @@ -40,7 +40,6 @@ static int eeh_event_handler(void * dummy) { unsigned long flags; struct eeh_event *event; - struct eeh_pe *pe; while (!kthread_should_stop()) { if (wait_for_completion_interruptible(&eeh_eventlist_event)) @@ -59,19 +58,10 @@ static int eeh_event_handler(void * dummy) continue; /* We might have event without binding PE */ - pe = event->pe; - if (pe) { - if (pe->type & EEH_PE_PHB) - pr_info("EEH: Detected error on PHB#%x\n", - pe->phb->global_number); - else - pr_info("EEH: Detected PCI bus error on " - "PHB#%x-PE#%x\n", - pe->phb->global_number, pe->addr); - eeh_handle_normal_event(pe); - } else { + if (event->pe) + eeh_handle_normal_event(event->pe); + else eeh_handle_special_event(); - } kfree(event); } @@ -126,8 +116,16 @@ int __eeh_send_failure_event(struct eeh_pe *pe) * This prevents the PE from being free()ed by a hotplug driver * while the PE is sitting in the event queue. */ - if (pe) + if (pe) { + /* + * Save the current stack trace so we can dump it from the + * event handler thread. + */ + pe->trace_entries = stack_trace_save(pe->stack_trace, + ARRAY_SIZE(pe->stack_trace), 0); + eeh_pe_state_mark(pe, EEH_PE_RECOVERING); + } /* We may or may not be called in an interrupt context */ spin_lock_irqsave(&eeh_eventlist_lock, flags); -- cgit v1.2.3 From 50554533358571d0c23a88fa1fe3d0b44c6fc7b5 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Tue, 3 Sep 2019 20:15:57 +1000 Subject: powerpc/eeh: Remove stale CAPI comment Support for switching CAPI cards into and out of CAPI mode was removed a while ago. Drop the comment since it's no longer relevant. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190903101605.2890-7-oohall@gmail.com --- arch/powerpc/platforms/powernv/eeh-powernv.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index e7b867912f24..94e26d56ecd2 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -1125,13 +1125,6 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option) return -EIO; } - /* - * If dealing with the root bus (or the bus underneath the - * root port), we reset the bus underneath the root port. - * - * The cxl driver depends on this behaviour for bi-modal card - * switching. - */ if (pci_is_root_bus(bus) || pci_is_root_bus(bus->parent)) return pnv_eeh_root_reset(hose, option); -- cgit v1.2.3 From 98fd32cde59ed71c2c9a6da4101e85f50c9425f3 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Tue, 3 Sep 2019 20:15:58 +1000 Subject: powernv/eeh: Use generic code to handle hot resets When we reset PCI devices managed by a hotplug driver the reset may generate spurious hotplug events that cause the PCI device we're resetting to be torn down accidently. This is a problem for EEH (when the driver is EEH aware) since we want to leave the OS PCI device state intact so that the device can be re-set without losing any resources (network, disks, etc) provided by the driver. Generic PCI code provides the pci_bus_error_reset() function to handle resetting a PCI Device (or bus) by using the reset method provided by the hotplug slot driver. We can use this function if the EEH core has requested a hot reset (common case) without tripping over the hotplug driver. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190903101605.2890-8-oohall@gmail.com --- arch/powerpc/platforms/powernv/eeh-powernv.c | 38 +++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 94e26d56ecd2..6bc24a47e9ef 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -34,6 +34,7 @@ #include "powernv.h" #include "pci.h" +#include "../../../../drivers/pci/pci.h" static int eeh_event_irq = -EINVAL; @@ -849,7 +850,7 @@ static int __pnv_eeh_bridge_reset(struct pci_dev *dev, int option) int aer = edev ? edev->aer_cap : 0; u32 ctrl; - pr_debug("%s: Reset PCI bus %04x:%02x with option %d\n", + pr_debug("%s: Secondary Reset PCI bus %04x:%02x with option %d\n", __func__, pci_domain_nr(dev->bus), dev->bus->number, option); @@ -907,6 +908,10 @@ static int pnv_eeh_bridge_reset(struct pci_dev *pdev, int option) if (!dn || !of_get_property(dn, "ibm,reset-by-firmware", NULL)) return __pnv_eeh_bridge_reset(pdev, option); + pr_debug("%s: FW reset PCI bus %04x:%02x with option %d\n", + __func__, pci_domain_nr(pdev->bus), + pdev->bus->number, option); + switch (option) { case EEH_RESET_FUNDAMENTAL: scope = OPAL_RESET_PCI_FUNDAMENTAL; @@ -1125,10 +1130,37 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option) return -EIO; } - if (pci_is_root_bus(bus) || - pci_is_root_bus(bus->parent)) + if (pci_is_root_bus(bus)) return pnv_eeh_root_reset(hose, option); + /* + * For hot resets try use the generic PCI error recovery reset + * functions. These correctly handles the case where the secondary + * bus is behind a hotplug slot and it will use the slot provided + * reset methods to prevent spurious hotplug events during the reset. + * + * Fundemental resets need to be handled internally to EEH since the + * PCI core doesn't really have a concept of a fundemental reset, + * mainly because there's no standard way to generate one. Only a + * few devices require an FRESET so it should be fine. + */ + if (option != EEH_RESET_FUNDAMENTAL) { + /* + * NB: Skiboot and pnv_eeh_bridge_reset() also no-op the + * de-assert step. It's like the OPAL reset API was + * poorly designed or something... + */ + if (option == EEH_RESET_DEACTIVATE) + return 0; + + rc = pci_bus_error_reset(bus->self); + if (!rc) + return 0; + } + + /* otherwise, use the generic bridge reset. this might call into FW */ + if (pci_is_root_bus(bus->parent)) + return pnv_eeh_root_reset(hose, option); return pnv_eeh_bridge_reset(bus->self, option); } -- cgit v1.2.3 From 7fd1fe4e4811f52b854647b8ff324745135224a5 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Tue, 3 Sep 2019 20:15:59 +1000 Subject: pci-hotplug/pnv_php: Add a reset_slot() callback When performing EEH recovery of devices in a hotplug slot we need to use the slot driver's ->reset_slot() callback to prevent spurious hotplug events due to spurious DLActive and PresDet change interrupts. Add a reset_slot() callback to pnv_php so we can handle recovery of devices in pnv_php managed slots. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190903101605.2890-9-oohall@gmail.com --- drivers/pci/hotplug/pnv_php.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c index 6758fd7c382e..b0e243dabf77 100644 --- a/drivers/pci/hotplug/pnv_php.c +++ b/drivers/pci/hotplug/pnv_php.c @@ -511,6 +511,37 @@ scan: return 0; } +static int pnv_php_reset_slot(struct hotplug_slot *slot, int probe) +{ + struct pnv_php_slot *php_slot = to_pnv_php_slot(slot); + struct pci_dev *bridge = php_slot->pdev; + uint16_t sts; + + /* + * The CAPI folks want pnv_php to drive OpenCAPI slots + * which don't have a bridge. Only claim to support + * reset_slot() if we have a bridge device (for now...) + */ + if (probe) + return !bridge; + + /* mask our interrupt while resetting the bridge */ + if (php_slot->irq > 0) + disable_irq(php_slot->irq); + + pci_bridge_secondary_bus_reset(bridge); + + /* clear any state changes that happened due to the reset */ + pcie_capability_read_word(php_slot->pdev, PCI_EXP_SLTSTA, &sts); + sts &= (PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_DLLSC); + pcie_capability_write_word(php_slot->pdev, PCI_EXP_SLTSTA, sts); + + if (php_slot->irq > 0) + enable_irq(php_slot->irq); + + return 0; +} + static int pnv_php_enable_slot(struct hotplug_slot *slot) { struct pnv_php_slot *php_slot = to_pnv_php_slot(slot); @@ -548,6 +579,7 @@ static const struct hotplug_slot_ops php_slot_ops = { .set_attention_status = pnv_php_set_attention_state, .enable_slot = pnv_php_enable_slot, .disable_slot = pnv_php_disable_slot, + .reset_slot = pnv_php_reset_slot, }; static void pnv_php_release(struct pnv_php_slot *php_slot) @@ -721,6 +753,12 @@ static irqreturn_t pnv_php_interrupt(int irq, void *data) pcie_capability_read_word(pdev, PCI_EXP_SLTSTA, &sts); sts &= (PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_DLLSC); pcie_capability_write_word(pdev, PCI_EXP_SLTSTA, sts); + + pci_dbg(pdev, "PCI slot [%s]: HP int! DLAct: %d, PresDet: %d\n", + php_slot->name, + !!(sts & PCI_EXP_SLTSTA_DLLSC), + !!(sts & PCI_EXP_SLTSTA_PDC)); + if (sts & PCI_EXP_SLTSTA_DLLSC) { pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lsts); added = !!(lsts & PCI_EXP_LNKSTA_DLLLA); @@ -735,6 +773,7 @@ static irqreturn_t pnv_php_interrupt(int irq, void *data) added = !!(presence == OPAL_PCI_SLOT_PRESENT); } else { + pci_dbg(pdev, "PCI slot [%s]: Spurious IRQ?\n", php_slot->name); return IRQ_NONE; } -- cgit v1.2.3 From a839bd87a250068521393ca3804f4a5274e103ff Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Tue, 3 Sep 2019 20:16:00 +1000 Subject: pci-hotplug/pnv_php: Add support for IODA3 Power9 PHBs Currently we check that an IODA2 compatible PHB is upstream of this slot. This is mainly to avoid pnv_php creating slots for the various "virtual PHBs" that we create for NVLink. There's no real need for this restriction so allow it on IODA3. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190903101605.2890-10-oohall@gmail.com --- arch/powerpc/platforms/powernv/pci.c | 3 ++- drivers/pci/hotplug/pnv_php.c | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 6104418c9ad5..2825d004dece 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -54,7 +54,8 @@ int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id) break; } - if (!of_device_is_compatible(parent, "ibm,ioda2-phb")) { + if (!of_device_is_compatible(parent, "ibm,ioda2-phb") && + !of_device_is_compatible(parent, "ibm,ioda3-phb")) { of_node_put(parent); continue; } diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c index b0e243dabf77..6fdf8b74cb0a 100644 --- a/drivers/pci/hotplug/pnv_php.c +++ b/drivers/pci/hotplug/pnv_php.c @@ -994,6 +994,9 @@ static int __init pnv_php_init(void) for_each_compatible_node(dn, NULL, "ibm,ioda2-phb") pnv_php_register(dn); + for_each_compatible_node(dn, NULL, "ibm,ioda3-phb") + pnv_php_register(dn); + return 0; } @@ -1003,6 +1006,9 @@ static void __exit pnv_php_exit(void) for_each_compatible_node(dn, NULL, "ibm,ioda2-phb") pnv_php_unregister(dn); + + for_each_compatible_node(dn, NULL, "ibm,ioda3-phb") + pnv_php_unregister(dn); } module_init(pnv_php_init); -- cgit v1.2.3 From 018c49e999ac4680e59de236a4a8cde209e8cc98 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Tue, 3 Sep 2019 20:16:01 +1000 Subject: pci-hotplug/pnv_php: Add attention indicator support pnv_php is generally used with PCIe bridges which provide a native interface for setting the attention and power indicator LEDs. Wire up those interfaces even if firmware does not have support for them (yet...) Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190903101605.2890-11-oohall@gmail.com --- drivers/pci/hotplug/pnv_php.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c index 6fdf8b74cb0a..d7b2b47bc33e 100644 --- a/drivers/pci/hotplug/pnv_php.c +++ b/drivers/pci/hotplug/pnv_php.c @@ -419,9 +419,21 @@ static int pnv_php_get_attention_state(struct hotplug_slot *slot, u8 *state) static int pnv_php_set_attention_state(struct hotplug_slot *slot, u8 state) { struct pnv_php_slot *php_slot = to_pnv_php_slot(slot); + struct pci_dev *bridge = php_slot->pdev; + u16 new, mask; - /* FIXME: Make it real once firmware supports it */ php_slot->attention_state = state; + if (!bridge) + return 0; + + mask = PCI_EXP_SLTCTL_AIC; + + if (state) + new = PCI_EXP_SLTCTL_ATTN_IND_ON; + else + new = PCI_EXP_SLTCTL_ATTN_IND_OFF; + + pcie_capability_clear_and_set_word(bridge, PCI_EXP_SLTCTL, mask, new); return 0; } -- cgit v1.2.3 From aeff27c121ba7397c21a47c749e2b5be07f48c17 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Tue, 3 Sep 2019 20:16:02 +1000 Subject: powerpc/eeh: Set attention indicator while recovering I am the RAS team. Hear me roar. Roar. On a more serious note, being able to locate failed devices can be helpful. Set the attention indicator if the slot supports it once we've determined the device is present and only clear it if the device is fully recovered. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190903101605.2890-12-oohall@gmail.com --- arch/powerpc/kernel/eeh_driver.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 0d34cc12c529..80bd157fcb45 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -803,6 +803,10 @@ static bool eeh_slot_presence_check(struct pci_dev *pdev) if (!ops || !ops->get_adapter_status) return true; + /* set the attention indicator while we've got the slot ops */ + if (ops->set_attention_status) + ops->set_attention_status(slot->hotplug, 1); + rc = ops->get_adapter_status(slot->hotplug, &state); if (rc) return true; @@ -810,6 +814,28 @@ static bool eeh_slot_presence_check(struct pci_dev *pdev) return !!state; } +static void eeh_clear_slot_attention(struct pci_dev *pdev) +{ + const struct hotplug_slot_ops *ops; + struct pci_slot *slot; + + if (!pdev) + return; + + if (pdev->error_state == pci_channel_io_perm_failure) + return; + + slot = pdev->slot; + if (!slot || !slot->hotplug) + return; + + ops = slot->hotplug->ops; + if (!ops || !ops->set_attention_status) + return; + + ops->set_attention_status(slot->hotplug, 0); +} + /** * eeh_handle_normal_event - Handle EEH events on a specific PE * @pe: EEH PE - which should not be used after we return, as it may @@ -1098,6 +1124,12 @@ out: * we don't want to modify the PE tree structure so we do it here. */ eeh_pe_cleanup(pe); + + /* clear the slot attention LED for all recovered devices */ + eeh_for_each_pe(pe, tmp_pe) + eeh_pe_for_each_dev(tmp_pe, edev, tmp) + eeh_clear_slot_attention(edev->pdev); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); } -- cgit v1.2.3 From 22cda7c1680c1ddfe941adae45e7e7ef52d0e411 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Tue, 3 Sep 2019 20:16:03 +1000 Subject: powerpc/eeh: Add debugfs interface to run an EEH check Detecting an frozen EEH PE usually occurs when an MMIO load returns a 0xFFs response. When performing EEH testing using the EEH error injection feature available on some platforms there is no simple way to kick-off the kernel's recovery process since any accesses from userspace (usually /dev/mem) will bypass the MMIO helpers in the kernel which check if a 0xFF response is due to an EEH freeze or not. If a device contains a 0xFF byte in it's config space it's possible to trigger the recovery process via config space read from userspace, but this is not a reliable method. If a driver is bound to the device an in use it will frequently trigger the MMIO check, but this is also inconsistent. To solve these problems this patch adds a debugfs file called "eeh_dev_check" which accepts a ::. string and runs eeh_dev_check_failure() on it. This is the same check that's done when the kernel gets a 0xFF result from an config or MMIO read with the added benifit that it can be reliably triggered from userspace. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190903101605.2890-13-oohall@gmail.com --- arch/powerpc/kernel/eeh.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 398def61f8a6..2b3c03215a95 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1871,6 +1871,64 @@ static const struct file_operations eeh_force_recover_fops = { .llseek = no_llseek, .write = eeh_force_recover_write, }; + +static ssize_t eeh_debugfs_dev_usage(struct file *filp, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + static const char usage[] = "input format: ::.\n"; + + return simple_read_from_buffer(user_buf, count, ppos, + usage, sizeof(usage) - 1); +} + +static ssize_t eeh_dev_check_write(struct file *filp, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + uint32_t domain, bus, dev, fn; + struct pci_dev *pdev; + struct eeh_dev *edev; + char buf[20]; + int ret; + + ret = simple_write_to_buffer(buf, sizeof(buf), ppos, user_buf, count); + if (!ret) + return -EFAULT; + + ret = sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &fn); + if (ret != 4) { + pr_err("%s: expected 4 args, got %d\n", __func__, ret); + return -EINVAL; + } + + pdev = pci_get_domain_bus_and_slot(domain, bus, (dev << 3) | fn); + if (!pdev) + return -ENODEV; + + edev = pci_dev_to_eeh_dev(pdev); + if (!edev) { + pci_err(pdev, "No eeh_dev for this device!\n"); + pci_dev_put(pdev); + return -ENODEV; + } + + ret = eeh_dev_check_failure(edev); + pci_info(pdev, "eeh_dev_check_failure(%04x:%02x:%02x.%01x) = %d\n", + domain, bus, dev, fn, ret); + + pci_dev_put(pdev); + + return count; +} + +static const struct file_operations eeh_dev_check_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = eeh_dev_check_write, + .read = eeh_debugfs_dev_usage, +}; + #endif static int __init eeh_init_proc(void) @@ -1886,6 +1944,9 @@ static int __init eeh_init_proc(void) debugfs_create_bool("eeh_disable_recovery", 0600, powerpc_debugfs_root, &eeh_debugfs_no_recover); + debugfs_create_file_unsafe("eeh_dev_check", 0600, + powerpc_debugfs_root, NULL, + &eeh_dev_check_fops); debugfs_create_file_unsafe("eeh_force_recover", 0600, powerpc_debugfs_root, NULL, &eeh_force_recover_fops); -- cgit v1.2.3 From bd6461cc7b3c4fd12dcba4b0e95dfc612df872fd Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Tue, 3 Sep 2019 20:16:04 +1000 Subject: powerpc/eeh: Add a eeh_dev_break debugfs interface Add an interface to debugfs for generating an EEH event on a given device. This works by disabling memory accesses to and from the device by setting the PCI_COMMAND register (or the VF Memory Space Enable on the parent PF). This is a somewhat portable alternative to using the platform specific error injection mechanisms since those tend to be either hard to use, or straight up broken. For pseries the interfaces also requires the use of /dev/mem which is probably going to go away in a post-LOCKDOWN world (and it's a horrific hack to begin with) so moving to a kernel-provided interface makes sense and provides a sane, cross-platform interface for userspace so we can write more generic testing scripts. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190903101605.2890-14-oohall@gmail.com --- arch/powerpc/kernel/eeh.c | 139 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 138 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 2b3c03215a95..0a91dee51245 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1892,7 +1892,8 @@ static ssize_t eeh_dev_check_write(struct file *filp, char buf[20]; int ret; - ret = simple_write_to_buffer(buf, sizeof(buf), ppos, user_buf, count); + memset(buf, 0, sizeof(buf)); + ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count); if (!ret) return -EFAULT; @@ -1929,6 +1930,139 @@ static const struct file_operations eeh_dev_check_fops = { .read = eeh_debugfs_dev_usage, }; +static int eeh_debugfs_break_device(struct pci_dev *pdev) +{ + struct resource *bar = NULL; + void __iomem *mapped; + u16 old, bit; + int i, pos; + + /* Do we have an MMIO BAR to disable? */ + for (i = 0; i <= PCI_STD_RESOURCE_END; i++) { + struct resource *r = &pdev->resource[i]; + + if (!r->flags || !r->start) + continue; + if (r->flags & IORESOURCE_IO) + continue; + if (r->flags & IORESOURCE_UNSET) + continue; + + bar = r; + break; + } + + if (!bar) { + pci_err(pdev, "Unable to find Memory BAR to cause EEH with\n"); + return -ENXIO; + } + + pci_err(pdev, "Going to break: %pR\n", bar); + + if (pdev->is_virtfn) { +#ifndef CONFIG_IOV + return -ENXIO; +#else + /* + * VFs don't have a per-function COMMAND register, so the best + * we can do is clear the Memory Space Enable bit in the PF's + * SRIOV control reg. + * + * Unfortunately, this requires that we have a PF (i.e doesn't + * work for a passed-through VF) and it has the potential side + * effect of also causing an EEH on every other VF under the + * PF. Oh well. + */ + pdev = pdev->physfn; + if (!pdev) + return -ENXIO; /* passed through VFs have no PF */ + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV); + pos += PCI_SRIOV_CTRL; + bit = PCI_SRIOV_CTRL_MSE; +#endif /* !CONFIG_IOV */ + } else { + bit = PCI_COMMAND_MEMORY; + pos = PCI_COMMAND; + } + + /* + * Process here is: + * + * 1. Disable Memory space. + * + * 2. Perform an MMIO to the device. This should result in an error + * (CA / UR) being raised by the device which results in an EEH + * PE freeze. Using the in_8() accessor skips the eeh detection hook + * so the freeze hook so the EEH Detection machinery won't be + * triggered here. This is to match the usual behaviour of EEH + * where the HW will asyncronously freeze a PE and it's up to + * the kernel to notice and deal with it. + * + * 3. Turn Memory space back on. This is more important for VFs + * since recovery will probably fail if we don't. For normal + * the COMMAND register is reset as a part of re-initialising + * the device. + * + * Breaking stuff is the point so who cares if it's racy ;) + */ + pci_read_config_word(pdev, pos, &old); + + mapped = ioremap(bar->start, PAGE_SIZE); + if (!mapped) { + pci_err(pdev, "Unable to map MMIO BAR %pR\n", bar); + return -ENXIO; + } + + pci_write_config_word(pdev, pos, old & ~bit); + in_8(mapped); + pci_write_config_word(pdev, pos, old); + + iounmap(mapped); + + return 0; +} + +static ssize_t eeh_dev_break_write(struct file *filp, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + uint32_t domain, bus, dev, fn; + struct pci_dev *pdev; + char buf[20]; + int ret; + + memset(buf, 0, sizeof(buf)); + ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count); + if (!ret) + return -EFAULT; + + ret = sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &fn); + if (ret != 4) { + pr_err("%s: expected 4 args, got %d\n", __func__, ret); + return -EINVAL; + } + + pdev = pci_get_domain_bus_and_slot(domain, bus, (dev << 3) | fn); + if (!pdev) + return -ENODEV; + + ret = eeh_debugfs_break_device(pdev); + pci_dev_put(pdev); + + if (ret < 0) + return ret; + + return count; +} + +static const struct file_operations eeh_dev_break_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = eeh_dev_break_write, + .read = eeh_debugfs_dev_usage, +}; + #endif static int __init eeh_init_proc(void) @@ -1947,6 +2081,9 @@ static int __init eeh_init_proc(void) debugfs_create_file_unsafe("eeh_dev_check", 0600, powerpc_debugfs_root, NULL, &eeh_dev_check_fops); + debugfs_create_file_unsafe("eeh_dev_break", 0600, + powerpc_debugfs_root, NULL, + &eeh_dev_break_fops); debugfs_create_file_unsafe("eeh_force_recover", 0600, powerpc_debugfs_root, NULL, &eeh_force_recover_fops); -- cgit v1.2.3 From 85d86c8aa52eb5b3539eebe3adcc2f077118b412 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Tue, 3 Sep 2019 20:16:05 +1000 Subject: selftests/powerpc: Add basic EEH selftest Use the new eeh_dev_check and eeh_dev_break interfaces to test EEH recovery. Historically this has been done manually using platform specific EEH error injection facilities (e.g. via RTAS). However, documentation on how to use these facilities is haphazard at best and non-existent at worst so it's hard to develop a cross-platform test. The new debugfs interfaces allow the kernel to handle the platform specific details so we can write a more generic set of sets. This patch adds the most basic of recovery tests where: a) Errors are injected and recovered from sequentially, b) Errors are not injected into PCI-PCI bridges, such as PCIe switches. c) Errors are only injected into device function zero. d) No errors are injected into Virtual Functions. a), b) and c) are largely due to limitations of Linux's EEH support. EEH recovery is serialised in the EEH recovery thread which forces a). Similarly, multi-function PCI devices are almost always grouped into the same PE so injecting an error on one function exercises the same code paths. c) is because we currently more or less ignore PCI bridges during recovery and assume that the recovered topology will be the same as the original. d) is due to the limits of the eeh_dev_break interface. With the current implementation we can't inject an error into a specific VF without potentially causing additional errors on other VFs. Due to the serialised recovery process we might end up timing out waiting for another function to recover before the function of interest is recovered. The platform specific error injection facilities are finer-grained and allow this capability, but doing that requires working out how to use those facilities first. Basicly, it's better than nothing and it's a base to build on. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190903101605.2890-15-oohall@gmail.com --- tools/testing/selftests/powerpc/Makefile | 1 + tools/testing/selftests/powerpc/eeh/Makefile | 9 +++ tools/testing/selftests/powerpc/eeh/eeh-basic.sh | 82 ++++++++++++++++++++++ .../testing/selftests/powerpc/eeh/eeh-functions.sh | 76 ++++++++++++++++++++ 4 files changed, 168 insertions(+) create mode 100644 tools/testing/selftests/powerpc/eeh/Makefile create mode 100755 tools/testing/selftests/powerpc/eeh/eeh-basic.sh create mode 100755 tools/testing/selftests/powerpc/eeh/eeh-functions.sh diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index b3ad909aefbc..644770c3b754 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -26,6 +26,7 @@ SUB_DIRS = alignment \ switch_endian \ syscalls \ tm \ + eeh \ vphn \ math \ ptrace \ diff --git a/tools/testing/selftests/powerpc/eeh/Makefile b/tools/testing/selftests/powerpc/eeh/Makefile new file mode 100644 index 000000000000..b397babd569b --- /dev/null +++ b/tools/testing/selftests/powerpc/eeh/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 +noarg: + $(MAKE) -C ../ + +TEST_PROGS := eeh-basic.sh +TEST_FILES := eeh-functions.sh + +top_srcdir = ../../../../.. +include ../../lib.mk diff --git a/tools/testing/selftests/powerpc/eeh/eeh-basic.sh b/tools/testing/selftests/powerpc/eeh/eeh-basic.sh new file mode 100755 index 000000000000..f988d2f42e8f --- /dev/null +++ b/tools/testing/selftests/powerpc/eeh/eeh-basic.sh @@ -0,0 +1,82 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-only + +. ./eeh-functions.sh + +if ! eeh_supported ; then + echo "EEH not supported on this system, skipping" + exit 0; +fi + +if [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_check" ] && \ + [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_break" ] ; then + echo "debugfs EEH testing files are missing. Is debugfs mounted?" + exit 1; +fi + +pre_lspci=`mktemp` +lspci > $pre_lspci + +# Bump the max freeze count to something absurd so we don't +# trip over it while breaking things. +echo 5000 > /sys/kernel/debug/powerpc/eeh_max_freezes + +# record the devices that we break in here. Assuming everything +# goes to plan we should get them back once the recover process +# is finished. +devices="" + +# Build up a list of candidate devices. +for dev in `ls -1 /sys/bus/pci/devices/ | grep '\.0$'` ; do + # skip bridges since we can't recover them (yet...) + if [ -e "/sys/bus/pci/devices/$dev/pci_bus" ] ; then + echo "$dev, Skipped: bridge" + continue; + fi + + # Skip VFs for now since we don't have a reliable way + # to break them. + if [ -e "/sys/bus/pci/devices/$dev/physfn" ] ; then + echo "$dev, Skipped: virtfn" + continue; + fi + + # Don't inject errosr into an already-frozen PE. This happens with + # PEs that contain multiple PCI devices (e.g. multi-function cards) + # and injecting new errors during the recovery process will probably + # result in the recovery failing and the device being marked as + # failed. + if ! pe_ok $dev ; then + echo "$dev, Skipped: Bad initial PE state" + continue; + fi + + echo "$dev, Added" + + # Add to this list of device to check + devices="$devices $dev" +done + +dev_count="$(echo $devices | wc -w)" +echo "Found ${dev_count} breakable devices..." + +failed=0 +for dev in $devices ; do + echo "Breaking $dev..." + + if ! pe_ok $dev ; then + echo "Skipping $dev, Initial PE state is not ok" + failed="$((failed + 1))" + continue; + fi + + if ! eeh_one_dev $dev ; then + failed="$((failed + 1))" + fi +done + +echo "$failed devices failed to recover ($dev_count tested)" +lspci | diff -u $pre_lspci - +rm -f $pre_lspci + +exit $failed diff --git a/tools/testing/selftests/powerpc/eeh/eeh-functions.sh b/tools/testing/selftests/powerpc/eeh/eeh-functions.sh new file mode 100755 index 000000000000..26112ab5cdf4 --- /dev/null +++ b/tools/testing/selftests/powerpc/eeh/eeh-functions.sh @@ -0,0 +1,76 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-only + +pe_ok() { + local dev="$1" + local path="/sys/bus/pci/devices/$dev/eeh_pe_state" + + if ! [ -e "$path" ] ; then + return 1; + fi + + local fw_state="$(cut -d' ' -f1 < $path)" + local sw_state="$(cut -d' ' -f2 < $path)" + + # If EEH_PE_ISOLATED or EEH_PE_RECOVERING are set then the PE is in an + # error state or being recovered. Either way, not ok. + if [ "$((sw_state & 0x3))" -ne 0 ] ; then + return 1 + fi + + # A functioning PE should have the EEH_STATE_MMIO_ACTIVE and + # EEH_STATE_DMA_ACTIVE flags set. For some goddamn stupid reason + # the platform backends set these when the PE is in reset. The + # RECOVERING check above should stop any false positives though. + if [ "$((fw_state & 0x18))" -ne "$((0x18))" ] ; then + return 1 + fi + + return 0; +} + +eeh_supported() { + test -e /proc/powerpc/eeh && \ + grep -q 'EEH Subsystem is enabled' /proc/powerpc/eeh +} + +eeh_one_dev() { + local dev="$1" + + # Using this function from the command line is sometimes useful for + # testing so check that the argument is a well-formed sysfs device + # name. + if ! test -e /sys/bus/pci/devices/$dev/ ; then + echo "Error: '$dev' must be a sysfs device name (DDDD:BB:DD.F)" + return 1; + fi + + # Break it + echo $dev >/sys/kernel/debug/powerpc/eeh_dev_break + + # Force an EEH device check. If the kernel has already + # noticed the EEH (due to a driver poll or whatever), this + # is a no-op. + echo $dev >/sys/kernel/debug/powerpc/eeh_dev_check + + # Enforce a 30s timeout for recovery. Even the IPR, which is infamously + # slow to reset, should recover within 30s. + max_wait=30 + + for i in `seq 0 ${max_wait}` ; do + if pe_ok $dev ; then + break; + fi + echo "$dev, waited $i/${max_wait}" + sleep 1 + done + + if ! pe_ok $dev ; then + echo "$dev, Failed to recover!" + return 1; + fi + + echo "$dev, Recovered after $i seconds" + return 0; +} + -- cgit v1.2.3 From ed6546bdc61b7c4bd926cebd82ba52d056fcefa1 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 3 Sep 2019 01:29:26 +1000 Subject: powerpc/64s: remove register_process_table callback This callback is only required because the partition table init comes before process table allocation on powernv (aka bare metal aka native). Change the order to allocate the process table first, and remove the callback. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190902152931.17840-2-npiggin@gmail.com --- arch/powerpc/include/asm/book3s/64/mmu.h | 4 --- arch/powerpc/mm/book3s64/hash_utils.c | 6 ----- arch/powerpc/mm/book3s64/pgtable.c | 3 --- arch/powerpc/mm/book3s64/radix_pgtable.c | 45 +++++++++----------------------- arch/powerpc/platforms/pseries/lpar.c | 17 ++++++++++-- 5 files changed, 27 insertions(+), 48 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 23b83d3593e2..bb3deb76c951 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -206,7 +206,6 @@ extern int mmu_io_psize; void mmu_early_init_devtree(void); void hash__early_init_devtree(void); void radix__early_init_devtree(void); -extern void radix_init_native(void); extern void hash__early_init_mmu(void); extern void radix__early_init_mmu(void); static inline void early_init_mmu(void) @@ -238,9 +237,6 @@ static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base, first_memblock_size); } -extern int (*register_process_table)(unsigned long base, unsigned long page_size, - unsigned long tbl_size); - #ifdef CONFIG_PPC_PSERIES extern void radix_init_pseries(void); #else diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index fe99bba39b69..7aed27ea5361 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -859,12 +859,6 @@ static void __init htab_initialize(void) /* Using a hypervisor which owns the htab */ htab_address = NULL; _SDR1 = 0; - /* - * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall - * to inform the hypervisor that we wish to use the HPT. - */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) - register_process_table(0, 0, 0); #ifdef CONFIG_FA_DUMP /* * If firmware assisted dump is active firmware preserves diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 206b43ae4000..97f3be778c79 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -23,9 +23,6 @@ EXPORT_SYMBOL(__pmd_frag_nr); unsigned long __pmd_frag_size_shift; EXPORT_SYMBOL(__pmd_frag_size_shift); -int (*register_process_table)(unsigned long base, unsigned long page_size, - unsigned long tbl_size); - #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * This is called when relaxing access to a hugepage. It's also called in the page diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 71b649473045..83fa7864e8f4 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -34,19 +34,6 @@ unsigned int mmu_pid_bits; unsigned int mmu_base_pid; -static int native_register_process_table(unsigned long base, unsigned long pg_sz, - unsigned long table_size) -{ - unsigned long patb0, patb1; - - patb0 = be64_to_cpu(partition_tb[0].patb0); - patb1 = base | table_size | PATB_GR; - - mmu_partition_table_set_entry(0, patb0, patb1); - - return 0; -} - static __ref void *early_alloc_pgtable(unsigned long size, int nid, unsigned long region_start, unsigned long region_end) { @@ -381,18 +368,8 @@ static void __init radix_init_pgtable(void) */ rts_field = radix__get_tree_size(); process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE); - /* - * Fill in the partition table. We are suppose to use effective address - * of process table here. But our linear mapping also enable us to use - * physical address here. - */ - register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12); + pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd); - asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); - trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1); /* * The init_mm context is given the first available (non-zero) PID, @@ -413,22 +390,24 @@ static void __init radix_init_pgtable(void) static void __init radix_init_partition_table(void) { - unsigned long rts_field, dw0; + unsigned long rts_field, dw0, dw1; mmu_partition_table_init(); rts_field = radix__get_tree_size(); dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR; - mmu_partition_table_set_entry(0, dw0, 0); + dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR; + mmu_partition_table_set_entry(0, dw0, dw1); + + asm volatile("ptesync" : : : "memory"); + asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); + trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1); pr_info("Initializing Radix MMU\n"); pr_info("Partition table %p\n", partition_tb); } -void __init radix_init_native(void) -{ - register_process_table = native_register_process_table; -} - static int __init get_idx_from_shift(unsigned int shift) { int idx = -1; @@ -622,8 +601,9 @@ void __init radix__early_init_mmu(void) __pmd_frag_nr = RADIX_PMD_FRAG_NR; __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT; + radix_init_pgtable(); + if (!firmware_has_feature(FW_FEATURE_LPAR)) { - radix_init_native(); lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); radix_init_partition_table(); @@ -634,7 +614,6 @@ void __init radix__early_init_mmu(void) memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); - radix_init_pgtable(); /* Switch to the guard PID before turning on MMU */ radix__switch_mmu_context(NULL, &init_mm); if (cpu_has_feature(CPU_FTR_HVMODE)) diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 4f76e5f30c97..b3205a6c950c 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -1531,16 +1531,29 @@ void __init hpte_init_pseries(void) mmu_hash_ops.flush_hash_range = pSeries_lpar_flush_hash_range; mmu_hash_ops.hpte_clear_all = pseries_hpte_clear_all; mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate; - register_process_table = pseries_lpar_register_process_table; if (firmware_has_feature(FW_FEATURE_HPT_RESIZE)) mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt; + + /* + * On POWER9, we need to do a H_REGISTER_PROC_TBL hcall + * to inform the hypervisor that we wish to use the HPT. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + pseries_lpar_register_process_table(0, 0, 0); } void radix_init_pseries(void) { pr_info("Using radix MMU under hypervisor\n"); - register_process_table = pseries_lpar_register_process_table; + + pseries_lpar_register_process_table(__pa(process_tb), + 0, PRTB_SIZE_SHIFT - 12); + asm volatile("ptesync" : : : "memory"); + asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); + trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1); } #ifdef CONFIG_PPC_SMLPAR -- cgit v1.2.3 From 99161de3a283af59f2813da6cbdccc1d2784c7de Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 3 Sep 2019 01:29:27 +1000 Subject: powerpc/64s/radix: tidy up TLB flushing code There should be no functional changes. - Use calls to existing radix_tlb.c functions in flush_partition. - Rename radix__flush_tlb_lpid to radix__flush_all_lpid and similar, because they flush everything, matching flush_all_mm rather than flush_tlb_mm for the lpid. - Remove some unused radix_tlb.c flush primitives. Signed-off: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190902152931.17840-3-npiggin@gmail.com --- .../powerpc/include/asm/book3s/64/tlbflush-radix.h | 12 +-- arch/powerpc/kvm/book3s_hv_nested.c | 2 +- arch/powerpc/mm/book3s64/pgtable.c | 13 +-- arch/powerpc/mm/book3s64/radix_tlb.c | 117 +++++---------------- 4 files changed, 34 insertions(+), 110 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h index 05147cecb8df..4ce795d30377 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h @@ -17,8 +17,8 @@ extern void radix__flush_tlb_lpid_page(unsigned int lpid, unsigned long addr, unsigned long page_size); extern void radix__flush_pwc_lpid(unsigned int lpid); -extern void radix__flush_tlb_lpid(unsigned int lpid); -extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid); +extern void radix__flush_all_lpid(unsigned int lpid); +extern void radix__flush_all_lpid_guest(unsigned int lpid); #else static inline void radix__tlbiel_all(unsigned int action) { WARN_ON(1); }; static inline void radix__flush_tlb_lpid_page(unsigned int lpid, @@ -31,11 +31,7 @@ static inline void radix__flush_pwc_lpid(unsigned int lpid) { WARN_ON(1); } -static inline void radix__flush_tlb_lpid(unsigned int lpid) -{ - WARN_ON(1); -} -static inline void radix__local_flush_tlb_lpid_guest(unsigned int lpid) +static inline void radix__flush_all_lpid(unsigned int lpid) { WARN_ON(1); } @@ -73,6 +69,4 @@ extern void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr); extern void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr); extern void radix__flush_tlb_all(void); -extern void radix__local_flush_tlb_lpid(unsigned int lpid); - #endif diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 735e0ac6f5b2..b3316da2f13e 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -398,7 +398,7 @@ static void kvmhv_flush_lpid(unsigned int lpid) long rc; if (!kvmhv_on_pseries()) { - radix__flush_tlb_lpid(lpid); + radix__flush_all_lpid(lpid); return; } diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 97f3be778c79..c2b87c5ba50b 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -210,20 +210,17 @@ void __init mmu_partition_table_init(void) static void flush_partition(unsigned int lpid, bool radix) { - asm volatile("ptesync" : : : "memory"); if (radix) { - asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); - trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1); + radix__flush_all_lpid(lpid); + radix__flush_all_lpid_guest(lpid); } else { + asm volatile("ptesync" : : : "memory"); asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + /* do we need fixup here ?*/ + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0); } - /* do we need fixup here ?*/ - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); } void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 71f7fede2fa4..082f90d068ee 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -116,22 +116,6 @@ static __always_inline void __tlbie_pid(unsigned long pid, unsigned long ric) trace_tlbie(0, 0, rb, rs, ric, prs, r); } -static __always_inline void __tlbiel_lpid(unsigned long lpid, int set, - unsigned long ric) -{ - unsigned long rb,rs,prs,r; - - rb = PPC_BIT(52); /* IS = 2 */ - rb |= set << PPC_BITLSHIFT(51); - rs = 0; /* LPID comes from LPIDR */ - prs = 0; /* partition scoped */ - r = 1; /* radix format */ - - asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(lpid, 1, rb, rs, ric, prs, r); -} - static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -146,23 +130,20 @@ static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric) trace_tlbie(lpid, 0, rb, rs, ric, prs, r); } -static __always_inline void __tlbiel_lpid_guest(unsigned long lpid, int set, - unsigned long ric) +static __always_inline void __tlbie_lpid_guest(unsigned long lpid, unsigned long ric) { unsigned long rb,rs,prs,r; rb = PPC_BIT(52); /* IS = 2 */ - rb |= set << PPC_BITLSHIFT(51); - rs = 0; /* LPID comes from LPIDR */ + rs = lpid; prs = 1; /* process scoped */ r = 1; /* radix format */ - asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - trace_tlbie(lpid, 1, rb, rs, ric, prs, r); + trace_tlbie(lpid, 0, rb, rs, ric, prs, r); } - static __always_inline void __tlbiel_va(unsigned long va, unsigned long pid, unsigned long ap, unsigned long ric) { @@ -285,34 +266,6 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) asm volatile("eieio; tlbsync; ptesync": : :"memory"); } -static inline void _tlbiel_lpid(unsigned long lpid, unsigned long ric) -{ - int set; - - VM_BUG_ON(mfspr(SPRN_LPID) != lpid); - - asm volatile("ptesync": : :"memory"); - - /* - * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, - * also flush the entire Page Walk Cache. - */ - __tlbiel_lpid(lpid, 0, ric); - - /* For PWC, only one flush is needed */ - if (ric == RIC_FLUSH_PWC) { - asm volatile("ptesync": : :"memory"); - return; - } - - /* For the remaining sets, just flush the TLB */ - for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) - __tlbiel_lpid(lpid, set, RIC_FLUSH_TLB); - - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_RADIX_INVALIDATE_ERAT_GUEST "; isync" : : :"memory"); -} - static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric) { asm volatile("ptesync": : :"memory"); @@ -337,35 +290,28 @@ static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric) asm volatile("eieio; tlbsync; ptesync": : :"memory"); } -static __always_inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric) +static __always_inline void _tlbie_lpid_guest(unsigned long lpid, unsigned long ric) { - int set; - - VM_BUG_ON(mfspr(SPRN_LPID) != lpid); - - asm volatile("ptesync": : :"memory"); - /* - * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL, - * also flush the entire Page Walk Cache. + * Workaround the fact that the "ric" argument to __tlbie_pid + * must be a compile-time contraint to match the "i" constraint + * in the asm statement. */ - __tlbiel_lpid_guest(lpid, 0, ric); - - /* For PWC, only one flush is needed */ - if (ric == RIC_FLUSH_PWC) { - asm volatile("ptesync": : :"memory"); - return; + switch (ric) { + case RIC_FLUSH_TLB: + __tlbie_lpid_guest(lpid, RIC_FLUSH_TLB); + break; + case RIC_FLUSH_PWC: + __tlbie_lpid_guest(lpid, RIC_FLUSH_PWC); + break; + case RIC_FLUSH_ALL: + default: + __tlbie_lpid_guest(lpid, RIC_FLUSH_ALL); } - - /* For the remaining sets, just flush the TLB */ - for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) - __tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB); - - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_RADIX_INVALIDATE_ERAT_GUEST : : :"memory"); + fixup_tlbie_lpid(lpid); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); } - static inline void __tlbiel_va_range(unsigned long start, unsigned long end, unsigned long pid, unsigned long page_size, unsigned long psize) @@ -835,32 +781,19 @@ EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid); /* * Flush partition scoped translations from LPID (=LPIDR) */ -void radix__flush_tlb_lpid(unsigned int lpid) +void radix__flush_all_lpid(unsigned int lpid) { _tlbie_lpid(lpid, RIC_FLUSH_ALL); } -EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid); +EXPORT_SYMBOL_GPL(radix__flush_all_lpid); /* - * Flush partition scoped translations from LPID (=LPIDR) + * Flush process scoped translations from LPID (=LPIDR) */ -void radix__local_flush_tlb_lpid(unsigned int lpid) +void radix__flush_all_lpid_guest(unsigned int lpid) { - _tlbiel_lpid(lpid, RIC_FLUSH_ALL); + _tlbie_lpid_guest(lpid, RIC_FLUSH_ALL); } -EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid); - -/* - * Flush process scoped translations from LPID (=LPIDR). - * Important difference, the guest normally manages its own translations, - * but some cases e.g., vCPU CPU migration require KVM to flush. - */ -void radix__local_flush_tlb_lpid_guest(unsigned int lpid) -{ - _tlbiel_lpid_guest(lpid, RIC_FLUSH_ALL); -} -EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid_guest); - static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, unsigned long end, int psize); -- cgit v1.2.3 From fd13daea5f72605a0a7386ebedbb5ff2b2a48da4 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 3 Sep 2019 01:29:28 +1000 Subject: powerpc/64s: make mmu_partition_table_set_entry TLB flush optional No functional change. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190902152931.17840-4-npiggin@gmail.com --- arch/powerpc/include/asm/mmu.h | 2 +- arch/powerpc/kvm/book3s_hv_nested.c | 2 +- arch/powerpc/mm/book3s64/hash_utils.c | 2 +- arch/powerpc/mm/book3s64/pgtable.c | 4 ++-- arch/powerpc/mm/book3s64/radix_pgtable.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index ba94ce8c22d7..0699cfeeb8c9 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -257,7 +257,7 @@ extern void radix__mmu_cleanup_all(void); /* Functions for creating and updating partition table on POWER9 */ extern void mmu_partition_table_init(void); extern void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, - unsigned long dw1); + unsigned long dw1, bool flush); #endif /* CONFIG_PPC64 */ struct mm_struct; diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index b3316da2f13e..fff90f2c3de2 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -411,7 +411,7 @@ static void kvmhv_flush_lpid(unsigned int lpid) void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1) { if (!kvmhv_on_pseries()) { - mmu_partition_table_set_entry(lpid, dw0, dw1); + mmu_partition_table_set_entry(lpid, dw0, dw1, true); return; } diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 7aed27ea5361..b73d08b54d12 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -825,7 +825,7 @@ static void __init hash_init_partition_table(phys_addr_t hash_table, * For now, UPRT is 0 and we have no segment table. */ htab_size = __ilog2(htab_size) - 18; - mmu_partition_table_set_entry(0, hash_table | htab_size, 0); + mmu_partition_table_set_entry(0, hash_table | htab_size, 0, true); pr_info("Partition table %p\n", partition_tb); } diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index c2b87c5ba50b..6fab9c0bbbaf 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -224,7 +224,7 @@ static void flush_partition(unsigned int lpid, bool radix) } void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, - unsigned long dw1) + unsigned long dw1, bool flush) { unsigned long old = be64_to_cpu(partition_tb[lpid].patb0); @@ -251,7 +251,7 @@ void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, uv_register_pate(lpid, dw0, dw1); pr_info("PATE registered by ultravisor: dw0 = 0x%lx, dw1 = 0x%lx\n", dw0, dw1); - } else { + } else if (flush) { flush_partition(lpid, (old & PATB_HR)); } } diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 83fa7864e8f4..078a7eeec1f5 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -396,7 +396,7 @@ static void __init radix_init_partition_table(void) rts_field = radix__get_tree_size(); dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR; dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR; - mmu_partition_table_set_entry(0, dw0, dw1); + mmu_partition_table_set_entry(0, dw0, dw1, true); asm volatile("ptesync" : : : "memory"); asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : -- cgit v1.2.3 From 7e71c428a60e2029585be7d7cc22775f442e5b2c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 3 Sep 2019 01:29:29 +1000 Subject: powerpc/64s/pseries: radix flush translations before MMU is enabled at boot Radix guests are responsible for managing their own translation caches, so make them match bare metal radix and hash, and make each CPU flush all its translations right before enabling its MMU. Radix guests may not flush partition scope translations, so in tlbiel_all, make these flushes conditional on CPU_FTR_HVMODE. Process scope translations are the only type visible to the guest. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190902152931.17840-5-npiggin@gmail.com --- arch/powerpc/mm/book3s64/radix_pgtable.c | 6 ++---- arch/powerpc/mm/book3s64/radix_tlb.c | 12 ++++++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 078a7eeec1f5..e1e711c4704a 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -616,8 +616,7 @@ void __init radix__early_init_mmu(void) /* Switch to the guard PID before turning on MMU */ radix__switch_mmu_context(NULL, &init_mm); - if (cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_all(); + tlbiel_all(); } void radix__early_init_mmu_secondary(void) @@ -637,8 +636,7 @@ void radix__early_init_mmu_secondary(void) } radix__switch_mmu_context(NULL, &init_mm); - if (cpu_has_feature(CPU_FTR_HVMODE)) - tlbiel_all(); + tlbiel_all(); } void radix__mmu_cleanup_all(void) diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 082f90d068ee..f9cf8ae59831 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -51,11 +51,15 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) * and partition table entries. Then flush the remaining sets of the * TLB. */ - tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); - for (set = 1; set < num_sets; set++) - tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); - /* Do the same for process scoped entries. */ + if (early_cpu_has_feature(CPU_FTR_HVMODE)) { + /* MSR[HV] should flush partition scope translations first. */ + tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0); + for (set = 1; set < num_sets; set++) + tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0); + } + + /* Flush process scoped entries. */ tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1); for (set = 1; set < num_sets; set++) tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1); -- cgit v1.2.3 From 7d805accbec57a151bd0dd305a1109feebdfd4a4 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 3 Sep 2019 01:29:30 +1000 Subject: powerpc/64s: remove unnecessary translation cache flushes at boot The various translation structure invalidations performed in early boot when the MMU is off are not required, because everything is invalidated immediately before a CPU first enables its MMU (see early_init_mmu and early_init_mmu_secondary). Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190902152931.17840-6-npiggin@gmail.com --- arch/powerpc/mm/book3s64/hash_utils.c | 2 +- arch/powerpc/mm/book3s64/pgtable.c | 5 +++++ arch/powerpc/mm/book3s64/radix_pgtable.c | 8 +------- arch/powerpc/platforms/pseries/lpar.c | 5 ----- 4 files changed, 7 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index b73d08b54d12..7684a596158b 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -825,7 +825,7 @@ static void __init hash_init_partition_table(phys_addr_t hash_table, * For now, UPRT is 0 and we have no segment table. */ htab_size = __ilog2(htab_size) - 18; - mmu_partition_table_set_entry(0, hash_table | htab_size, 0, true); + mmu_partition_table_set_entry(0, hash_table | htab_size, 0, false); pr_info("Partition table %p\n", partition_tb); } diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 6fab9c0bbbaf..351eb78eed55 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -252,6 +252,11 @@ void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, pr_info("PATE registered by ultravisor: dw0 = 0x%lx, dw1 = 0x%lx\n", dw0, dw1); } else if (flush) { + /* + * Boot does not need to flush, because MMU is off and each + * CPU does a tlbiel_all() before switching them on, which + * flushes everything. + */ flush_partition(lpid, (old & PATB_HR)); } } diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index e1e711c4704a..0d1107fb34c1 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -396,13 +396,7 @@ static void __init radix_init_partition_table(void) rts_field = radix__get_tree_size(); dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR; dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR; - mmu_partition_table_set_entry(0, dw0, dw1, true); - - asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); - trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1); + mmu_partition_table_set_entry(0, dw0, dw1, false); pr_info("Initializing Radix MMU\n"); pr_info("Partition table %p\n", partition_tb); diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index b3205a6c950c..36b846f6e74e 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -1549,11 +1549,6 @@ void radix_init_pseries(void) pseries_lpar_register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12); - asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (0)); - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); - trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1); } #ifdef CONFIG_PPC_SMLPAR -- cgit v1.2.3 From 2275d7b5754a573ffb2ca9e40bd0546eeb986696 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 3 Sep 2019 01:29:31 +1000 Subject: powerpc/64s/radix: introduce options to disable use of the tlbie instruction Introduce two options to control the use of the tlbie instruction. A boot time option which completely disables the kernel using the instruction, this is currently incompatible with HASH MMU, KVM, and coherent accelerators. And a debugfs option can be switched at runtime and avoids using tlbie for invalidating CPU TLBs for normal process and kernel address mappings. Coherent accelerators are still managed with tlbie, as will KVM partition scope translations. Cross-CPU TLB flushing is implemented with IPIs and tlbiel. This is a basic implementation which does not attempt to make any optimisation beyond the tlbie implementation. This is useful for performance testing among other things. For example in certain situations on large systems, using IPIs may be faster than tlbie as they can be directed rather than broadcast. Later we may also take advantage of the IPIs to do more interesting things such as trim the mm cpumask more aggressively. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190902152931.17840-7-npiggin@gmail.com --- Documentation/admin-guide/kernel-parameters.txt | 4 + arch/powerpc/include/asm/book3s/64/tlbflush.h | 9 ++ arch/powerpc/kvm/book3s_hv.c | 6 + arch/powerpc/mm/book3s64/pgtable.c | 47 ++++++ arch/powerpc/mm/book3s64/radix_tlb.c | 190 +++++++++++++++++++++--- drivers/misc/cxl/main.c | 4 + drivers/misc/ocxl/main.c | 4 + 7 files changed, 246 insertions(+), 18 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 4923d8f726e8..3cd757f9feaa 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -860,6 +860,10 @@ disable_radix [PPC] Disable RADIX MMU mode on POWER9 + disable_tlbie [PPC] + Disable TLBIE instruction. Currently does not work + with KVM, with HASH MMU, or with coherent accelerators. + disable_cpu_apicid= [X86,APIC,SMP] Format: The number of initial APIC ID for the diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index ebf572ea621e..7aa8195b6cff 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -162,4 +162,13 @@ static inline void flush_tlb_pgtable(struct mmu_gather *tlb, unsigned long addre radix__flush_tlb_pwc(tlb, address); } + +extern bool tlbie_capable; +extern bool tlbie_enabled; + +static inline bool cputlb_use_tlbie(void) +{ + return tlbie_enabled; +} + #endif /* _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index cde3f5a4b3e4..3cdaa2a09a19 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -5462,6 +5462,12 @@ static int kvmppc_radix_possible(void) static int kvmppc_book3s_init_hv(void) { int r; + + if (!tlbie_capable) { + pr_err("KVM-HV: Host does not support TLBIE\n"); + return -ENODEV; + } + /* * FIXME!! Do we need to check on all cpus ? */ diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 351eb78eed55..75483b40fcb1 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -469,3 +470,49 @@ int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, return true; } + +/* + * Does the CPU support tlbie? + */ +bool tlbie_capable __read_mostly = true; +EXPORT_SYMBOL(tlbie_capable); + +/* + * Should tlbie be used for management of CPU TLBs, for kernel and process + * address spaces? tlbie may still be used for nMMU accelerators, and for KVM + * guest address spaces. + */ +bool tlbie_enabled __read_mostly = true; + +static int __init setup_disable_tlbie(char *str) +{ + if (!radix_enabled()) { + pr_err("disable_tlbie: Unable to disable TLBIE with Hash MMU.\n"); + return 1; + } + + tlbie_capable = false; + tlbie_enabled = false; + + return 1; +} +__setup("disable_tlbie", setup_disable_tlbie); + +static int __init pgtable_debugfs_setup(void) +{ + if (!tlbie_capable) + return 0; + + /* + * There is no locking vs tlb flushing when changing this value. + * The tlb flushers will see one value or another, and use either + * tlbie or tlbiel with IPIs. In both cases the TLBs will be + * invalidated as expected. + */ + debugfs_create_bool("tlbie_enabled", 0600, + powerpc_debugfs_root, + &tlbie_enabled); + + return 0; +} +arch_initcall(pgtable_debugfs_setup); diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index f9cf8ae59831..631be42abd33 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -270,6 +270,39 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) asm volatile("eieio; tlbsync; ptesync": : :"memory"); } +struct tlbiel_pid { + unsigned long pid; + unsigned long ric; +}; + +static void do_tlbiel_pid(void *info) +{ + struct tlbiel_pid *t = info; + + if (t->ric == RIC_FLUSH_TLB) + _tlbiel_pid(t->pid, RIC_FLUSH_TLB); + else if (t->ric == RIC_FLUSH_PWC) + _tlbiel_pid(t->pid, RIC_FLUSH_PWC); + else + _tlbiel_pid(t->pid, RIC_FLUSH_ALL); +} + +static inline void _tlbiel_pid_multicast(struct mm_struct *mm, + unsigned long pid, unsigned long ric) +{ + struct cpumask *cpus = mm_cpumask(mm); + struct tlbiel_pid t = { .pid = pid, .ric = ric }; + + on_each_cpu_mask(cpus, do_tlbiel_pid, &t, 1); + /* + * Always want the CPU translations to be invalidated with tlbiel in + * these paths, so while coprocessors must use tlbie, we can not + * optimise away the tlbiel component. + */ + if (atomic_read(&mm->context.copros) > 0) + _tlbie_pid(pid, RIC_FLUSH_ALL); +} + static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric) { asm volatile("ptesync": : :"memory"); @@ -370,6 +403,53 @@ static __always_inline void _tlbie_va(unsigned long va, unsigned long pid, asm volatile("eieio; tlbsync; ptesync": : :"memory"); } +struct tlbiel_va { + unsigned long pid; + unsigned long va; + unsigned long psize; + unsigned long ric; +}; + +static void do_tlbiel_va(void *info) +{ + struct tlbiel_va *t = info; + + if (t->ric == RIC_FLUSH_TLB) + _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_TLB); + else if (t->ric == RIC_FLUSH_PWC) + _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_PWC); + else + _tlbiel_va(t->va, t->pid, t->psize, RIC_FLUSH_ALL); +} + +static inline void _tlbiel_va_multicast(struct mm_struct *mm, + unsigned long va, unsigned long pid, + unsigned long psize, unsigned long ric) +{ + struct cpumask *cpus = mm_cpumask(mm); + struct tlbiel_va t = { .va = va, .pid = pid, .psize = psize, .ric = ric }; + on_each_cpu_mask(cpus, do_tlbiel_va, &t, 1); + if (atomic_read(&mm->context.copros) > 0) + _tlbie_va(va, pid, psize, RIC_FLUSH_TLB); +} + +struct tlbiel_va_range { + unsigned long pid; + unsigned long start; + unsigned long end; + unsigned long page_size; + unsigned long psize; + bool also_pwc; +}; + +static void do_tlbiel_va_range(void *info) +{ + struct tlbiel_va_range *t = info; + + _tlbiel_va_range(t->start, t->end, t->pid, t->page_size, + t->psize, t->also_pwc); +} + static __always_inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid, unsigned long psize, unsigned long ric) { @@ -393,6 +473,21 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end, asm volatile("eieio; tlbsync; ptesync": : :"memory"); } +static inline void _tlbiel_va_range_multicast(struct mm_struct *mm, + unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize, bool also_pwc) +{ + struct cpumask *cpus = mm_cpumask(mm); + struct tlbiel_va_range t = { .start = start, .end = end, + .pid = pid, .page_size = page_size, + .psize = psize, .also_pwc = also_pwc }; + + on_each_cpu_mask(cpus, do_tlbiel_va_range, &t, 1); + if (atomic_read(&mm->context.copros) > 0) + _tlbie_va_range(start, end, pid, page_size, psize, also_pwc); +} + /* * Base TLB flushing operations: * @@ -530,10 +625,14 @@ void radix__flush_tlb_mm(struct mm_struct *mm) goto local; } - if (mm_needs_flush_escalation(mm)) - _tlbie_pid(pid, RIC_FLUSH_ALL); - else - _tlbie_pid(pid, RIC_FLUSH_TLB); + if (cputlb_use_tlbie()) { + if (mm_needs_flush_escalation(mm)) + _tlbie_pid(pid, RIC_FLUSH_ALL); + else + _tlbie_pid(pid, RIC_FLUSH_TLB); + } else { + _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB); + } } else { local: _tlbiel_pid(pid, RIC_FLUSH_TLB); @@ -559,7 +658,10 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm) goto local; } } - _tlbie_pid(pid, RIC_FLUSH_ALL); + if (cputlb_use_tlbie()) + _tlbie_pid(pid, RIC_FLUSH_ALL); + else + _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL); } else { local: _tlbiel_pid(pid, RIC_FLUSH_ALL); @@ -594,7 +696,10 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, exit_flush_lazy_tlbs(mm); goto local; } - _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + if (cputlb_use_tlbie()) + _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); + else + _tlbiel_va_multicast(mm, vmaddr, pid, psize, RIC_FLUSH_TLB); } else { local: _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); @@ -616,6 +721,24 @@ EXPORT_SYMBOL(radix__flush_tlb_page); #define radix__flush_all_mm radix__local_flush_all_mm #endif /* CONFIG_SMP */ +static void do_tlbiel_kernel(void *info) +{ + _tlbiel_pid(0, RIC_FLUSH_ALL); +} + +static inline void _tlbiel_kernel_broadcast(void) +{ + on_each_cpu(do_tlbiel_kernel, NULL, 1); + if (tlbie_capable) { + /* + * Coherent accelerators don't refcount kernel memory mappings, + * so have to always issue a tlbie for them. This is quite a + * slow path anyway. + */ + _tlbie_pid(0, RIC_FLUSH_ALL); + } +} + /* * If kernel TLBIs ever become local rather than global, then * drivers/misc/ocxl/link.c:ocxl_link_add_pe will need some work, as it @@ -623,7 +746,10 @@ EXPORT_SYMBOL(radix__flush_tlb_page); */ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) { - _tlbie_pid(0, RIC_FLUSH_ALL); + if (cputlb_use_tlbie()) + _tlbie_pid(0, RIC_FLUSH_ALL); + else + _tlbiel_kernel_broadcast(); } EXPORT_SYMBOL(radix__flush_tlb_kernel_range); @@ -679,10 +805,14 @@ is_local: if (local) { _tlbiel_pid(pid, RIC_FLUSH_TLB); } else { - if (mm_needs_flush_escalation(mm)) - _tlbie_pid(pid, RIC_FLUSH_ALL); - else - _tlbie_pid(pid, RIC_FLUSH_TLB); + if (cputlb_use_tlbie()) { + if (mm_needs_flush_escalation(mm)) + _tlbie_pid(pid, RIC_FLUSH_ALL); + else + _tlbie_pid(pid, RIC_FLUSH_TLB); + } else { + _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_TLB); + } } } else { bool hflush = flush_all_sizes; @@ -707,8 +837,8 @@ is_local: gflush = false; } - asm volatile("ptesync": : :"memory"); if (local) { + asm volatile("ptesync": : :"memory"); __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize); if (hflush) __tlbiel_va_range(hstart, hend, pid, @@ -717,7 +847,8 @@ is_local: __tlbiel_va_range(gstart, gend, pid, PUD_SIZE, MMU_PAGE_1G); asm volatile("ptesync": : :"memory"); - } else { + } else if (cputlb_use_tlbie()) { + asm volatile("ptesync": : :"memory"); __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize); if (hflush) __tlbie_va_range(hstart, hend, pid, @@ -727,6 +858,15 @@ is_local: PUD_SIZE, MMU_PAGE_1G); fixup_tlbie(); asm volatile("eieio; tlbsync; ptesync": : :"memory"); + } else { + _tlbiel_va_range_multicast(mm, + start, end, pid, page_size, mmu_virtual_psize, false); + if (hflush) + _tlbiel_va_range_multicast(mm, + hstart, hend, pid, PMD_SIZE, MMU_PAGE_2M, false); + if (gflush) + _tlbiel_va_range_multicast(mm, + gstart, gend, pid, PUD_SIZE, MMU_PAGE_1G, false); } } preempt_enable(); @@ -903,16 +1043,26 @@ is_local: if (local) { _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); } else { - if (mm_needs_flush_escalation(mm)) - also_pwc = true; + if (cputlb_use_tlbie()) { + if (mm_needs_flush_escalation(mm)) + also_pwc = true; + + _tlbie_pid(pid, + also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); + } else { + _tlbiel_pid_multicast(mm, pid, + also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); + } - _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); } } else { if (local) _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc); - else + else if (cputlb_use_tlbie()) _tlbie_va_range(start, end, pid, page_size, psize, also_pwc); + else + _tlbiel_va_range_multicast(mm, + start, end, pid, page_size, psize, also_pwc); } preempt_enable(); } @@ -954,7 +1104,11 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) exit_flush_lazy_tlbs(mm); goto local; } - _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); + if (cputlb_use_tlbie()) + _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); + else + _tlbiel_va_range_multicast(mm, + addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); } else { local: _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); diff --git a/drivers/misc/cxl/main.c b/drivers/misc/cxl/main.c index 482a2c1b340a..43b312d06e3e 100644 --- a/drivers/misc/cxl/main.c +++ b/drivers/misc/cxl/main.c @@ -18,6 +18,7 @@ #include #include +#include #include #include "cxl.h" @@ -315,6 +316,9 @@ static int __init init_cxl(void) { int rc = 0; + if (!tlbie_capable) + return -EINVAL; + if ((rc = cxl_file_init())) return rc; diff --git a/drivers/misc/ocxl/main.c b/drivers/misc/ocxl/main.c index 7210d9e059be..ef73cf35dda2 100644 --- a/drivers/misc/ocxl/main.c +++ b/drivers/misc/ocxl/main.c @@ -2,12 +2,16 @@ // Copyright 2017 IBM Corp. #include #include +#include #include "ocxl_internal.h" static int __init init_ocxl(void) { int rc = 0; + if (!tlbie_capable) + return -EINVAL; + rc = ocxl_file_init(); if (rc) return rc; -- cgit v1.2.3 From 41ba17f20ea835c489e77bd54e2da73184e22060 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Tue, 27 Aug 2019 15:46:35 +0530 Subject: powerpc/imc: Dont create debugfs files for cpu-less nodes Commit <684d984038aa> ('powerpc/powernv: Add debugfs interface for imc-mode and imc') added debugfs interface for the nest imc pmu devices to support changing of different ucode modes. Primarily adding this capability for debug. But when doing so, the code did not consider the case of cpu-less nodes. So when reading the _cmd_ or _mode_ file of a cpu-less node will create this crash. Faulting instruction address: 0xc0000000000d0d58 Oops: Kernel access of bad area, sig: 11 [#1] ... CPU: 67 PID: 5301 Comm: cat Not tainted 5.2.0-rc6-next-20190627+ #19 NIP: c0000000000d0d58 LR: c00000000049aa18 CTR:c0000000000d0d50 REGS: c00020194548f9e0 TRAP: 0300 Not tainted (5.2.0-rc6-next-20190627+) MSR: 9000000000009033 CR:28022822 XER: 00000000 CFAR: c00000000049aa14 DAR: 000000000003fc08 DSISR:40000000 IRQMASK: 0 ... NIP imc_mem_get+0x8/0x20 LR simple_attr_read+0x118/0x170 Call Trace: simple_attr_read+0x70/0x170 (unreliable) debugfs_attr_read+0x6c/0xb0 __vfs_read+0x3c/0x70 vfs_read+0xbc/0x1a0 ksys_read+0x7c/0x140 system_call+0x5c/0x70 Patch fixes the issue with a more robust check for vbase to NULL. Before patch, ls output for the debugfs imc directory # ls /sys/kernel/debug/powerpc/imc/ imc_cmd_0 imc_cmd_251 imc_cmd_253 imc_cmd_255 imc_mode_0 imc_mode_251 imc_mode_253 imc_mode_255 imc_cmd_250 imc_cmd_252 imc_cmd_254 imc_cmd_8 imc_mode_250 imc_mode_252 imc_mode_254 imc_mode_8 After patch, ls output for the debugfs imc directory # ls /sys/kernel/debug/powerpc/imc/ imc_cmd_0 imc_cmd_8 imc_mode_0 imc_mode_8 Actual bug here is that, we have two loops with potentially different loop counts. That is, in imc_get_mem_addr_nest(), loop count is obtained from the dt entries. But in case of export_imc_mode_and_cmd(), loop was based on for_each_nid() count. Patch fixes the loop count in latter based on the struct mem_info. Ideally it would be better to have array size in struct imc_pmu. Fixes: 684d984038aa ('powerpc/powernv: Add debugfs interface for imc-mode and imc') Reported-by: Qian Cai Suggested-by: Michael Ellerman Signed-off-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190827101635.6942-1-maddy@linux.vnet.ibm.com --- arch/powerpc/platforms/powernv/opal-imc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 186109bdd41b..e04b20625cb9 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -53,9 +53,9 @@ static void export_imc_mode_and_cmd(struct device_node *node, struct imc_pmu *pmu_ptr) { static u64 loc, *imc_mode_addr, *imc_cmd_addr; - int chip = 0, nid; char mode[16], cmd[16]; u32 cb_offset; + struct imc_mem_info *ptr = pmu_ptr->mem_info; imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root); @@ -69,20 +69,20 @@ static void export_imc_mode_and_cmd(struct device_node *node, if (of_property_read_u32(node, "cb_offset", &cb_offset)) cb_offset = IMC_CNTL_BLK_OFFSET; - for_each_node(nid) { - loc = (u64)(pmu_ptr->mem_info[chip].vbase) + cb_offset; + while (ptr->vbase != NULL) { + loc = (u64)(ptr->vbase) + cb_offset; imc_mode_addr = (u64 *)(loc + IMC_CNTL_BLK_MODE_OFFSET); - sprintf(mode, "imc_mode_%d", nid); + sprintf(mode, "imc_mode_%d", (u32)(ptr->id)); if (!imc_debugfs_create_x64(mode, 0600, imc_debugfs_parent, imc_mode_addr)) goto err; imc_cmd_addr = (u64 *)(loc + IMC_CNTL_BLK_CMD_OFFSET); - sprintf(cmd, "imc_cmd_%d", nid); + sprintf(cmd, "imc_cmd_%d", (u32)(ptr->id)); if (!imc_debugfs_create_x64(cmd, 0600, imc_debugfs_parent, imc_cmd_addr)) goto err; - chip++; + ptr++; } return; -- cgit v1.2.3 From 67c87892e2e17b7083cb8b4289ed8ff69ad9ac1e Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Tue, 13 Aug 2019 15:12:12 +1000 Subject: powerpc: Remove empty comment Commit 2874c5fd2842 ("treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 152") left an empty comment in machdep.h, as the boilerplate was the only text in the comment. Remove the empty comment. Signed-off-by: Jordan Niethe Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190813051212.6387-1-jniethe5@gmail.com --- arch/powerpc/include/asm/machdep.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 657ec893bdcb..7bcb64444a39 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -3,9 +3,6 @@ #define _ASM_POWERPC_MACHDEP_H #ifdef __KERNEL__ -/* - */ - #include #include #include -- cgit v1.2.3 From aa497d4352414aad22e792b35d0aaaa12bbc37c5 Mon Sep 17 00:00:00 2001 From: Segher Boessenkool Date: Wed, 4 Sep 2019 14:11:07 +0000 Subject: powerpc: Add attributes for setjmp/longjmp The setjmp function should be declared as "returns_twice", or bad things can happen[1]. This does not actually change generated code in my testing. The longjmp function should be declared as "noreturn", so that the compiler can optimise calls to it better. This makes the generated code a little shorter. 1: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-returns_005ftwice-function-attribute Signed-off-by: Segher Boessenkool Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c02ce4a573f3bac907e2c70957a2d1275f910013.1567605586.git.segher@kernel.crashing.org --- arch/powerpc/include/asm/setjmp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/setjmp.h b/arch/powerpc/include/asm/setjmp.h index d995061f5f86..e9f81bb3f83b 100644 --- a/arch/powerpc/include/asm/setjmp.h +++ b/arch/powerpc/include/asm/setjmp.h @@ -7,7 +7,7 @@ #define JMP_BUF_LEN 23 -extern long setjmp(long *); -extern void longjmp(long *, long); +extern long setjmp(long *) __attribute__((returns_twice)); +extern void longjmp(long *, long) __attribute__((noreturn)); #endif /* _ASM_POWERPC_SETJMP_H */ -- cgit v1.2.3 From 20055a8bfaaa75f2cb9c23ecc8ab12b4abd8dc84 Mon Sep 17 00:00:00 2001 From: Santosh Sivaraj Date: Wed, 4 Sep 2019 03:13:58 +0530 Subject: powerpc/memcpy: Fix stack corruption for smaller sizes For sizes lesser than 128 bytes, the code branches out early without saving the stack frame, which when restored later drops frame of the caller. Tested-by: Aneesh Kumar K.V Signed-off-by: Santosh Sivaraj Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190903214359.23887-1-santosh@fossix.org --- arch/powerpc/lib/memcpy_mcsafe_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S b/arch/powerpc/lib/memcpy_mcsafe_64.S index 949976dc115d..cb882d9a6d8a 100644 --- a/arch/powerpc/lib/memcpy_mcsafe_64.S +++ b/arch/powerpc/lib/memcpy_mcsafe_64.S @@ -84,7 +84,6 @@ err1; stw r0,0(r3) 3: sub r5,r5,r6 cmpldi r5,128 - blt 5f mflr r0 stdu r1,-STACKFRAMESIZE(r1) @@ -99,6 +98,7 @@ err1; stw r0,0(r3) std r22,STK_REG(R22)(r1) std r0,STACKFRAMESIZE+16(r1) + blt 5f srdi r6,r5,7 mtctr r6 -- cgit v1.2.3 From 6f62a8223e65c0571e48225d5d7e56de95225bae Mon Sep 17 00:00:00 2001 From: Santosh Sivaraj Date: Wed, 4 Sep 2019 03:13:59 +0530 Subject: seltests/powerpc: Add a selftest for memcpy_mcsafe Appropriate self tests for memcpy_mcsafe Suggested-by: Michael Ellerman Signed-off-by: Santosh Sivaraj Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190903214359.23887-2-santosh@fossix.org --- tools/testing/selftests/powerpc/copyloops/.gitignore | 1 + tools/testing/selftests/powerpc/copyloops/Makefile | 7 ++++++- tools/testing/selftests/powerpc/copyloops/asm/export.h | 1 + tools/testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S | 1 + 4 files changed, 9 insertions(+), 1 deletion(-) create mode 120000 tools/testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S diff --git a/tools/testing/selftests/powerpc/copyloops/.gitignore b/tools/testing/selftests/powerpc/copyloops/.gitignore index de158104912a..12ef5b031974 100644 --- a/tools/testing/selftests/powerpc/copyloops/.gitignore +++ b/tools/testing/selftests/powerpc/copyloops/.gitignore @@ -11,3 +11,4 @@ memcpy_p7_t1 copyuser_64_exc_t0 copyuser_64_exc_t1 copyuser_64_exc_t2 +memcpy_mcsafe_64 diff --git a/tools/testing/selftests/powerpc/copyloops/Makefile b/tools/testing/selftests/powerpc/copyloops/Makefile index 44574f3818b3..0917983a1c78 100644 --- a/tools/testing/selftests/powerpc/copyloops/Makefile +++ b/tools/testing/selftests/powerpc/copyloops/Makefile @@ -12,7 +12,7 @@ ASFLAGS = $(CFLAGS) -Wa,-mpower4 TEST_GEN_PROGS := copyuser_64_t0 copyuser_64_t1 copyuser_64_t2 \ copyuser_p7_t0 copyuser_p7_t1 \ memcpy_64_t0 memcpy_64_t1 memcpy_64_t2 \ - memcpy_p7_t0 memcpy_p7_t1 \ + memcpy_p7_t0 memcpy_p7_t1 memcpy_mcsafe_64 \ copyuser_64_exc_t0 copyuser_64_exc_t1 copyuser_64_exc_t2 EXTRA_SOURCES := validate.c ../harness.c stubs.S @@ -45,6 +45,11 @@ $(OUTPUT)/memcpy_p7_t%: memcpy_power7.S $(EXTRA_SOURCES) -D SELFTEST_CASE=$(subst memcpy_p7_t,,$(notdir $@)) \ -o $@ $^ +$(OUTPUT)/memcpy_mcsafe_64: memcpy_mcsafe_64.S $(EXTRA_SOURCES) + $(CC) $(CPPFLAGS) $(CFLAGS) \ + -D COPY_LOOP=test_memcpy_mcsafe \ + -o $@ $^ + $(OUTPUT)/copyuser_64_exc_t%: copyuser_64.S exc_validate.c ../harness.c \ copy_tofrom_user_reference.S stubs.S $(CC) $(CPPFLAGS) $(CFLAGS) \ diff --git a/tools/testing/selftests/powerpc/copyloops/asm/export.h b/tools/testing/selftests/powerpc/copyloops/asm/export.h index 05c1663c89b0..e6b80d5fbd14 100644 --- a/tools/testing/selftests/powerpc/copyloops/asm/export.h +++ b/tools/testing/selftests/powerpc/copyloops/asm/export.h @@ -1,3 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 */ #define EXPORT_SYMBOL(x) +#define EXPORT_SYMBOL_GPL(x) #define EXPORT_SYMBOL_KASAN(x) diff --git a/tools/testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S b/tools/testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S new file mode 120000 index 000000000000..f0feef3062f6 --- /dev/null +++ b/tools/testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S @@ -0,0 +1 @@ +../../../../../arch/powerpc/lib/memcpy_mcsafe_64.S \ No newline at end of file -- cgit v1.2.3 From b4d37a7b6934c0c16930c73f43c08e9a6af22f1a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Sep 2019 18:51:47 +0200 Subject: powerpc/powernv: Remove unused pnv_npu_try_dma_set_bypass() function Neither pnv_npu_try_dma_set_bypass() nor the pnv_npu_dma_set_32() and pnv_npu_dma_set_bypass() helpers called by it are used anywhere in the kernel tree, so remove them. mpe: They're unused since 2d6ad41b2c21 ("powerpc/powernv: use the generic iommu bypass code") removed the last usage. Signed-off-by: Christoph Hellwig Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190903165147.11099-1-hch@lst.de --- arch/powerpc/platforms/powernv/npu-dma.c | 99 -------------------------------- 1 file changed, 99 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index c16249d251f1..a570a249edc3 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -192,105 +192,6 @@ static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num) return 0; } -/* - * Enables 32 bit DMA on NPU. - */ -static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe) -{ - struct pci_dev *gpdev; - struct pnv_ioda_pe *gpe; - int64_t rc; - - /* - * Find the assoicated PCI devices and get the dma window - * information from there. - */ - if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV)) - return; - - gpe = get_gpu_pci_dev_and_pe(npe, &gpdev); - if (!gpe) - return; - - rc = pnv_npu_set_window(&npe->table_group, 0, - gpe->table_group.tables[0]); - - /* - * NVLink devices use the same TCE table configuration as - * their parent device so drivers shouldn't be doing DMA - * operations directly on these devices. - */ - set_dma_ops(&npe->pdev->dev, &dma_dummy_ops); -} - -/* - * Enables bypass mode on the NPU. The NPU only supports one - * window per link, so bypass needs to be explicitly enabled or - * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be - * active at the same time. - */ -static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe) -{ - struct pnv_phb *phb = npe->phb; - int64_t rc = 0; - phys_addr_t top = memblock_end_of_DRAM(); - - if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev) - return -EINVAL; - - rc = pnv_npu_unset_window(&npe->table_group, 0); - if (rc != OPAL_SUCCESS) - return rc; - - /* Enable the bypass window */ - - top = roundup_pow_of_two(top); - dev_info(&npe->pdev->dev, "Enabling bypass for PE %x\n", - npe->pe_number); - rc = opal_pci_map_pe_dma_window_real(phb->opal_id, - npe->pe_number, npe->pe_number, - 0 /* bypass base */, top); - - if (rc == OPAL_SUCCESS) - pnv_pci_ioda2_tce_invalidate_entire(phb, false); - - return rc; -} - -void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass) -{ - int i; - struct pnv_phb *phb; - struct pci_dn *pdn; - struct pnv_ioda_pe *npe; - struct pci_dev *npdev; - - for (i = 0; ; ++i) { - npdev = pnv_pci_get_npu_dev(gpdev, i); - - if (!npdev) - break; - - pdn = pci_get_pdn(npdev); - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) - return; - - phb = pci_bus_to_host(npdev->bus)->private_data; - - /* We only do bypass if it's enabled on the linked device */ - npe = &phb->ioda.pe_array[pdn->pe_number]; - - if (bypass) { - dev_info(&npdev->dev, - "Using 64-bit DMA iommu bypass\n"); - pnv_npu_dma_set_bypass(npe); - } else { - dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n"); - pnv_npu_dma_set_32(npe); - } - } -} - #ifdef CONFIG_IOMMU_API /* Switch ownership from platform code to external user (e.g. VFIO) */ static void pnv_npu_take_ownership(struct iommu_table_group *table_group) -- cgit v1.2.3 From 2be1d5d147955e6aea273dc73a9f0ae4510fd225 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 26 Aug 2019 12:27:00 +0530 Subject: powerpc/powernv: Enhance opal message read interface Use "opal-msg-size" device tree property to allocate memory for "opal_msg". Signed-off-by: Vasant Hegde [mpe: s/uint32_t/u32/ and mark opal_msg_size as __ro_after_init] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190826065701.8853-1-hegdevasant@linux.vnet.ibm.com --- arch/powerpc/platforms/powernv/opal.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index d271accf224b..38e90270280b 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -58,6 +58,8 @@ static DEFINE_SPINLOCK(opal_write_lock); static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX]; static uint32_t opal_heartbeat; static struct task_struct *kopald_tsk; +static struct opal_msg *opal_msg; +static u32 opal_msg_size __ro_after_init; void opal_configure_cores(void) { @@ -271,14 +273,9 @@ static void opal_message_do_notify(uint32_t msg_type, void *msg) static void opal_handle_message(void) { s64 ret; - /* - * TODO: pre-allocate a message buffer depending on opal-msg-size - * value in /proc/device-tree. - */ - static struct opal_msg msg; u32 type; - ret = opal_get_msg(__pa(&msg), sizeof(msg)); + ret = opal_get_msg(__pa(opal_msg), opal_msg_size); /* No opal message pending. */ if (ret == OPAL_RESOURCE) return; @@ -290,14 +287,14 @@ static void opal_handle_message(void) return; } - type = be32_to_cpu(msg.msg_type); + type = be32_to_cpu(opal_msg->msg_type); /* Sanity check */ if (type >= OPAL_MSG_TYPE_MAX) { pr_warn_once("%s: Unknown message type: %u\n", __func__, type); return; } - opal_message_do_notify(type, (void *)&msg); + opal_message_do_notify(type, (void *)opal_msg); } static irqreturn_t opal_message_notify(int irq, void *data) @@ -306,10 +303,24 @@ static irqreturn_t opal_message_notify(int irq, void *data) return IRQ_HANDLED; } -static int __init opal_message_init(void) +static int __init opal_message_init(struct device_node *opal_node) { int ret, i, irq; + ret = of_property_read_u32(opal_node, "opal-msg-size", &opal_msg_size); + if (ret) { + pr_notice("Failed to read opal-msg-size property\n"); + opal_msg_size = sizeof(struct opal_msg); + } + + opal_msg = kmalloc(opal_msg_size, GFP_KERNEL); + if (!opal_msg) { + opal_msg_size = sizeof(struct opal_msg); + /* Try to allocate fixed message size */ + opal_msg = kmalloc(opal_msg_size, GFP_KERNEL); + BUG_ON(opal_msg == NULL); + } + for (i = 0; i < OPAL_MSG_TYPE_MAX; i++) ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]); @@ -913,7 +924,7 @@ static int __init opal_init(void) } /* Initialise OPAL messaging system */ - opal_message_init(); + opal_message_init(opal_node); /* Initialise OPAL asynchronous completion interface */ opal_async_comp_init(); -- cgit v1.2.3 From 587164cd593c2cbdad376179f85159e31221989e Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 26 Aug 2019 12:27:01 +0530 Subject: powerpc/powernv: Add new opal message type We have OPAL_MSG_PRD message type to pass prd related messages from OPAL to `opal-prd`. It can handle messages upto 64 bytes. We have a requirement to send bigger than 64 bytes of data from OPAL to `opal-prd`. Lets add new message type (OPAL_MSG_PRD2) to pass bigger data. Signed-off-by: Vasant Hegde [mpe: Make the error string clear that it's the PRD2 event that failed] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190826065701.8853-2-hegdevasant@linux.vnet.ibm.com --- arch/powerpc/include/asm/opal-api.h | 1 + arch/powerpc/platforms/powernv/opal-prd.c | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 383242eb0dea..1cad413e1e0e 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -453,6 +453,7 @@ enum opal_msg_type { OPAL_MSG_DPO = 5, OPAL_MSG_PRD = 6, OPAL_MSG_OCC = 7, + OPAL_MSG_PRD2 = 8, OPAL_MSG_TYPE_MAX, }; diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c index e072bf157d62..45f4223a790f 100644 --- a/arch/powerpc/platforms/powernv/opal-prd.c +++ b/arch/powerpc/platforms/powernv/opal-prd.c @@ -342,7 +342,7 @@ static int opal_prd_msg_notifier(struct notifier_block *nb, int msg_size, item_size; unsigned long flags; - if (msg_type != OPAL_MSG_PRD) + if (msg_type != OPAL_MSG_PRD && msg_type != OPAL_MSG_PRD2) return 0; /* Calculate total size of the message and item we need to store. The @@ -393,6 +393,12 @@ static int opal_prd_probe(struct platform_device *pdev) return rc; } + rc = opal_message_notifier_register(OPAL_MSG_PRD2, &opal_prd_event_nb); + if (rc) { + pr_err("Couldn't register PRD2 event notifier\n"); + return rc; + } + rc = misc_register(&opal_prd_dev); if (rc) { pr_err("failed to register miscdev\n"); -- cgit v1.2.3 From bc01bdf6c5df5023272a7399962cf64f8fedc93e Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 10 Sep 2019 18:45:13 +0530 Subject: powerpc/watchpoint: Disable watchpoint hit by larx/stcx instructions If watchpoint exception is generated by larx/stcx instructions, the reservation created by larx gets lost while handling exception, and thus stcx instruction always fails. Generally these instructions are used in a while(1) loop, for example spinlocks. And because stcx never succeeds, it loops forever and ultimately hangs the system. Note that ptrace anyway works in one-shot mode and thus for ptrace we don't change the behaviour. It's up to ptrace user to take care of this. Signed-off-by: Ravi Bangoria Acked-by: Naveen N. Rao Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190910131513.30499-1-ravi.bangoria@linux.ibm.com --- arch/powerpc/kernel/hw_breakpoint.c | 49 +++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 28ad3171bb82..1007ec36b4cb 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -195,14 +195,32 @@ void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs) tsk->thread.last_hit_ubp = NULL; } +static bool is_larx_stcx_instr(struct pt_regs *regs, unsigned int instr) +{ + int ret, type; + struct instruction_op op; + + ret = analyse_instr(&op, regs, instr); + type = GETTYPE(op.type); + return (!ret && (type == LARX || type == STCX)); +} + /* * Handle debug exception notifications. */ static bool stepping_handler(struct pt_regs *regs, struct perf_event *bp, unsigned long addr) { - int stepped; - unsigned int instr; + unsigned int instr = 0; + + if (__get_user_inatomic(instr, (unsigned int *)regs->nip)) + goto fail; + + if (is_larx_stcx_instr(regs, instr)) { + printk_ratelimited("Breakpoint hit on instruction that can't be emulated." + " Breakpoint at 0x%lx will be disabled.\n", addr); + goto disable; + } /* Do not emulate user-space instructions, instead single-step them */ if (user_mode(regs)) { @@ -211,23 +229,22 @@ static bool stepping_handler(struct pt_regs *regs, struct perf_event *bp, return false; } - stepped = 0; - instr = 0; - if (!__get_user_inatomic(instr, (unsigned int *)regs->nip)) - stepped = emulate_step(regs, instr); + if (!emulate_step(regs, instr)) + goto fail; + return true; + +fail: /* - * emulate_step() could not execute it. We've failed in reliably - * handling the hw-breakpoint. Unregister it and throw a warning - * message to let the user know about it. + * We've failed in reliably handling the hw-breakpoint. Unregister + * it and throw a warning message to let the user know about it. */ - if (!stepped) { - WARN(1, "Unable to handle hardware breakpoint. Breakpoint at " - "0x%lx will be disabled.", addr); - perf_event_disable_inatomic(bp); - return false; - } - return true; + WARN(1, "Unable to handle hardware breakpoint. Breakpoint at " + "0x%lx will be disabled.", addr); + +disable: + perf_event_disable_inatomic(bp); + return false; } int hw_breakpoint_handler(struct die_args *args) -- cgit v1.2.3 From 92c94dfb69e350471473fd3075c74bc68150879e Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 10 Sep 2019 17:52:44 -0500 Subject: powerpc/pseries: correctly track irq state in default idle prep_irq_for_idle() is intended to be called before entering H_CEDE (and it is used by the pseries cpuidle driver). However the default pseries idle routine does not call it, leading to mismanaged lazy irq state when the cpuidle driver isn't in use. Manifestations of this include: * Dropped IPIs in the time immediately after a cpu comes online (before it has installed the cpuidle handler), making the online operation block indefinitely waiting for the new cpu to respond. * Hitting this WARN_ON in arch_local_irq_restore(): /* * We should already be hard disabled here. We had bugs * where that wasn't the case so let's dbl check it and * warn if we are wrong. Only do that when IRQ tracing * is enabled as mfmsr() can be costly. */ if (WARN_ON_ONCE(mfmsr() & MSR_EE)) __hard_irq_disable(); Call prep_irq_for_idle() from pseries_lpar_idle() and honor its result. Fixes: 363edbe2614a ("powerpc: Default arch idle could cede processor on pseries") Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190910225244.25056-1-nathanl@linux.ibm.com --- arch/powerpc/platforms/pseries/setup.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index b955d54628ff..f8adcd0e4589 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -321,6 +321,9 @@ static void pseries_lpar_idle(void) * low power mode by ceding processor to hypervisor */ + if (!prep_irq_for_idle()) + return; + /* Indicate to hypervisor that we are idle. */ get_lppaca()->idle = 1; -- cgit v1.2.3 From 6ccb4ac2bf8a35c694ead92f8ac5530a16e8f2c8 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Wed, 11 Sep 2019 17:52:18 +0200 Subject: powerpc/xive: Fix bogus error code returned by OPAL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's a bug in skiboot that causes the OPAL_XIVE_ALLOCATE_IRQ call to return the 32-bit value 0xffffffff when OPAL has run out of IRQs. Unfortunatelty, OPAL return values are signed 64-bit entities and errors are supposed to be negative. If that happens, the linux code confusingly treats 0xffffffff as a valid IRQ number and panics at some point. A fix was recently merged in skiboot: e97391ae2bb5 ("xive: fix return value of opal_xive_allocate_irq()") but we need a workaround anyway to support older skiboots already in the field. Internally convert 0xffffffff to OPAL_RESOURCE which is the usual error returned upon resource exhaustion. Cc: stable@vger.kernel.org # v4.12+ Signed-off-by: Greg Kurz Reviewed-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821713818.1985334.14123187368108582810.stgit@bahia.lan --- arch/powerpc/include/asm/opal.h | 2 +- arch/powerpc/platforms/powernv/opal-call.c | 2 +- arch/powerpc/sysdev/xive/native.c | 11 +++++++++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 57bd029c715e..d5a0807d21db 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -272,7 +272,7 @@ int64_t opal_xive_get_vp_info(uint64_t vp, int64_t opal_xive_set_vp_info(uint64_t vp, uint64_t flags, uint64_t report_cl_pair); -int64_t opal_xive_allocate_irq(uint32_t chip_id); +int64_t opal_xive_allocate_irq_raw(uint32_t chip_id); int64_t opal_xive_free_irq(uint32_t girq); int64_t opal_xive_sync(uint32_t type, uint32_t id); int64_t opal_xive_dump(uint32_t type, uint32_t id); diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c index 29ca523c1c79..dccdc9df5213 100644 --- a/arch/powerpc/platforms/powernv/opal-call.c +++ b/arch/powerpc/platforms/powernv/opal-call.c @@ -257,7 +257,7 @@ OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO); OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE); OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK); OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK); -OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ); +OPAL_CALL(opal_xive_allocate_irq_raw, OPAL_XIVE_ALLOCATE_IRQ); OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ); OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 4b61e44f0171..e9481468ebd8 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -245,6 +245,17 @@ static bool xive_native_match(struct device_node *node) return of_device_is_compatible(node, "ibm,opal-xive-vc"); } +static s64 opal_xive_allocate_irq(u32 chip_id) +{ + s64 irq = opal_xive_allocate_irq_raw(chip_id); + + /* + * Old versions of skiboot can incorrectly return 0xffffffff to + * indicate no space, fix it up here. + */ + return irq == 0xffffffff ? OPAL_RESOURCE : irq; +} + #ifdef CONFIG_SMP static int xive_native_get_ipi(unsigned int cpu, struct xive_cpu *xc) { -- cgit v1.2.3 From 1b7f3b6c43675ef2cfb9d8f48bde057794820f7c Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 13 Sep 2019 23:32:13 +1000 Subject: powerpc/eeh: Fix build with STACKTRACE=n The build breaks when STACKTRACE=n, eg. skiroot_defconfig: arch/powerpc/kernel/eeh_event.c:124:23: error: implicit declaration of function 'stack_trace_save' Fix it with some ifdefs for now. Fixes: 25baf3d81614 ("powerpc/eeh: Defer printing stack trace") Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/eeh.h | 2 ++ arch/powerpc/kernel/eeh_driver.c | 3 ++- arch/powerpc/kernel/eeh_event.c | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 9d0e1694a94d..6f9b2a12540a 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -89,6 +89,7 @@ struct eeh_pe { struct list_head child; /* Memb. child_list/eeh_phb_pe */ struct list_head edevs; /* List of eeh_dev in this PE */ +#ifdef CONFIG_STACKTRACE /* * Saved stack trace. When we find a PE freeze in eeh_dev_check_failure * the stack trace is saved here so we can print it in the recovery @@ -99,6 +100,7 @@ struct eeh_pe { */ unsigned long stack_trace[64]; int trace_entries; +#endif /* CONFIG_STACKTRACE */ }; #define eeh_pe_for_each_dev(pe, edev, tmp) \ diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 80bd157fcb45..d9279d0ee9f5 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -908,6 +908,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe)); } +#ifdef CONFIG_STACKTRACE /* * Print the saved stack trace now that we've verified there's * something to recover. @@ -926,7 +927,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) pe->trace_entries = 0; } - +#endif /* CONFIG_STACKTRACE */ eeh_pe_update_time_stamp(pe); pe->freeze_count++; diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c index 1d55486adb0f..a7a8dc182efb 100644 --- a/arch/powerpc/kernel/eeh_event.c +++ b/arch/powerpc/kernel/eeh_event.c @@ -117,12 +117,14 @@ int __eeh_send_failure_event(struct eeh_pe *pe) * while the PE is sitting in the event queue. */ if (pe) { +#ifdef CONFIG_STACKTRACE /* * Save the current stack trace so we can dump it from the * event handler thread. */ pe->trace_entries = stack_trace_save(pe->stack_trace, ARRAY_SIZE(pe->stack_trace), 0); +#endif /* CONFIG_STACKTRACE */ eeh_pe_state_mark(pe, EEH_PE_RECOVERING); } -- cgit v1.2.3 From 79cb6879135b7441f0955ca3f86be8c8d21c7199 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 13 Sep 2019 23:50:06 +1000 Subject: powerpc/powernv: Fix build with IOMMU_API=n The builds breaks when IOMMU_API=n, eg. skiroot_defconfig: arch/powerpc/platforms/powernv/npu-dma.c:96:28: error: 'get_gpu_pci_dev_and_pe' defined but not used arch/powerpc/platforms/powernv/npu-dma.c:126:13: error: 'pnv_npu_set_window' defined but not used Fixes: b4d37a7b6934 ("powerpc/powernv: Remove unused pnv_npu_try_dma_set_bypass() function") Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/npu-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index a570a249edc3..b95b9e3c4c98 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -89,6 +89,7 @@ struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index) } EXPORT_SYMBOL(pnv_pci_get_npu_dev); +#ifdef CONFIG_IOMMU_API /* * Returns the PE assoicated with the PCI device of the given * NPU. Returns the linked pci device if pci_dev != NULL. @@ -192,7 +193,6 @@ static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num) return 0; } -#ifdef CONFIG_IOMMU_API /* Switch ownership from platform code to external user (e.g. VFIO) */ static void pnv_npu_take_ownership(struct iommu_table_group *table_group) { -- cgit v1.2.3 From 0cb0837f9db1a6ed5b764ef61dd5f1a314b8231a Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 11 Sep 2019 21:57:43 +1000 Subject: powerpc/kvm: Move kvm_tmp into .text, shrink to 64K In some configurations of KVM, guests binary patch themselves to avoid/reduce trapping into the hypervisor. For some instructions this requires replacing one instruction with a sequence of instructions. For those cases we need to write the sequence of instructions somewhere and then patch the location of the original instruction to branch to the sequence. That requires that the location of the sequence be within 32MB of the original instruction. The current solution for this is that we create a 1MB array in BSS, write sequences into there, and then free the remainder of the array. This has a few problems: - it confuses kmemleak. - it confuses lockdep. - it requires mapping kvm_tmp executable, which can cause adjacent areas to also be mapped executable if we're using 16M pages for the linear mapping. - the 32MB limit can be exceeded if the kernel is big enough, especially with STRICT_KERNEL_RWX enabled, which then prevents the patching from working at all. We can fix all those problems by making kvm_tmp just a region of regular .text. However currently it's 1MB in size, and we don't want to waste 1MB of text. In practice however I only see ~30KB of kvm_tmp being used even for an allyes_config. So shrink kvm_tmp to 64K, which ought to be enough for everyone, and move it into .text. Signed-off-by: Michael Ellerman Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190911115746.12433-1-mpe@ellerman.id.au --- arch/powerpc/kernel/kvm.c | 24 +++++------------------- arch/powerpc/kernel/kvm_emul.S | 8 ++++++++ 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index b7b3a5e4e224..e3b5aa583319 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -64,7 +64,8 @@ #define KVM_INST_MTSRIN 0x7c0001e4 static bool kvm_patching_worked = true; -char kvm_tmp[1024 * 1024]; +extern char kvm_tmp[]; +extern char kvm_tmp_end[]; static int kvm_tmp_index; static inline void kvm_patch_ins(u32 *inst, u32 new_inst) @@ -132,7 +133,7 @@ static u32 *kvm_alloc(int len) { u32 *p; - if ((kvm_tmp_index + len) > ARRAY_SIZE(kvm_tmp)) { + if ((kvm_tmp_index + len) > (kvm_tmp_end - kvm_tmp)) { printk(KERN_ERR "KVM: No more space (%d + %d)\n", kvm_tmp_index, len); kvm_patching_worked = false; @@ -699,25 +700,13 @@ static void kvm_use_magic_page(void) kvm_patching_worked ? "worked" : "failed"); } -static __init void kvm_free_tmp(void) -{ - /* - * Inform kmemleak about the hole in the .bss section since the - * corresponding pages will be unmapped with DEBUG_PAGEALLOC=y. - */ - kmemleak_free_part(&kvm_tmp[kvm_tmp_index], - ARRAY_SIZE(kvm_tmp) - kvm_tmp_index); - free_reserved_area(&kvm_tmp[kvm_tmp_index], - &kvm_tmp[ARRAY_SIZE(kvm_tmp)], -1, NULL); -} - static int __init kvm_guest_init(void) { if (!kvm_para_available()) - goto free_tmp; + return 0; if (!epapr_paravirt_enabled) - goto free_tmp; + return 0; if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE)) kvm_use_magic_page(); @@ -727,9 +716,6 @@ static int __init kvm_guest_init(void) powersave_nap = 1; #endif -free_tmp: - kvm_free_tmp(); - return 0; } diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S index eb2568f583ae..9dd17dce10a1 100644 --- a/arch/powerpc/kernel/kvm_emul.S +++ b/arch/powerpc/kernel/kvm_emul.S @@ -334,5 +334,13 @@ kvm_emulate_mtsrin_orig_ins_offs: kvm_emulate_mtsrin_len: .long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4 + .balign 4 + .global kvm_tmp +kvm_tmp: + .space (64 * 1024) + +.global kvm_tmp_end +kvm_tmp_end: + .global kvm_template_end kvm_template_end: -- cgit v1.2.3 From dac39f788546e2eb9838eca551ccd6fd413e75c7 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 11 Sep 2019 21:57:44 +1000 Subject: powerpc/64s: Remove overlaps_kvm_tmp() kvm_tmp is now in .text and so doesn't need a special overlap check. Signed-off-by: Michael Ellerman Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190911115746.12433-2-mpe@ellerman.id.au --- arch/powerpc/include/asm/sections.h | 11 ----------- arch/powerpc/mm/book3s64/hash_utils.c | 4 ---- 2 files changed, 15 deletions(-) diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index 4a1664a8658d..5a9b6eb651b6 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -61,17 +61,6 @@ static inline int overlaps_kernel_text(unsigned long start, unsigned long end) (unsigned long)_stext < end; } -static inline int overlaps_kvm_tmp(unsigned long start, unsigned long end) -{ -#ifdef CONFIG_KVM_GUEST - extern char kvm_tmp[]; - return start < (unsigned long)kvm_tmp && - (unsigned long)&kvm_tmp[1024 * 1024] < end; -#else - return 0; -#endif -} - #ifdef PPC64_ELF_ABI_v1 #define HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR 1 diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 7684a596158b..3410ea9f4de1 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -273,10 +273,6 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend, if (overlaps_kernel_text(vaddr, vaddr + step)) tprot &= ~HPTE_R_N; - /* Make kvm guest trampolines executable */ - if (overlaps_kvm_tmp(vaddr, vaddr + step)) - tprot &= ~HPTE_R_N; - /* * If relocatable, check if it overlaps interrupt vectors that * are copied down to real 0. For relocatable kernel -- cgit v1.2.3 From 731dade128ebc35044e7f9b9d396e4c1bed6ecbc Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 11 Sep 2019 21:57:45 +1000 Subject: powerpc/kvm: Explicitly mark kvm guest code as __init All the code in kvm.c can be marked __init. Most of it is already inlined into the initcall, but not all. So instead of relying on the inlining, mark it all as __init. This saves ~280 bytes of text for my configuration. Signed-off-by: Michael Ellerman Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190911115746.12433-3-mpe@ellerman.id.au --- arch/powerpc/kernel/kvm.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index e3b5aa583319..617eba82531c 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -68,13 +68,13 @@ extern char kvm_tmp[]; extern char kvm_tmp_end[]; static int kvm_tmp_index; -static inline void kvm_patch_ins(u32 *inst, u32 new_inst) +static void __init kvm_patch_ins(u32 *inst, u32 new_inst) { *inst = new_inst; flush_icache_range((ulong)inst, (ulong)inst + 4); } -static void kvm_patch_ins_ll(u32 *inst, long addr, u32 rt) +static void __init kvm_patch_ins_ll(u32 *inst, long addr, u32 rt) { #ifdef CONFIG_64BIT kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc)); @@ -83,7 +83,7 @@ static void kvm_patch_ins_ll(u32 *inst, long addr, u32 rt) #endif } -static void kvm_patch_ins_ld(u32 *inst, long addr, u32 rt) +static void __init kvm_patch_ins_ld(u32 *inst, long addr, u32 rt) { #ifdef CONFIG_64BIT kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc)); @@ -92,12 +92,12 @@ static void kvm_patch_ins_ld(u32 *inst, long addr, u32 rt) #endif } -static void kvm_patch_ins_lwz(u32 *inst, long addr, u32 rt) +static void __init kvm_patch_ins_lwz(u32 *inst, long addr, u32 rt) { kvm_patch_ins(inst, KVM_INST_LWZ | rt | (addr & 0x0000ffff)); } -static void kvm_patch_ins_std(u32 *inst, long addr, u32 rt) +static void __init kvm_patch_ins_std(u32 *inst, long addr, u32 rt) { #ifdef CONFIG_64BIT kvm_patch_ins(inst, KVM_INST_STD | rt | (addr & 0x0000fffc)); @@ -106,17 +106,17 @@ static void kvm_patch_ins_std(u32 *inst, long addr, u32 rt) #endif } -static void kvm_patch_ins_stw(u32 *inst, long addr, u32 rt) +static void __init kvm_patch_ins_stw(u32 *inst, long addr, u32 rt) { kvm_patch_ins(inst, KVM_INST_STW | rt | (addr & 0x0000fffc)); } -static void kvm_patch_ins_nop(u32 *inst) +static void __init kvm_patch_ins_nop(u32 *inst) { kvm_patch_ins(inst, KVM_INST_NOP); } -static void kvm_patch_ins_b(u32 *inst, int addr) +static void __init kvm_patch_ins_b(u32 *inst, int addr) { #if defined(CONFIG_RELOCATABLE) && defined(CONFIG_PPC_BOOK3S) /* On relocatable kernels interrupts handlers and our code @@ -129,7 +129,7 @@ static void kvm_patch_ins_b(u32 *inst, int addr) kvm_patch_ins(inst, KVM_INST_B | (addr & KVM_INST_B_MASK)); } -static u32 *kvm_alloc(int len) +static u32 * __init kvm_alloc(int len) { u32 *p; @@ -152,7 +152,7 @@ extern u32 kvm_emulate_mtmsrd_orig_ins_offs; extern u32 kvm_emulate_mtmsrd_len; extern u32 kvm_emulate_mtmsrd[]; -static void kvm_patch_ins_mtmsrd(u32 *inst, u32 rt) +static void __init kvm_patch_ins_mtmsrd(u32 *inst, u32 rt) { u32 *p; int distance_start; @@ -205,7 +205,7 @@ extern u32 kvm_emulate_mtmsr_orig_ins_offs; extern u32 kvm_emulate_mtmsr_len; extern u32 kvm_emulate_mtmsr[]; -static void kvm_patch_ins_mtmsr(u32 *inst, u32 rt) +static void __init kvm_patch_ins_mtmsr(u32 *inst, u32 rt) { u32 *p; int distance_start; @@ -266,7 +266,7 @@ extern u32 kvm_emulate_wrtee_orig_ins_offs; extern u32 kvm_emulate_wrtee_len; extern u32 kvm_emulate_wrtee[]; -static void kvm_patch_ins_wrtee(u32 *inst, u32 rt, int imm_one) +static void __init kvm_patch_ins_wrtee(u32 *inst, u32 rt, int imm_one) { u32 *p; int distance_start; @@ -323,7 +323,7 @@ extern u32 kvm_emulate_wrteei_0_branch_offs; extern u32 kvm_emulate_wrteei_0_len; extern u32 kvm_emulate_wrteei_0[]; -static void kvm_patch_ins_wrteei_0(u32 *inst) +static void __init kvm_patch_ins_wrteei_0(u32 *inst) { u32 *p; int distance_start; @@ -364,7 +364,7 @@ extern u32 kvm_emulate_mtsrin_orig_ins_offs; extern u32 kvm_emulate_mtsrin_len; extern u32 kvm_emulate_mtsrin[]; -static void kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb) +static void __init kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb) { u32 *p; int distance_start; @@ -400,7 +400,7 @@ static void kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb) #endif -static void kvm_map_magic_page(void *data) +static void __init kvm_map_magic_page(void *data) { u32 *features = data; @@ -415,7 +415,7 @@ static void kvm_map_magic_page(void *data) *features = out[0]; } -static void kvm_check_ins(u32 *inst, u32 features) +static void __init kvm_check_ins(u32 *inst, u32 features) { u32 _inst = *inst; u32 inst_no_rt = _inst & ~KVM_MASK_RT; @@ -659,7 +659,7 @@ static void kvm_check_ins(u32 *inst, u32 features) extern u32 kvm_template_start[]; extern u32 kvm_template_end[]; -static void kvm_use_magic_page(void) +static void __init kvm_use_magic_page(void) { u32 *p; u32 *start, *end; -- cgit v1.2.3 From caff52dc0b71538cff43c06ede3621fbbf359978 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 11 Sep 2019 21:57:46 +1000 Subject: powerpc/kvm: Add ifdefs around template code Some of the templates used for KVM patching are only used on certain platforms, but currently they are always built-in, fix that. Signed-off-by: Michael Ellerman Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190911115746.12433-4-mpe@ellerman.id.au --- arch/powerpc/kernel/kvm_emul.S | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S index 9dd17dce10a1..7af6f8b50c5d 100644 --- a/arch/powerpc/kernel/kvm_emul.S +++ b/arch/powerpc/kernel/kvm_emul.S @@ -192,6 +192,8 @@ kvm_emulate_mtmsr_orig_ins_offs: kvm_emulate_mtmsr_len: .long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4 +#ifdef CONFIG_BOOKE + /* also used for wrteei 1 */ .global kvm_emulate_wrtee kvm_emulate_wrtee: @@ -285,6 +287,10 @@ kvm_emulate_wrteei_0_branch_offs: kvm_emulate_wrteei_0_len: .long (kvm_emulate_wrteei_0_end - kvm_emulate_wrteei_0) / 4 +#endif /* CONFIG_BOOKE */ + +#ifdef CONFIG_PPC_BOOK3S_32 + .global kvm_emulate_mtsrin kvm_emulate_mtsrin: @@ -334,6 +340,8 @@ kvm_emulate_mtsrin_orig_ins_offs: kvm_emulate_mtsrin_len: .long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4 +#endif /* CONFIG_PPC_BOOK3S_32 */ + .balign 4 .global kvm_tmp kvm_tmp: -- cgit v1.2.3 From 1fdfa4c6af0cc1854b017f308af6bece94568bb6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 12 Sep 2019 16:40:37 +0900 Subject: powerpc: improve prom_init_check rule This slightly improves the prom_init_check rule. [1] Avoid needless check Currently, prom_init_check.sh is invoked every time you run 'make' even if you have changed nothing in prom_init.c. With this commit, the script is re-run only when prom_init.o is recompiled. [2] Beautify the build log Currently, the O= build shows the absolute path to the script: CALL /abs/path/to/source/of/linux/arch/powerpc/kernel/prom_init_check.sh With this commit, it is always a relative path to the timestamp file: PROMCHK arch/powerpc/kernel/prom_init_check Signed-off-by: Masahiro Yamada Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190912074037.13813-1-yamada.masahiro@socionext.com --- arch/powerpc/kernel/.gitignore | 1 + arch/powerpc/kernel/Makefile | 14 ++++++-------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kernel/.gitignore b/arch/powerpc/kernel/.gitignore index c5f676c3c224..67ebd3003c05 100644 --- a/arch/powerpc/kernel/.gitignore +++ b/arch/powerpc/kernel/.gitignore @@ -1 +1,2 @@ +prom_init_check vmlinux.lds diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index c6ae0e7914bc..66c54443187d 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -188,15 +188,13 @@ extra-$(CONFIG_ALTIVEC) += vector.o extra-$(CONFIG_PPC64) += entry_64.o extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_init.o -ifdef CONFIG_PPC_OF_BOOT_TRAMPOLINE -$(obj)/built-in.a: prom_init_check +extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_init_check -quiet_cmd_prom_init_check = CALL $< - cmd_prom_init_check = $(CONFIG_SHELL) $< "$(NM)" "$(obj)/prom_init.o" +quiet_cmd_prom_init_check = PROMCHK $@ + cmd_prom_init_check = $(CONFIG_SHELL) $< "$(NM)" $(obj)/prom_init.o; touch $@ -PHONY += prom_init_check -prom_init_check: $(src)/prom_init_check.sh $(obj)/prom_init.o - $(call cmd,prom_init_check) -endif +$(obj)/prom_init_check: $(src)/prom_init_check.sh $(obj)/prom_init.o FORCE + $(call if_changed,prom_init_check) +targets += prom_init_check clean-files := vmlinux.lds -- cgit v1.2.3 From ca986d7fa7e7f7b3f018f227b999f35e654fbb79 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:16:21 +0530 Subject: powerpc/fadump: move internal macros/definitions to a new header Though asm/fadump.h is meant to be used by other components dealing with FADump, it also has macros/definitions internal to FADump code. Move them to a new header file used within FADump code. This also makes way for refactoring platform specific FADump code. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821313134.5656.6597770626574392140.stgit@hbathini.in.ibm.com --- arch/powerpc/include/asm/fadump-internal.h | 86 ++++++++++++++++++++++++++++++ arch/powerpc/include/asm/fadump.h | 71 ------------------------ arch/powerpc/kernel/fadump.c | 1 + 3 files changed, 87 insertions(+), 71 deletions(-) create mode 100644 arch/powerpc/include/asm/fadump-internal.h diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h new file mode 100644 index 000000000000..071f377d352f --- /dev/null +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Firmware-Assisted Dump internal code. + * + * Copyright 2011, Mahesh Salgaonkar, IBM Corporation. + * Copyright 2019, Hari Bathini, IBM Corporation. + */ + +#ifndef _ASM_POWERPC_FADUMP_INTERNAL_H +#define _ASM_POWERPC_FADUMP_INTERNAL_H + +/* + * The RMA region will be saved for later dumping when kernel crashes. + * RMA is Real Mode Area, the first block of logical memory address owned + * by logical partition, containing the storage that may be accessed with + * translate off. + */ +#define RMA_START 0x0 +#define RMA_END (ppc64_rma_size) + +/* + * On some Power systems where RMO is 128MB, it still requires minimum of + * 256MB for kernel to boot successfully. When kdump infrastructure is + * configured to save vmcore over network, we run into OOM issue while + * loading modules related to network setup. Hence we need additional 64M + * of memory to avoid OOM issue. + */ +#define MIN_BOOT_MEM (((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \ + + (0x1UL << 26)) + +/* The upper limit percentage for user specified boot memory size (25%) */ +#define MAX_BOOT_MEM_RATIO 4 + +#define memblock_num_regions(memblock_type) (memblock.memblock_type.cnt) + +/* Alignment per CMA requirement. */ +#define FADUMP_CMA_ALIGNMENT (PAGE_SIZE << \ + max_t(unsigned long, MAX_ORDER - 1, \ + pageblock_order)) + +/* FAD commands */ +#define FADUMP_REGISTER 1 +#define FADUMP_UNREGISTER 2 +#define FADUMP_INVALIDATE 3 + +#define FADUMP_CRASH_INFO_MAGIC str_to_u64("FADMPINF") + +/* fadump crash info structure */ +struct fadump_crash_info_header { + u64 magic_number; + u64 elfcorehdr_addr; + u32 crashing_cpu; + struct pt_regs regs; + struct cpumask online_mask; +}; + +struct fad_crash_memory_ranges { + unsigned long long base; + unsigned long long size; +}; + +/* Firmware-assisted dump configuration details. */ +struct fw_dump { + unsigned long reserve_dump_area_start; + unsigned long reserve_dump_area_size; + /* cmd line option during boot */ + unsigned long reserve_bootvar; + + unsigned long cpu_state_data_size; + unsigned long hpte_region_size; + unsigned long boot_memory_size; + + unsigned long fadumphdr_addr; + unsigned long cpu_notes_buf; + unsigned long cpu_notes_buf_size; + + int ibm_configure_kernel_dump; + + unsigned long fadump_enabled:1; + unsigned long fadump_supported:1; + unsigned long dump_active:1; + unsigned long dump_registered:1; + unsigned long nocma:1; +}; + +#endif /* _ASM_POWERPC_FADUMP_INTERNAL_H */ diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 17d9b6acaf63..751794978427 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -11,34 +11,6 @@ #ifdef CONFIG_FA_DUMP -/* - * The RMA region will be saved for later dumping when kernel crashes. - * RMA is Real Mode Area, the first block of logical memory address owned - * by logical partition, containing the storage that may be accessed with - * translate off. - */ -#define RMA_START 0x0 -#define RMA_END (ppc64_rma_size) - -/* - * On some Power systems where RMO is 128MB, it still requires minimum of - * 256MB for kernel to boot successfully. When kdump infrastructure is - * configured to save vmcore over network, we run into OOM issue while - * loading modules related to network setup. Hence we need aditional 64M - * of memory to avoid OOM issue. - */ -#define MIN_BOOT_MEM (((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \ - + (0x1UL << 26)) - -/* The upper limit percentage for user specified boot memory size (25%) */ -#define MAX_BOOT_MEM_RATIO 4 - -#define memblock_num_regions(memblock_type) (memblock.memblock_type.cnt) - -/* Alignement per CMA requirement. */ -#define FADUMP_CMA_ALIGNMENT (PAGE_SIZE << \ - max_t(unsigned long, MAX_ORDER - 1, pageblock_order)) - /* Firmware provided dump sections */ #define FADUMP_CPU_STATE_DATA 0x0001 #define FADUMP_HPTE_REGION 0x0002 @@ -47,11 +19,6 @@ /* Dump request flag */ #define FADUMP_REQUEST_FLAG 0x00000001 -/* FAD commands */ -#define FADUMP_REGISTER 1 -#define FADUMP_UNREGISTER 2 -#define FADUMP_INVALIDATE 3 - /* Dump status flag */ #define FADUMP_ERROR_FLAG 0x2000 @@ -112,29 +79,6 @@ struct fadump_mem_struct { struct fadump_section rmr_region; }; -/* Firmware-assisted dump configuration details. */ -struct fw_dump { - unsigned long cpu_state_data_size; - unsigned long hpte_region_size; - unsigned long boot_memory_size; - unsigned long reserve_dump_area_start; - unsigned long reserve_dump_area_size; - /* cmd line option during boot */ - unsigned long reserve_bootvar; - - unsigned long fadumphdr_addr; - unsigned long cpu_notes_buf; - unsigned long cpu_notes_buf_size; - - int ibm_configure_kernel_dump; - - unsigned long fadump_enabled:1; - unsigned long fadump_supported:1; - unsigned long dump_active:1; - unsigned long dump_registered:1; - unsigned long nocma:1; -}; - /* * Copy the ascii values for first 8 characters from a string into u64 * variable at their respective indexes. @@ -153,7 +97,6 @@ static inline u64 str_to_u64(const char *str) #define STR_TO_HEX(x) str_to_u64(x) #define REG_ID(x) str_to_u64(x) -#define FADUMP_CRASH_INFO_MAGIC STR_TO_HEX("FADMPINF") #define REGSAVE_AREA_MAGIC STR_TO_HEX("REGSAVE") /* The firmware-assisted dump format. @@ -178,20 +121,6 @@ struct fadump_reg_entry { __be64 reg_value; }; -/* fadump crash info structure */ -struct fadump_crash_info_header { - u64 magic_number; - u64 elfcorehdr_addr; - u32 crashing_cpu; - struct pt_regs regs; - struct cpumask online_mask; -}; - -struct fad_crash_memory_ranges { - unsigned long long base; - unsigned long long size; -}; - extern int is_fadump_memory_area(u64 addr, ulong size); extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data); diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 4eab97292cc2..7c55044cf9d4 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -30,6 +30,7 @@ #include #include #include +#include #include static struct fw_dump fw_dump; -- cgit v1.2.3 From 961cf26a98648a294de45ea6f806dc84dfc91197 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:16:36 +0530 Subject: powerpc/fadump: add helper functions Add helper functions to setup & free CPU notes buffer and to find if a given memory area is contiguous. Also, use boolean as return type for the function that finds if boot memory area is contiguous. While at it, save the virtual address of CPU notes buffer instead of physical address as virtual address is used often. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821318971.5656.9281936950510635858.stgit@hbathini.in.ibm.com --- arch/powerpc/include/asm/fadump-internal.h | 2 +- arch/powerpc/kernel/fadump.c | 127 +++++++++++++++-------------- 2 files changed, 67 insertions(+), 62 deletions(-) diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index 071f377d352f..16e2781a3e48 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -71,7 +71,7 @@ struct fw_dump { unsigned long boot_memory_size; unsigned long fadumphdr_addr; - unsigned long cpu_notes_buf; + unsigned long cpu_notes_buf_vaddr; unsigned long cpu_notes_buf_size; int ibm_configure_kernel_dump; diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 7c55044cf9d4..e2b83a991303 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -201,64 +201,55 @@ int is_fadump_active(void) } /* - * Returns 1, if there are no holes in boot memory area, - * 0 otherwise. + * Returns true, if there are no holes in memory area between d_start to d_end, + * false otherwise. */ -static int is_boot_memory_area_contiguous(void) +static bool is_fadump_mem_area_contiguous(u64 d_start, u64 d_end) { struct memblock_region *reg; - unsigned long tstart, tend; - unsigned long start_pfn = PHYS_PFN(RMA_START); - unsigned long end_pfn = PHYS_PFN(RMA_START + fw_dump.boot_memory_size); - unsigned int ret = 0; + bool ret = false; + u64 start, end; for_each_memblock(memory, reg) { - tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); - tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); - if (tstart < tend) { - /* Memory hole from start_pfn to tstart */ - if (tstart > start_pfn) + start = max_t(u64, d_start, reg->base); + end = min_t(u64, d_end, (reg->base + reg->size)); + if (d_start < end) { + /* Memory hole from d_start to start */ + if (start > d_start) break; - if (tend == end_pfn) { - ret = 1; + if (end == d_end) { + ret = true; break; } - start_pfn = tend + 1; + d_start = end + 1; } } return ret; } +/* + * Returns true, if there are no holes in boot memory area, + * false otherwise. + */ +static bool is_boot_memory_area_contiguous(void) +{ + return is_fadump_mem_area_contiguous(0, fw_dump.boot_memory_size); +} + /* * Returns true, if there are no holes in reserved memory area, * false otherwise. */ static bool is_reserved_memory_area_contiguous(void) { - struct memblock_region *reg; - unsigned long start, end; - unsigned long d_start = fw_dump.reserve_dump_area_start; - unsigned long d_end = d_start + fw_dump.reserve_dump_area_size; + u64 d_start, d_end; - for_each_memblock(memory, reg) { - start = max(d_start, (unsigned long)reg->base); - end = min(d_end, (unsigned long)(reg->base + reg->size)); - if (d_start < end) { - /* Memory hole from d_start to start */ - if (start > d_start) - break; - - if (end == d_end) - return true; - - d_start = end + 1; - } - } - - return false; + d_start = fw_dump.reserve_dump_area_start; + d_end = d_start + fw_dump.reserve_dump_area_size; + return is_fadump_mem_area_contiguous(d_start, d_end); } /* Print firmware assisted dump configurations for debugging purpose. */ @@ -785,7 +776,7 @@ static void fadump_update_elfcore_header(char *bufp) phdr = (struct elf_phdr *)bufp; if (phdr->p_type == PT_NOTE) { - phdr->p_paddr = fw_dump.cpu_notes_buf; + phdr->p_paddr = __pa(fw_dump.cpu_notes_buf_vaddr); phdr->p_offset = phdr->p_paddr; phdr->p_filesz = fw_dump.cpu_notes_buf_size; phdr->p_memsz = fw_dump.cpu_notes_buf_size; @@ -793,7 +784,7 @@ static void fadump_update_elfcore_header(char *bufp) return; } -static void *fadump_cpu_notes_buf_alloc(unsigned long size) +static void *fadump_alloc_buffer(unsigned long size) { void *vaddr; struct page *page; @@ -811,7 +802,7 @@ static void *fadump_cpu_notes_buf_alloc(unsigned long size) return vaddr; } -static void fadump_cpu_notes_buf_free(unsigned long vaddr, unsigned long size) +static void fadump_free_buffer(unsigned long vaddr, unsigned long size) { struct page *page; unsigned long order, count, i; @@ -824,6 +815,36 @@ static void fadump_cpu_notes_buf_free(unsigned long vaddr, unsigned long size) __free_pages(page, order); } +static s32 fadump_setup_cpu_notes_buf(u32 num_cpus) +{ + /* Allocate buffer to hold cpu crash notes. */ + fw_dump.cpu_notes_buf_size = num_cpus * sizeof(note_buf_t); + fw_dump.cpu_notes_buf_size = PAGE_ALIGN(fw_dump.cpu_notes_buf_size); + fw_dump.cpu_notes_buf_vaddr = + (unsigned long)fadump_alloc_buffer(fw_dump.cpu_notes_buf_size); + if (!fw_dump.cpu_notes_buf_vaddr) { + pr_err("Failed to allocate %ld bytes for CPU notes buffer\n", + fw_dump.cpu_notes_buf_size); + return -ENOMEM; + } + + pr_debug("Allocated buffer for cpu notes of size %ld at 0x%lx\n", + fw_dump.cpu_notes_buf_size, + fw_dump.cpu_notes_buf_vaddr); + return 0; +} + +static void fadump_free_cpu_notes_buf(void) +{ + if (!fw_dump.cpu_notes_buf_vaddr) + return; + + fadump_free_buffer(fw_dump.cpu_notes_buf_vaddr, + fw_dump.cpu_notes_buf_size); + fw_dump.cpu_notes_buf_vaddr = 0; + fw_dump.cpu_notes_buf_size = 0; +} + /* * Read CPU state dump data and convert it into ELF notes. * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be @@ -870,19 +891,11 @@ static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm) vaddr += sizeof(u32); reg_entry = (struct fadump_reg_entry *)vaddr; - /* Allocate buffer to hold cpu crash notes. */ - fw_dump.cpu_notes_buf_size = num_cpus * sizeof(note_buf_t); - fw_dump.cpu_notes_buf_size = PAGE_ALIGN(fw_dump.cpu_notes_buf_size); - note_buf = fadump_cpu_notes_buf_alloc(fw_dump.cpu_notes_buf_size); - if (!note_buf) { - printk(KERN_ERR "Failed to allocate 0x%lx bytes for " - "cpu notes buffer\n", fw_dump.cpu_notes_buf_size); - return -ENOMEM; - } - fw_dump.cpu_notes_buf = __pa(note_buf); + rc = fadump_setup_cpu_notes_buf(num_cpus); + if (rc != 0) + return rc; - pr_debug("Allocated buffer for cpu notes of size %ld at %p\n", - (num_cpus * sizeof(note_buf_t)), note_buf); + note_buf = (u32 *)fw_dump.cpu_notes_buf_vaddr; if (fw_dump.fadumphdr_addr) fdh = __va(fw_dump.fadumphdr_addr); @@ -920,10 +933,7 @@ static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm) return 0; error_out: - fadump_cpu_notes_buf_free((unsigned long)__va(fw_dump.cpu_notes_buf), - fw_dump.cpu_notes_buf_size); - fw_dump.cpu_notes_buf = 0; - fw_dump.cpu_notes_buf_size = 0; + fadump_free_cpu_notes_buf(); return rc; } @@ -1470,13 +1480,8 @@ static void fadump_invalidate_release_mem(void) fw_dump.reserve_dump_area_size = get_fadump_area_size(); fadump_release_memory(reserved_area_start, reserved_area_end); - if (fw_dump.cpu_notes_buf) { - fadump_cpu_notes_buf_free( - (unsigned long)__va(fw_dump.cpu_notes_buf), - fw_dump.cpu_notes_buf_size); - fw_dump.cpu_notes_buf = 0; - fw_dump.cpu_notes_buf_size = 0; - } + fadump_free_cpu_notes_buf(); + /* Initialize the kernel dump memory structure for FAD registration. */ init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start); } -- cgit v1.2.3 From 7f0ad11d3fb948a0d7770bd38ae17a51413c3dac Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:16:52 +0530 Subject: powerpc/fadump: declare helper functions in internal header file Declare helper functions, that can be reused by multiple platforms, in the internal header file. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821320487.5656.2660730464212209984.stgit@hbathini.in.ibm.com --- arch/powerpc/include/asm/fadump-internal.h | 8 ++++++++ arch/powerpc/kernel/fadump.c | 16 ++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index 16e2781a3e48..e004c89570b0 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -83,4 +83,12 @@ struct fw_dump { unsigned long nocma:1; }; +/* Helper functions */ +s32 fadump_setup_cpu_notes_buf(u32 num_cpus); +void fadump_free_cpu_notes_buf(void); +u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs); +void fadump_update_elfcore_header(char *bufp); +bool is_fadump_boot_mem_contiguous(void); +bool is_fadump_reserved_mem_contiguous(void); + #endif /* _ASM_POWERPC_FADUMP_INTERNAL_H */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index e2b83a991303..eb0745e418db 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -234,7 +234,7 @@ static bool is_fadump_mem_area_contiguous(u64 d_start, u64 d_end) * Returns true, if there are no holes in boot memory area, * false otherwise. */ -static bool is_boot_memory_area_contiguous(void) +bool is_fadump_boot_mem_contiguous(void) { return is_fadump_mem_area_contiguous(0, fw_dump.boot_memory_size); } @@ -243,7 +243,7 @@ static bool is_boot_memory_area_contiguous(void) * Returns true, if there are no holes in reserved memory area, * false otherwise. */ -static bool is_reserved_memory_area_contiguous(void) +bool is_fadump_reserved_mem_contiguous(void) { u64 d_start, d_end; @@ -617,9 +617,9 @@ static int register_fw_dump(struct fadump_mem_struct *fdm) " dump. Hardware Error(%d).\n", rc); break; case -3: - if (!is_boot_memory_area_contiguous()) + if (!is_fadump_boot_mem_contiguous()) pr_err("Can't have holes in boot memory area while registering fadump\n"); - else if (!is_reserved_memory_area_contiguous()) + else if (!is_fadump_reserved_mem_contiguous()) pr_err("Can't have holes in reserved memory area while" " registering fadump\n"); @@ -749,7 +749,7 @@ fadump_read_registers(struct fadump_reg_entry *reg_entry, struct pt_regs *regs) return reg_entry; } -static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) +u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) { struct elf_prstatus prstatus; @@ -764,7 +764,7 @@ static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) return buf; } -static void fadump_update_elfcore_header(char *bufp) +void fadump_update_elfcore_header(char *bufp) { struct elfhdr *elf; struct elf_phdr *phdr; @@ -815,7 +815,7 @@ static void fadump_free_buffer(unsigned long vaddr, unsigned long size) __free_pages(page, order); } -static s32 fadump_setup_cpu_notes_buf(u32 num_cpus) +s32 fadump_setup_cpu_notes_buf(u32 num_cpus) { /* Allocate buffer to hold cpu crash notes. */ fw_dump.cpu_notes_buf_size = num_cpus * sizeof(note_buf_t); @@ -834,7 +834,7 @@ static s32 fadump_setup_cpu_notes_buf(u32 num_cpus) return 0; } -static void fadump_free_cpu_notes_buf(void) +void fadump_free_cpu_notes_buf(void) { if (!fw_dump.cpu_notes_buf_vaddr) return; -- cgit v1.2.3 From 8468d155450c89adbb400cdae42bd4095f3045a9 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:17:07 +0530 Subject: powerpc/fadump: Improve fadump documentation The figures depicting FADump's (Firmware-Assisted Dump) memory layout are missing some finer details like different memory regions and what they represent. Improve the documentation by updating those details. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821322070.5656.8194734198500730487.stgit@hbathini.in.ibm.com --- Documentation/powerpc/firmware-assisted-dump.rst | 61 +++++++++++++----------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/Documentation/powerpc/firmware-assisted-dump.rst b/Documentation/powerpc/firmware-assisted-dump.rst index 9ca12830a48e..563c021df539 100644 --- a/Documentation/powerpc/firmware-assisted-dump.rst +++ b/Documentation/powerpc/firmware-assisted-dump.rst @@ -76,8 +76,9 @@ as follows: there is crash data available from a previous boot. During the early boot OS will reserve rest of the memory above boot memory size effectively booting with restricted memory - size. This will make sure that the second kernel will not - touch any of the dump memory area. + size. This will make sure that this kernel (also, referred + to as second kernel or capture kernel) will not touch any + of the dump memory area. - User-space tools will read /proc/vmcore to obtain the contents of memory, which holds the previous crashed kernel dump in ELF @@ -128,42 +129,46 @@ space memory except the user pages that were present in CMA region:: o Memory Reservation during first kernel - Low memory Top of memory - 0 boot memory size | - | | |<--Reserved dump area -->| | - V V | Permanent Reservation | V - +-----------+----------/ /---+---+----+-----------+----+------+ - | | |CPU|HPTE| DUMP |ELF | | - +-----------+----------/ /---+---+----+-----------+----+------+ - | ^ - | | - \ / - ------------------------------------------- - Boot memory content gets transferred to - reserved area by firmware at the time of - crash + Low memory Top of memory + 0 boot memory size |<--Reserved dump area --->| | + | | | (Permanent Reservation) | | + V V | | V + +-----------+----------/ /---+---+----+--------+---+----+------+ + | | |CPU|HPTE| DUMP |HDR|ELF | | + +-----------+----------/ /---+---+----+--------+---+----+------+ + | ^ ^ + | | | + \ / | + ----------------------------------- FADump Header + Boot memory content gets transferred (meta area) + to reserved area by firmware at the + time of crash + Fig. 1 + o Memory Reservation during second kernel after crash - Low memory Top of memory - 0 boot memory size | - | |<------------- Reserved dump area ----------- -->| - V V V - +-----------+----------/ /---+---+----+-----------+----+------+ - | | |CPU|HPTE| DUMP |ELF | | - +-----------+----------/ /---+---+----+-----------+----+------+ + Low memory Top of memory + 0 boot memory size | + | |<----------- Crash preserved area --------------->| + V V |<-- Reserved dump area -->| V + +-----------+----------/ /---+---+----+--------+---+----+------+ + | | |CPU|HPTE| DUMP |HDR|ELF | | + +-----------+----------/ /---+---+----+--------+---+----+------+ | | V V Used by second /proc/vmcore kernel to boot Fig. 2 -Currently the dump will be copied from /proc/vmcore to a -a new file upon user intervention. The dump data available through -/proc/vmcore will be in ELF format. Hence the existing kdump -infrastructure (kdump scripts) to save the dump works fine with -minor modifications. +Currently the dump will be copied from /proc/vmcore to a new file upon +user intervention. The dump data available through /proc/vmcore will be +in ELF format. Hence the existing kdump infrastructure (kdump scripts) +to save the dump works fine with minor modifications. KDump scripts on +major Distro releases have already been modified to work seemlessly (no +user intervention in saving the dump) when FADump is used, instead of +KDump, as dump mechanism. The tools to examine the dump will be same as the ones used for kdump. -- cgit v1.2.3 From 72aa651795f0e9f48bfdb2b2dd0b3e6900351d2a Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:17:56 +0530 Subject: powerpc/fadump: use helper functions to reserve/release cpu notes buffer Use helper functions to simplify memory allocation, pinning down and freeing the memory used for CPU notes buffer. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821323555.5656.2486038022572739622.stgit@hbathini.in.ibm.com --- arch/powerpc/kernel/fadump.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index eb0745e418db..994fc09e9cbf 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -786,33 +786,24 @@ void fadump_update_elfcore_header(char *bufp) static void *fadump_alloc_buffer(unsigned long size) { - void *vaddr; + unsigned long count, i; struct page *page; - unsigned long order, count, i; + void *vaddr; - order = get_order(size); - vaddr = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, order); + vaddr = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO); if (!vaddr) return NULL; - count = 1 << order; + count = PAGE_ALIGN(size) / PAGE_SIZE; page = virt_to_page(vaddr); for (i = 0; i < count; i++) - SetPageReserved(page + i); + mark_page_reserved(page + i); return vaddr; } static void fadump_free_buffer(unsigned long vaddr, unsigned long size) { - struct page *page; - unsigned long order, count, i; - - order = get_order(size); - count = 1 << order; - page = virt_to_page(vaddr); - for (i = 0; i < count; i++) - ClearPageReserved(page + i); - __free_pages(page, order); + free_reserved_area((void *)vaddr, (void *)(vaddr + size), -1, NULL); } s32 fadump_setup_cpu_notes_buf(u32 num_cpus) -- cgit v1.2.3 From 0226e55275e569126882a7befe0b1a1c9bd270aa Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:18:14 +0530 Subject: powerpc/fadump: move rtas specific definitions to platform code Currently, FADump is only supported on pSeries but that is going to change soon with FADump support being added on PowerNV platform. So, move rtas specific definitions to platform code to allow FADump to have multiple platforms support. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821328494.5656.16219929140866195511.stgit@hbathini.in.ibm.com --- arch/powerpc/include/asm/fadump-internal.h | 20 ++++- arch/powerpc/include/asm/fadump.h | 116 +-------------------------- arch/powerpc/kernel/fadump.c | 96 +++++++++++----------- arch/powerpc/platforms/pseries/rtas-fadump.h | 100 +++++++++++++++++++++++ 4 files changed, 172 insertions(+), 160 deletions(-) create mode 100644 arch/powerpc/platforms/pseries/rtas-fadump.h diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index e004c89570b0..f8097510e03f 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -43,7 +43,25 @@ #define FADUMP_UNREGISTER 2 #define FADUMP_INVALIDATE 3 -#define FADUMP_CRASH_INFO_MAGIC str_to_u64("FADMPINF") +/* + * Copy the ascii values for first 8 characters from a string into u64 + * variable at their respective indexes. + * e.g. + * The string "FADMPINF" will be converted into 0x4641444d50494e46 + */ +static inline u64 fadump_str_to_u64(const char *str) +{ + u64 val = 0; + int i; + + for (i = 0; i < sizeof(val); i++) + val = (*str) ? (val << 8) | *str++ : val << 8; + return val; +} + +#define FADUMP_CPU_UNKNOWN (~((u32)0)) + +#define FADUMP_CRASH_INFO_MAGIC fadump_str_to_u64("FADMPINF") /* fadump crash info structure */ struct fadump_crash_info_header { diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 751794978427..fd5002b4c18c 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -6,121 +6,13 @@ * Author: Mahesh Salgaonkar */ -#ifndef __PPC64_FA_DUMP_H__ -#define __PPC64_FA_DUMP_H__ +#ifndef _ASM_POWERPC_FADUMP_H +#define _ASM_POWERPC_FADUMP_H #ifdef CONFIG_FA_DUMP -/* Firmware provided dump sections */ -#define FADUMP_CPU_STATE_DATA 0x0001 -#define FADUMP_HPTE_REGION 0x0002 -#define FADUMP_REAL_MODE_REGION 0x0011 - -/* Dump request flag */ -#define FADUMP_REQUEST_FLAG 0x00000001 - -/* Dump status flag */ -#define FADUMP_ERROR_FLAG 0x2000 - -#define FADUMP_CPU_ID_MASK ((1UL << 32) - 1) - -#define CPU_UNKNOWN (~((u32)0)) - -/* Utility macros */ -#define SKIP_TO_NEXT_CPU(reg_entry) \ -({ \ - while (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUEND")) \ - reg_entry++; \ - reg_entry++; \ -}) - extern int crashing_cpu; -/* Kernel Dump section info */ -struct fadump_section { - __be32 request_flag; - __be16 source_data_type; - __be16 error_flags; - __be64 source_address; - __be64 source_len; - __be64 bytes_dumped; - __be64 destination_address; -}; - -/* ibm,configure-kernel-dump header. */ -struct fadump_section_header { - __be32 dump_format_version; - __be16 dump_num_sections; - __be16 dump_status_flag; - __be32 offset_first_dump_section; - - /* Fields for disk dump option. */ - __be32 dd_block_size; - __be64 dd_block_offset; - __be64 dd_num_blocks; - __be32 dd_offset_disk_path; - - /* Maximum time allowed to prevent an automatic dump-reboot. */ - __be32 max_time_auto; -}; - -/* - * Firmware Assisted dump memory structure. This structure is required for - * registering future kernel dump with power firmware through rtas call. - * - * No disk dump option. Hence disk dump path string section is not included. - */ -struct fadump_mem_struct { - struct fadump_section_header header; - - /* Kernel dump sections */ - struct fadump_section cpu_state_data; - struct fadump_section hpte_region; - struct fadump_section rmr_region; -}; - -/* - * Copy the ascii values for first 8 characters from a string into u64 - * variable at their respective indexes. - * e.g. - * The string "FADMPINF" will be converted into 0x4641444d50494e46 - */ -static inline u64 str_to_u64(const char *str) -{ - u64 val = 0; - int i; - - for (i = 0; i < sizeof(val); i++) - val = (*str) ? (val << 8) | *str++ : val << 8; - return val; -} -#define STR_TO_HEX(x) str_to_u64(x) -#define REG_ID(x) str_to_u64(x) - -#define REGSAVE_AREA_MAGIC STR_TO_HEX("REGSAVE") - -/* The firmware-assisted dump format. - * - * The register save area is an area in the partition's memory used to preserve - * the register contents (CPU state data) for the active CPUs during a firmware - * assisted dump. The dump format contains register save area header followed - * by register entries. Each list of registers for a CPU starts with - * "CPUSTRT" and ends with "CPUEND". - */ - -/* Register save area header. */ -struct fadump_reg_save_area_header { - __be64 magic_number; - __be32 version; - __be32 num_cpu_offset; -}; - -/* Register entry. */ -struct fadump_reg_entry { - __be64 reg_id; - __be64 reg_value; -}; - extern int is_fadump_memory_area(u64 addr, ulong size); extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data); @@ -136,5 +28,5 @@ static inline int is_fadump_active(void) { return 0; } static inline int should_fadump_crash(void) { return 0; } static inline void crash_fadump(struct pt_regs *regs, const char *str) { } static inline void fadump_cleanup(void) { } -#endif -#endif +#endif /* !CONFIG_FA_DUMP */ +#endif /* _ASM_POWERPC_FADUMP_H */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 994fc09e9cbf..03f2708cd954 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -33,12 +33,11 @@ #include #include +#include "../platforms/pseries/rtas-fadump.h" + static struct fw_dump fw_dump; -static struct fadump_mem_struct fdm; -static const struct fadump_mem_struct *fdm_active; -#ifdef CONFIG_CMA -static struct cma *fadump_cma; -#endif +static struct rtas_fadump_mem_struct fdm; +static const struct rtas_fadump_mem_struct *fdm_active; static DEFINE_MUTEX(fadump_mutex); struct fad_crash_memory_ranges *crash_memory_ranges; @@ -47,6 +46,8 @@ int crash_mem_ranges; int max_crash_mem_ranges; #ifdef CONFIG_CMA +static struct cma *fadump_cma; + /* * fadump_cma_init() - Initialize CMA area from a fadump reserved memory * @@ -156,11 +157,11 @@ int __init early_init_dt_scan_fw_dump(unsigned long node, u32 type = (u32)of_read_number(sections, 1); switch (type) { - case FADUMP_CPU_STATE_DATA: + case RTAS_FADUMP_CPU_STATE_DATA: fw_dump.cpu_state_data_size = of_read_ulong(§ions[1], 2); break; - case FADUMP_HPTE_REGION: + case RTAS_FADUMP_HPTE_REGION: fw_dump.hpte_region_size = of_read_ulong(§ions[1], 2); break; @@ -271,20 +272,20 @@ static void fadump_show_config(void) pr_debug("Boot memory size : %lx\n", fw_dump.boot_memory_size); } -static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, +static unsigned long init_fadump_mem_struct(struct rtas_fadump_mem_struct *fdm, unsigned long addr) { if (!fdm) return 0; - memset(fdm, 0, sizeof(struct fadump_mem_struct)); + memset(fdm, 0, sizeof(struct rtas_fadump_mem_struct)); addr = addr & PAGE_MASK; fdm->header.dump_format_version = cpu_to_be32(0x00000001); fdm->header.dump_num_sections = cpu_to_be16(3); fdm->header.dump_status_flag = 0; fdm->header.offset_first_dump_section = - cpu_to_be32((u32)offsetof(struct fadump_mem_struct, cpu_state_data)); + cpu_to_be32((u32)offsetof(struct rtas_fadump_mem_struct, cpu_state_data)); /* * Fields for disk dump option. @@ -300,24 +301,24 @@ static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, /* Kernel dump sections */ /* cpu state data section. */ - fdm->cpu_state_data.request_flag = cpu_to_be32(FADUMP_REQUEST_FLAG); - fdm->cpu_state_data.source_data_type = cpu_to_be16(FADUMP_CPU_STATE_DATA); + fdm->cpu_state_data.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm->cpu_state_data.source_data_type = cpu_to_be16(RTAS_FADUMP_CPU_STATE_DATA); fdm->cpu_state_data.source_address = 0; fdm->cpu_state_data.source_len = cpu_to_be64(fw_dump.cpu_state_data_size); fdm->cpu_state_data.destination_address = cpu_to_be64(addr); addr += fw_dump.cpu_state_data_size; /* hpte region section */ - fdm->hpte_region.request_flag = cpu_to_be32(FADUMP_REQUEST_FLAG); - fdm->hpte_region.source_data_type = cpu_to_be16(FADUMP_HPTE_REGION); + fdm->hpte_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm->hpte_region.source_data_type = cpu_to_be16(RTAS_FADUMP_HPTE_REGION); fdm->hpte_region.source_address = 0; fdm->hpte_region.source_len = cpu_to_be64(fw_dump.hpte_region_size); fdm->hpte_region.destination_address = cpu_to_be64(addr); addr += fw_dump.hpte_region_size; /* RMA region section */ - fdm->rmr_region.request_flag = cpu_to_be32(FADUMP_REQUEST_FLAG); - fdm->rmr_region.source_data_type = cpu_to_be16(FADUMP_REAL_MODE_REGION); + fdm->rmr_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm->rmr_region.source_data_type = cpu_to_be16(RTAS_FADUMP_REAL_MODE_REGION); fdm->rmr_region.source_address = cpu_to_be64(RMA_START); fdm->rmr_region.source_len = cpu_to_be64(fw_dump.boot_memory_size); fdm->rmr_region.destination_address = cpu_to_be64(addr); @@ -588,7 +589,7 @@ static int __init early_fadump_reserve_mem(char *p) } early_param("fadump_reserve_mem", early_fadump_reserve_mem); -static int register_fw_dump(struct fadump_mem_struct *fdm) +static int register_fw_dump(struct rtas_fadump_mem_struct *fdm) { int rc, err; unsigned int wait_time; @@ -599,7 +600,7 @@ static int register_fw_dump(struct fadump_mem_struct *fdm) do { rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL, FADUMP_REGISTER, fdm, - sizeof(struct fadump_mem_struct)); + sizeof(struct rtas_fadump_mem_struct)); wait_time = rtas_busy_delay_time(rc); if (wait_time) @@ -695,7 +696,7 @@ static inline int fadump_gpr_index(u64 id) int i = -1; char str[3]; - if ((id & GPR_MASK) == REG_ID("GPR")) { + if ((id & GPR_MASK) == fadump_str_to_u64("GPR")) { /* get the digits at the end */ id &= ~GPR_MASK; id >>= 24; @@ -717,30 +718,30 @@ static inline void fadump_set_regval(struct pt_regs *regs, u64 reg_id, i = fadump_gpr_index(reg_id); if (i >= 0) regs->gpr[i] = (unsigned long)reg_val; - else if (reg_id == REG_ID("NIA")) + else if (reg_id == fadump_str_to_u64("NIA")) regs->nip = (unsigned long)reg_val; - else if (reg_id == REG_ID("MSR")) + else if (reg_id == fadump_str_to_u64("MSR")) regs->msr = (unsigned long)reg_val; - else if (reg_id == REG_ID("CTR")) + else if (reg_id == fadump_str_to_u64("CTR")) regs->ctr = (unsigned long)reg_val; - else if (reg_id == REG_ID("LR")) + else if (reg_id == fadump_str_to_u64("LR")) regs->link = (unsigned long)reg_val; - else if (reg_id == REG_ID("XER")) + else if (reg_id == fadump_str_to_u64("XER")) regs->xer = (unsigned long)reg_val; - else if (reg_id == REG_ID("CR")) + else if (reg_id == fadump_str_to_u64("CR")) regs->ccr = (unsigned long)reg_val; - else if (reg_id == REG_ID("DAR")) + else if (reg_id == fadump_str_to_u64("DAR")) regs->dar = (unsigned long)reg_val; - else if (reg_id == REG_ID("DSISR")) + else if (reg_id == fadump_str_to_u64("DSISR")) regs->dsisr = (unsigned long)reg_val; } -static struct fadump_reg_entry* -fadump_read_registers(struct fadump_reg_entry *reg_entry, struct pt_regs *regs) +static struct rtas_fadump_reg_entry* +fadump_read_registers(struct rtas_fadump_reg_entry *reg_entry, struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); - while (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUEND")) { + while (be64_to_cpu(reg_entry->reg_id) != fadump_str_to_u64("CPUEND")) { fadump_set_regval(regs, be64_to_cpu(reg_entry->reg_id), be64_to_cpu(reg_entry->reg_value)); reg_entry++; @@ -850,10 +851,10 @@ void fadump_free_cpu_notes_buf(void) * state from fadump crash info structure populated by first kernel at the * time of crash. */ -static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm) +static int __init fadump_build_cpu_notes(const struct rtas_fadump_mem_struct *fdm) { - struct fadump_reg_save_area_header *reg_header; - struct fadump_reg_entry *reg_entry; + struct rtas_fadump_reg_save_area_header *reg_header; + struct rtas_fadump_reg_entry *reg_entry; struct fadump_crash_info_header *fdh = NULL; void *vaddr; unsigned long addr; @@ -868,7 +869,8 @@ static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm) vaddr = __va(addr); reg_header = vaddr; - if (be64_to_cpu(reg_header->magic_number) != REGSAVE_AREA_MAGIC) { + if (be64_to_cpu(reg_header->magic_number) != + fadump_str_to_u64("REGSAVE")) { printk(KERN_ERR "Unable to read register save area.\n"); return -ENOENT; } @@ -880,7 +882,7 @@ static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm) num_cpus = be32_to_cpu(*((__be32 *)(vaddr))); pr_debug("NumCpus : %u\n", num_cpus); vaddr += sizeof(u32); - reg_entry = (struct fadump_reg_entry *)vaddr; + reg_entry = (struct rtas_fadump_reg_entry *)vaddr; rc = fadump_setup_cpu_notes_buf(num_cpus); if (rc != 0) @@ -892,22 +894,22 @@ static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm) fdh = __va(fw_dump.fadumphdr_addr); for (i = 0; i < num_cpus; i++) { - if (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUSTRT")) { + if (be64_to_cpu(reg_entry->reg_id) != fadump_str_to_u64("CPUSTRT")) { printk(KERN_ERR "Unable to read CPU state data\n"); rc = -ENOENT; goto error_out; } /* Lower 4 bytes of reg_value contains logical cpu id */ - cpu = be64_to_cpu(reg_entry->reg_value) & FADUMP_CPU_ID_MASK; + cpu = be64_to_cpu(reg_entry->reg_value) & RTAS_FADUMP_CPU_ID_MASK; if (fdh && !cpumask_test_cpu(cpu, &fdh->online_mask)) { - SKIP_TO_NEXT_CPU(reg_entry); + RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry); continue; } pr_debug("Reading register data for cpu %d...\n", cpu); if (fdh && fdh->crashing_cpu == cpu) { regs = fdh->regs; note_buf = fadump_regs_to_elf_notes(note_buf, ®s); - SKIP_TO_NEXT_CPU(reg_entry); + RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry); } else { reg_entry++; reg_entry = fadump_read_registers(reg_entry, ®s); @@ -933,7 +935,7 @@ error_out: * Validate and process the dump data stored by firmware before exporting * it through '/proc/vmcore'. */ -static int __init process_fadump(const struct fadump_mem_struct *fdm_active) +static int __init process_fadump(const struct rtas_fadump_mem_struct *fdm_active) { struct fadump_crash_info_header *fdh; int rc = 0; @@ -942,7 +944,7 @@ static int __init process_fadump(const struct fadump_mem_struct *fdm_active) return -EINVAL; /* Check if the dump data is valid. */ - if ((be16_to_cpu(fdm_active->header.dump_status_flag) == FADUMP_ERROR_FLAG) || + if ((be16_to_cpu(fdm_active->header.dump_status_flag) == RTAS_FADUMP_ERROR_FLAG) || (fdm_active->cpu_state_data.error_flags != 0) || (fdm_active->rmr_region.error_flags != 0)) { printk(KERN_ERR "Dump taken by platform is not valid\n"); @@ -1273,7 +1275,7 @@ static unsigned long init_fadump_header(unsigned long addr) fdh->magic_number = FADUMP_CRASH_INFO_MAGIC; fdh->elfcorehdr_addr = addr; /* We will set the crashing cpu id in crash_fadump() during crash. */ - fdh->crashing_cpu = CPU_UNKNOWN; + fdh->crashing_cpu = FADUMP_CPU_UNKNOWN; return addr; } @@ -1307,7 +1309,7 @@ static int register_fadump(void) return register_fw_dump(&fdm); } -static int fadump_unregister_dump(struct fadump_mem_struct *fdm) +static int fadump_unregister_dump(struct rtas_fadump_mem_struct *fdm) { int rc = 0; unsigned int wait_time; @@ -1318,7 +1320,7 @@ static int fadump_unregister_dump(struct fadump_mem_struct *fdm) do { rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL, FADUMP_UNREGISTER, fdm, - sizeof(struct fadump_mem_struct)); + sizeof(struct rtas_fadump_mem_struct)); wait_time = rtas_busy_delay_time(rc); if (wait_time) @@ -1334,7 +1336,7 @@ static int fadump_unregister_dump(struct fadump_mem_struct *fdm) return 0; } -static int fadump_invalidate_dump(const struct fadump_mem_struct *fdm) +static int fadump_invalidate_dump(const struct rtas_fadump_mem_struct *fdm) { int rc = 0; unsigned int wait_time; @@ -1345,7 +1347,7 @@ static int fadump_invalidate_dump(const struct fadump_mem_struct *fdm) do { rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL, FADUMP_INVALIDATE, fdm, - sizeof(struct fadump_mem_struct)); + sizeof(struct rtas_fadump_mem_struct)); wait_time = rtas_busy_delay_time(rc); if (wait_time) @@ -1561,7 +1563,7 @@ unlock_out: static int fadump_region_show(struct seq_file *m, void *private) { - const struct fadump_mem_struct *fdm_ptr; + const struct rtas_fadump_mem_struct *fdm_ptr; if (!fw_dump.fadump_enabled) return 0; diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.h b/arch/powerpc/platforms/pseries/rtas-fadump.h new file mode 100644 index 000000000000..531f3f3e42b3 --- /dev/null +++ b/arch/powerpc/platforms/pseries/rtas-fadump.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Firmware-Assisted Dump support on POWERVM platform. + * + * Copyright 2011, Mahesh Salgaonkar, IBM Corporation. + * Copyright 2019, Hari Bathini, IBM Corporation. + */ + +#ifndef _PSERIES_RTAS_FADUMP_H +#define _PSERIES_RTAS_FADUMP_H + +/* Firmware provided dump sections */ +#define RTAS_FADUMP_CPU_STATE_DATA 0x0001 +#define RTAS_FADUMP_HPTE_REGION 0x0002 +#define RTAS_FADUMP_REAL_MODE_REGION 0x0011 + +/* Dump request flag */ +#define RTAS_FADUMP_REQUEST_FLAG 0x00000001 + +/* Dump status flag */ +#define RTAS_FADUMP_ERROR_FLAG 0x2000 + +/* Kernel Dump section info */ +struct rtas_fadump_section { + __be32 request_flag; + __be16 source_data_type; + __be16 error_flags; + __be64 source_address; + __be64 source_len; + __be64 bytes_dumped; + __be64 destination_address; +}; + +/* ibm,configure-kernel-dump header. */ +struct rtas_fadump_section_header { + __be32 dump_format_version; + __be16 dump_num_sections; + __be16 dump_status_flag; + __be32 offset_first_dump_section; + + /* Fields for disk dump option. */ + __be32 dd_block_size; + __be64 dd_block_offset; + __be64 dd_num_blocks; + __be32 dd_offset_disk_path; + + /* Maximum time allowed to prevent an automatic dump-reboot. */ + __be32 max_time_auto; +}; + +/* + * Firmware Assisted dump memory structure. This structure is required for + * registering future kernel dump with power firmware through rtas call. + * + * No disk dump option. Hence disk dump path string section is not included. + */ +struct rtas_fadump_mem_struct { + struct rtas_fadump_section_header header; + + /* Kernel dump sections */ + struct rtas_fadump_section cpu_state_data; + struct rtas_fadump_section hpte_region; + struct rtas_fadump_section rmr_region; +}; + +/* + * The firmware-assisted dump format. + * + * The register save area is an area in the partition's memory used to preserve + * the register contents (CPU state data) for the active CPUs during a firmware + * assisted dump. The dump format contains register save area header followed + * by register entries. Each list of registers for a CPU starts with "CPUSTRT" + * and ends with "CPUEND". + */ + +/* Register save area header. */ +struct rtas_fadump_reg_save_area_header { + __be64 magic_number; + __be32 version; + __be32 num_cpu_offset; +}; + +/* Register entry. */ +struct rtas_fadump_reg_entry { + __be64 reg_id; + __be64 reg_value; +}; + +/* Utility macros */ +#define RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry) \ +({ \ + while (be64_to_cpu(reg_entry->reg_id) != \ + fadump_str_to_u64("CPUEND")) \ + reg_entry++; \ + reg_entry++; \ +}) + +#define RTAS_FADUMP_CPU_ID_MASK ((1UL << 32) - 1) + +#endif /* _PSERIES_RTAS_FADUMP_H */ -- cgit v1.2.3 From d3833a7010817f82bff373e26d146e6401c695f4 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:18:40 +0530 Subject: powerpc/fadump: introduce callbacks for platform specific operations Introduce callback functions for platform specific operations like register, unregister, invalidate & such. Also, define place-holders for the same on pSeries platform. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821330286.5656.15538934400074110770.stgit@hbathini.in.ibm.com --- arch/powerpc/include/asm/fadump-internal.h | 24 ++++++ arch/powerpc/kernel/fadump.c | 45 +--------- arch/powerpc/platforms/pseries/Makefile | 1 + arch/powerpc/platforms/pseries/rtas-fadump.c | 122 +++++++++++++++++++++++++++ 4 files changed, 148 insertions(+), 44 deletions(-) create mode 100644 arch/powerpc/platforms/pseries/rtas-fadump.c diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index f8097510e03f..8ecd23421b6c 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -77,6 +77,9 @@ struct fad_crash_memory_ranges { unsigned long long size; }; +/* Platform specific callback functions */ +struct fadump_ops; + /* Firmware-assisted dump configuration details. */ struct fw_dump { unsigned long reserve_dump_area_start; @@ -99,6 +102,20 @@ struct fw_dump { unsigned long dump_active:1; unsigned long dump_registered:1; unsigned long nocma:1; + + struct fadump_ops *ops; +}; + +struct fadump_ops { + u64 (*fadump_init_mem_struct)(struct fw_dump *fadump_conf); + int (*fadump_register)(struct fw_dump *fadump_conf); + int (*fadump_unregister)(struct fw_dump *fadump_conf); + int (*fadump_invalidate)(struct fw_dump *fadump_conf); + int (*fadump_process)(struct fw_dump *fadump_conf); + void (*fadump_region_show)(struct fw_dump *fadump_conf, + struct seq_file *m); + void (*fadump_trigger)(struct fadump_crash_info_header *fdh, + const char *msg); }; /* Helper functions */ @@ -109,4 +126,11 @@ void fadump_update_elfcore_header(char *bufp); bool is_fadump_boot_mem_contiguous(void); bool is_fadump_reserved_mem_contiguous(void); +#ifdef CONFIG_PPC_PSERIES +extern void rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node); +#else +static inline void +rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) { } +#endif + #endif /* _ASM_POWERPC_FADUMP_INTERNAL_H */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 03f2708cd954..aa342ee53acb 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -112,24 +112,10 @@ static int __init fadump_cma_init(void) { return 1; } int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data) { - const __be32 *sections; - int i, num_sections; - int size; - const __be32 *token; - if (depth != 1 || strcmp(uname, "rtas") != 0) return 0; - /* - * Check if Firmware Assisted dump is supported. if yes, check - * if dump has been initiated on last reboot. - */ - token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL); - if (!token) - return 1; - - fw_dump.fadump_supported = 1; - fw_dump.ibm_configure_kernel_dump = be32_to_cpu(*token); + rtas_fadump_dt_scan(&fw_dump, node); /* * The 'ibm,kernel-dump' rtas node is present only if there is @@ -139,35 +125,6 @@ int __init early_init_dt_scan_fw_dump(unsigned long node, if (fdm_active) fw_dump.dump_active = 1; - /* Get the sizes required to store dump data for the firmware provided - * dump sections. - * For each dump section type supported, a 32bit cell which defines - * the ID of a supported section followed by two 32 bit cells which - * gives teh size of the section in bytes. - */ - sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes", - &size); - - if (!sections) - return 1; - - num_sections = size / (3 * sizeof(u32)); - - for (i = 0; i < num_sections; i++, sections += 3) { - u32 type = (u32)of_read_number(sections, 1); - - switch (type) { - case RTAS_FADUMP_CPU_STATE_DATA: - fw_dump.cpu_state_data_size = - of_read_ulong(§ions[1], 2); - break; - case RTAS_FADUMP_HPTE_REGION: - fw_dump.hpte_region_size = - of_read_ulong(§ions[1], 2); - break; - } - } - return 1; } diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index a420ef4c9d8e..a3c74a5cf20d 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_IBMEBUS) += ibmebus.o obj-$(CONFIG_PAPR_SCM) += papr_scm.o obj-$(CONFIG_PPC_SPLPAR) += vphn.o obj-$(CONFIG_PPC_SVM) += svm.o +obj-$(CONFIG_FA_DUMP) += rtas-fadump.o ifdef CONFIG_PPC_PSERIES obj-$(CONFIG_SUSPEND) += suspend.o diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c new file mode 100644 index 000000000000..9efe7b1ac9cc --- /dev/null +++ b/arch/powerpc/platforms/pseries/rtas-fadump.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Firmware-Assisted Dump support on POWERVM platform. + * + * Copyright 2011, Mahesh Salgaonkar, IBM Corporation. + * Copyright 2019, Hari Bathini, IBM Corporation. + */ + +#define pr_fmt(fmt) "rtas fadump: " fmt + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "rtas-fadump.h" + +static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf) +{ + return fadump_conf->reserve_dump_area_start; +} + +static int rtas_fadump_register(struct fw_dump *fadump_conf) +{ + return -EIO; +} + +static int rtas_fadump_unregister(struct fw_dump *fadump_conf) +{ + return -EIO; +} + +static int rtas_fadump_invalidate(struct fw_dump *fadump_conf) +{ + return -EIO; +} + +/* + * Validate and process the dump data stored by firmware before exporting + * it through '/proc/vmcore'. + */ +static int __init rtas_fadump_process(struct fw_dump *fadump_conf) +{ + return -EINVAL; +} + +static void rtas_fadump_region_show(struct fw_dump *fadump_conf, + struct seq_file *m) +{ +} + +static void rtas_fadump_trigger(struct fadump_crash_info_header *fdh, + const char *msg) +{ + /* Call ibm,os-term rtas call to trigger firmware assisted dump */ + rtas_os_term((char *)msg); +} + +static struct fadump_ops rtas_fadump_ops = { + .fadump_init_mem_struct = rtas_fadump_init_mem_struct, + .fadump_register = rtas_fadump_register, + .fadump_unregister = rtas_fadump_unregister, + .fadump_invalidate = rtas_fadump_invalidate, + .fadump_process = rtas_fadump_process, + .fadump_region_show = rtas_fadump_region_show, + .fadump_trigger = rtas_fadump_trigger, +}; + +void __init rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) +{ + int i, size, num_sections; + const __be32 *sections; + const __be32 *token; + + /* + * Check if Firmware Assisted dump is supported. if yes, check + * if dump has been initiated on last reboot. + */ + token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL); + if (!token) + return; + + fadump_conf->ibm_configure_kernel_dump = be32_to_cpu(*token); + fadump_conf->ops = &rtas_fadump_ops; + fadump_conf->fadump_supported = 1; + + /* Get the sizes required to store dump data for the firmware provided + * dump sections. + * For each dump section type supported, a 32bit cell which defines + * the ID of a supported section followed by two 32 bit cells which + * gives the size of the section in bytes. + */ + sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes", + &size); + + if (!sections) + return; + + num_sections = size / (3 * sizeof(u32)); + + for (i = 0; i < num_sections; i++, sections += 3) { + u32 type = (u32)of_read_number(sections, 1); + + switch (type) { + case RTAS_FADUMP_CPU_STATE_DATA: + fadump_conf->cpu_state_data_size = + of_read_ulong(§ions[1], 2); + break; + case RTAS_FADUMP_HPTE_REGION: + fadump_conf->hpte_region_size = + of_read_ulong(§ions[1], 2); + break; + } + } +} -- cgit v1.2.3 From 41a65d1618238e63be1439871eaf44dc3c6a737c Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:18:57 +0530 Subject: pseries/fadump: define RTAS register/un-register callback functions Move platform specific register/un-register code, the RTAS calls, to register/un-register callback functions. This would also mean moving code that initializes and prints the platform specific FADump data. Signed-off-by: Hari Bathini Reviewed-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821332856.5656.16380417702046411631.stgit@hbathini.in.ibm.com --- arch/powerpc/include/asm/fadump-internal.h | 2 + arch/powerpc/kernel/fadump.c | 165 +++------------------------ arch/powerpc/platforms/pseries/rtas-fadump.c | 162 +++++++++++++++++++++++++- 3 files changed, 176 insertions(+), 153 deletions(-) diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index 8ecd23421b6c..8c0eb0fc09d0 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -89,7 +89,9 @@ struct fw_dump { unsigned long cpu_state_data_size; unsigned long hpte_region_size; + unsigned long boot_memory_size; + u64 boot_mem_dest_addr; unsigned long fadumphdr_addr; unsigned long cpu_notes_buf_vaddr; diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index aa342ee53acb..3cf621080427 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -36,7 +36,6 @@ #include "../platforms/pseries/rtas-fadump.h" static struct fw_dump fw_dump; -static struct rtas_fadump_mem_struct fdm; static const struct rtas_fadump_mem_struct *fdm_active; static DEFINE_MUTEX(fadump_mutex); @@ -229,61 +228,6 @@ static void fadump_show_config(void) pr_debug("Boot memory size : %lx\n", fw_dump.boot_memory_size); } -static unsigned long init_fadump_mem_struct(struct rtas_fadump_mem_struct *fdm, - unsigned long addr) -{ - if (!fdm) - return 0; - - memset(fdm, 0, sizeof(struct rtas_fadump_mem_struct)); - addr = addr & PAGE_MASK; - - fdm->header.dump_format_version = cpu_to_be32(0x00000001); - fdm->header.dump_num_sections = cpu_to_be16(3); - fdm->header.dump_status_flag = 0; - fdm->header.offset_first_dump_section = - cpu_to_be32((u32)offsetof(struct rtas_fadump_mem_struct, cpu_state_data)); - - /* - * Fields for disk dump option. - * We are not using disk dump option, hence set these fields to 0. - */ - fdm->header.dd_block_size = 0; - fdm->header.dd_block_offset = 0; - fdm->header.dd_num_blocks = 0; - fdm->header.dd_offset_disk_path = 0; - - /* set 0 to disable an automatic dump-reboot. */ - fdm->header.max_time_auto = 0; - - /* Kernel dump sections */ - /* cpu state data section. */ - fdm->cpu_state_data.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); - fdm->cpu_state_data.source_data_type = cpu_to_be16(RTAS_FADUMP_CPU_STATE_DATA); - fdm->cpu_state_data.source_address = 0; - fdm->cpu_state_data.source_len = cpu_to_be64(fw_dump.cpu_state_data_size); - fdm->cpu_state_data.destination_address = cpu_to_be64(addr); - addr += fw_dump.cpu_state_data_size; - - /* hpte region section */ - fdm->hpte_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); - fdm->hpte_region.source_data_type = cpu_to_be16(RTAS_FADUMP_HPTE_REGION); - fdm->hpte_region.source_address = 0; - fdm->hpte_region.source_len = cpu_to_be64(fw_dump.hpte_region_size); - fdm->hpte_region.destination_address = cpu_to_be64(addr); - addr += fw_dump.hpte_region_size; - - /* RMA region section */ - fdm->rmr_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); - fdm->rmr_region.source_data_type = cpu_to_be16(RTAS_FADUMP_REAL_MODE_REGION); - fdm->rmr_region.source_address = cpu_to_be64(RMA_START); - fdm->rmr_region.source_len = cpu_to_be64(fw_dump.boot_memory_size); - fdm->rmr_region.destination_address = cpu_to_be64(addr); - addr += fw_dump.boot_memory_size; - - return addr; -} - /** * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM * @@ -546,61 +490,6 @@ static int __init early_fadump_reserve_mem(char *p) } early_param("fadump_reserve_mem", early_fadump_reserve_mem); -static int register_fw_dump(struct rtas_fadump_mem_struct *fdm) -{ - int rc, err; - unsigned int wait_time; - - pr_debug("Registering for firmware-assisted kernel dump...\n"); - - /* TODO: Add upper time limit for the delay */ - do { - rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL, - FADUMP_REGISTER, fdm, - sizeof(struct rtas_fadump_mem_struct)); - - wait_time = rtas_busy_delay_time(rc); - if (wait_time) - mdelay(wait_time); - - } while (wait_time); - - err = -EIO; - switch (rc) { - default: - pr_err("Failed to register. Unknown Error(%d).\n", rc); - break; - case -1: - printk(KERN_ERR "Failed to register firmware-assisted kernel" - " dump. Hardware Error(%d).\n", rc); - break; - case -3: - if (!is_fadump_boot_mem_contiguous()) - pr_err("Can't have holes in boot memory area while registering fadump\n"); - else if (!is_fadump_reserved_mem_contiguous()) - pr_err("Can't have holes in reserved memory area while" - " registering fadump\n"); - - printk(KERN_ERR "Failed to register firmware-assisted kernel" - " dump. Parameter Error(%d).\n", rc); - err = -EINVAL; - break; - case -9: - printk(KERN_ERR "firmware-assisted kernel dump is already " - " registered."); - fw_dump.dump_registered = 1; - err = -EEXIST; - break; - case 0: - printk(KERN_INFO "firmware-assisted kernel dump registration" - " is successful\n"); - fw_dump.dump_registered = 1; - err = 0; - break; - } - return err; -} - void crash_fadump(struct pt_regs *regs, const char *str) { struct fadump_crash_info_header *fdh = NULL; @@ -643,8 +532,7 @@ void crash_fadump(struct pt_regs *regs, const char *str) fdh->online_mask = *cpu_online_mask; - /* Call ibm,os-term rtas call to trigger firmware assisted dump */ - rtas_os_term((char *)str); + fw_dump.ops->fadump_trigger(fdh, str); } #define GPR_MASK 0xffffff0000000000 @@ -1129,7 +1017,7 @@ static int fadump_setup_crash_memory_ranges(void) static inline unsigned long fadump_relocate(unsigned long paddr) { if (paddr > RMA_START && paddr < fw_dump.boot_memory_size) - return be64_to_cpu(fdm.rmr_region.destination_address) + paddr; + return fw_dump.boot_mem_dest_addr + paddr; else return paddr; } @@ -1202,7 +1090,7 @@ static int fadump_create_elfcore_headers(char *bufp) * to the specified destination_address. Hence set * the correct offset. */ - phdr->p_offset = be64_to_cpu(fdm.rmr_region.destination_address); + phdr->p_offset = fw_dump.boot_mem_dest_addr; } phdr->p_paddr = mbase; @@ -1254,7 +1142,8 @@ static int register_fadump(void) if (ret) return ret; - addr = be64_to_cpu(fdm.rmr_region.destination_address) + be64_to_cpu(fdm.rmr_region.source_len); + addr = fw_dump.fadumphdr_addr; + /* Initialize fadump crash info header. */ addr = init_fadump_header(addr); vaddr = __va(addr); @@ -1263,34 +1152,8 @@ static int register_fadump(void) fadump_create_elfcore_headers(vaddr); /* register the future kernel dump with firmware. */ - return register_fw_dump(&fdm); -} - -static int fadump_unregister_dump(struct rtas_fadump_mem_struct *fdm) -{ - int rc = 0; - unsigned int wait_time; - - pr_debug("Un-register firmware-assisted dump\n"); - - /* TODO: Add upper time limit for the delay */ - do { - rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL, - FADUMP_UNREGISTER, fdm, - sizeof(struct rtas_fadump_mem_struct)); - - wait_time = rtas_busy_delay_time(rc); - if (wait_time) - mdelay(wait_time); - } while (wait_time); - - if (rc) { - printk(KERN_ERR "Failed to un-register firmware-assisted dump." - " unexpected error(%d).\n", rc); - return rc; - } - fw_dump.dump_registered = 0; - return 0; + pr_debug("Registering for firmware-assisted kernel dump...\n"); + return fw_dump.ops->fadump_register(&fw_dump); } static int fadump_invalidate_dump(const struct rtas_fadump_mem_struct *fdm) @@ -1328,7 +1191,7 @@ void fadump_cleanup(void) fadump_invalidate_dump(fdm_active); } else if (fw_dump.dump_registered) { /* Un-register Firmware-assisted dump if it was registered. */ - fadump_unregister_dump(&fdm); + fw_dump.ops->fadump_unregister(&fw_dump); free_crash_memory_ranges(); } } @@ -1433,7 +1296,7 @@ static void fadump_invalidate_release_mem(void) fadump_free_cpu_notes_buf(); /* Initialize the kernel dump memory structure for FAD registration. */ - init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start); + fw_dump.ops->fadump_init_mem_struct(&fw_dump); } static ssize_t fadump_release_memory_store(struct kobject *kobj, @@ -1498,12 +1361,13 @@ static ssize_t fadump_register_store(struct kobject *kobj, goto unlock_out; } /* Un-register Firmware-assisted dump */ - fadump_unregister_dump(&fdm); + pr_debug("Un-register firmware-assisted dump\n"); + fw_dump.ops->fadump_unregister(&fw_dump); break; case 1: if (fw_dump.dump_registered == 1) { /* Un-register Firmware-assisted dump */ - fadump_unregister_dump(&fdm); + fw_dump.ops->fadump_unregister(&fw_dump); } /* Register Firmware-assisted dump */ ret = register_fadump(); @@ -1530,7 +1394,8 @@ static int fadump_region_show(struct seq_file *m, void *private) fdm_ptr = fdm_active; else { mutex_unlock(&fadump_mutex); - fdm_ptr = &fdm; + fw_dump.ops->fadump_region_show(&fw_dump, m); + return 0; } seq_printf(m, @@ -1651,7 +1516,7 @@ int __init setup_fadump(void) } /* Initialize the kernel dump memory structure for FAD registration. */ else if (fw_dump.reserve_dump_area_size) - init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start); + fw_dump.ops->fadump_init_mem_struct(&fw_dump); fadump_init_files(); return 1; diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c index 9efe7b1ac9cc..1daa29fddcaa 100644 --- a/arch/powerpc/platforms/pseries/rtas-fadump.c +++ b/arch/powerpc/platforms/pseries/rtas-fadump.c @@ -22,19 +22,151 @@ #include "rtas-fadump.h" +static struct rtas_fadump_mem_struct fdm; + +static void rtas_fadump_update_config(struct fw_dump *fadump_conf, + const struct rtas_fadump_mem_struct *fdm) +{ + fadump_conf->boot_mem_dest_addr = + be64_to_cpu(fdm->rmr_region.destination_address); + + fadump_conf->fadumphdr_addr = (fadump_conf->boot_mem_dest_addr + + fadump_conf->boot_memory_size); +} + static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf) { - return fadump_conf->reserve_dump_area_start; + u64 addr = fadump_conf->reserve_dump_area_start; + + memset(&fdm, 0, sizeof(struct rtas_fadump_mem_struct)); + addr = addr & PAGE_MASK; + + fdm.header.dump_format_version = cpu_to_be32(0x00000001); + fdm.header.dump_num_sections = cpu_to_be16(3); + fdm.header.dump_status_flag = 0; + fdm.header.offset_first_dump_section = + cpu_to_be32((u32)offsetof(struct rtas_fadump_mem_struct, + cpu_state_data)); + + /* + * Fields for disk dump option. + * We are not using disk dump option, hence set these fields to 0. + */ + fdm.header.dd_block_size = 0; + fdm.header.dd_block_offset = 0; + fdm.header.dd_num_blocks = 0; + fdm.header.dd_offset_disk_path = 0; + + /* set 0 to disable an automatic dump-reboot. */ + fdm.header.max_time_auto = 0; + + /* Kernel dump sections */ + /* cpu state data section. */ + fdm.cpu_state_data.request_flag = + cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm.cpu_state_data.source_data_type = + cpu_to_be16(RTAS_FADUMP_CPU_STATE_DATA); + fdm.cpu_state_data.source_address = 0; + fdm.cpu_state_data.source_len = + cpu_to_be64(fadump_conf->cpu_state_data_size); + fdm.cpu_state_data.destination_address = cpu_to_be64(addr); + addr += fadump_conf->cpu_state_data_size; + + /* hpte region section */ + fdm.hpte_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm.hpte_region.source_data_type = + cpu_to_be16(RTAS_FADUMP_HPTE_REGION); + fdm.hpte_region.source_address = 0; + fdm.hpte_region.source_len = + cpu_to_be64(fadump_conf->hpte_region_size); + fdm.hpte_region.destination_address = cpu_to_be64(addr); + addr += fadump_conf->hpte_region_size; + + /* RMA region section */ + fdm.rmr_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm.rmr_region.source_data_type = + cpu_to_be16(RTAS_FADUMP_REAL_MODE_REGION); + fdm.rmr_region.source_address = cpu_to_be64(RMA_START); + fdm.rmr_region.source_len = cpu_to_be64(fadump_conf->boot_memory_size); + fdm.rmr_region.destination_address = cpu_to_be64(addr); + addr += fadump_conf->boot_memory_size; + + rtas_fadump_update_config(fadump_conf, &fdm); + + return addr; } static int rtas_fadump_register(struct fw_dump *fadump_conf) { - return -EIO; + unsigned int wait_time; + int rc, err = -EIO; + + /* TODO: Add upper time limit for the delay */ + do { + rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1, + NULL, FADUMP_REGISTER, &fdm, + sizeof(struct rtas_fadump_mem_struct)); + + wait_time = rtas_busy_delay_time(rc); + if (wait_time) + mdelay(wait_time); + + } while (wait_time); + + switch (rc) { + case 0: + pr_info("Registration is successful!\n"); + fadump_conf->dump_registered = 1; + err = 0; + break; + case -1: + pr_err("Failed to register. Hardware Error(%d).\n", rc); + break; + case -3: + if (!is_fadump_boot_mem_contiguous()) + pr_err("Can't have holes in boot memory area.\n"); + else if (!is_fadump_reserved_mem_contiguous()) + pr_err("Can't have holes in reserved memory area.\n"); + + pr_err("Failed to register. Parameter Error(%d).\n", rc); + err = -EINVAL; + break; + case -9: + pr_err("Already registered!\n"); + fadump_conf->dump_registered = 1; + err = -EEXIST; + break; + default: + pr_err("Failed to register. Unknown Error(%d).\n", rc); + break; + } + + return err; } static int rtas_fadump_unregister(struct fw_dump *fadump_conf) { - return -EIO; + unsigned int wait_time; + int rc; + + /* TODO: Add upper time limit for the delay */ + do { + rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1, + NULL, FADUMP_UNREGISTER, &fdm, + sizeof(struct rtas_fadump_mem_struct)); + + wait_time = rtas_busy_delay_time(rc); + if (wait_time) + mdelay(wait_time); + } while (wait_time); + + if (rc) { + pr_err("Failed to un-register - unexpected error(%d).\n", rc); + return -EIO; + } + + fadump_conf->dump_registered = 0; + return 0; } static int rtas_fadump_invalidate(struct fw_dump *fadump_conf) @@ -54,6 +186,30 @@ static int __init rtas_fadump_process(struct fw_dump *fadump_conf) static void rtas_fadump_region_show(struct fw_dump *fadump_conf, struct seq_file *m) { + const struct rtas_fadump_mem_struct *fdm_ptr = &fdm; + const struct rtas_fadump_section *cpu_data_section; + + cpu_data_section = &(fdm_ptr->cpu_state_data); + seq_printf(m, "CPU :[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n", + be64_to_cpu(cpu_data_section->destination_address), + be64_to_cpu(cpu_data_section->destination_address) + + be64_to_cpu(cpu_data_section->source_len) - 1, + be64_to_cpu(cpu_data_section->source_len), + be64_to_cpu(cpu_data_section->bytes_dumped)); + + seq_printf(m, "HPTE:[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n", + be64_to_cpu(fdm_ptr->hpte_region.destination_address), + be64_to_cpu(fdm_ptr->hpte_region.destination_address) + + be64_to_cpu(fdm_ptr->hpte_region.source_len) - 1, + be64_to_cpu(fdm_ptr->hpte_region.source_len), + be64_to_cpu(fdm_ptr->hpte_region.bytes_dumped)); + + seq_printf(m, "DUMP: [%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n", + be64_to_cpu(fdm_ptr->rmr_region.destination_address), + be64_to_cpu(fdm_ptr->rmr_region.destination_address) + + be64_to_cpu(fdm_ptr->rmr_region.source_len) - 1, + be64_to_cpu(fdm_ptr->rmr_region.source_len), + be64_to_cpu(fdm_ptr->rmr_region.bytes_dumped)); } static void rtas_fadump_trigger(struct fadump_crash_info_header *fdh, -- cgit v1.2.3 From 109f25cc5fae5f8fb403d925f7b3731f6f18218a Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:19:12 +0530 Subject: powerpc/fadump: add source info while displaying region contents Improve how fadump_region contents are displayed by adding source information of memory regions that are to be dumped by f/w. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821334740.5656.5897097884010195405.stgit@hbathini.in.ibm.com --- arch/powerpc/platforms/pseries/rtas-fadump.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c index 1daa29fddcaa..a11e3c2458d2 100644 --- a/arch/powerpc/platforms/pseries/rtas-fadump.c +++ b/arch/powerpc/platforms/pseries/rtas-fadump.c @@ -204,10 +204,10 @@ static void rtas_fadump_region_show(struct fw_dump *fadump_conf, be64_to_cpu(fdm_ptr->hpte_region.source_len), be64_to_cpu(fdm_ptr->hpte_region.bytes_dumped)); - seq_printf(m, "DUMP: [%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n", - be64_to_cpu(fdm_ptr->rmr_region.destination_address), - be64_to_cpu(fdm_ptr->rmr_region.destination_address) + - be64_to_cpu(fdm_ptr->rmr_region.source_len) - 1, + seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ", + be64_to_cpu(fdm_ptr->rmr_region.source_address), + be64_to_cpu(fdm_ptr->rmr_region.destination_address)); + seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n", be64_to_cpu(fdm_ptr->rmr_region.source_len), be64_to_cpu(fdm_ptr->rmr_region.bytes_dumped)); } -- cgit v1.2.3 From 8255da95e54519bb74638c2448ac17f4b34fe6f5 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:19:28 +0530 Subject: powerpc/fadump: release all the memory above boot memory size Except for Reserved dump area (see Documentation/powerpc/firmware- assisted-dump.rst) which is permanent reserved, all memory above boot memory size, where boot memory size is the memory required for the kernel to boot successfully when booted with restricted memory (memory for capture kernel), is released when the dump is invalidated. Make this a bit more explicit in the code. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821336092.5656.1079046285366041687.stgit@hbathini.in.ibm.com --- arch/powerpc/kernel/fadump.c | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 3cf621080427..bff909151b44 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -391,6 +391,8 @@ int __init fadump_reserve_mem(void) else memory_boundary = memblock_end_of_DRAM(); + size = get_fadump_area_size(); + fw_dump.reserve_dump_area_size = size; if (fw_dump.dump_active) { pr_info("Firmware-assisted dump is active.\n"); @@ -416,11 +418,14 @@ int __init fadump_reserve_mem(void) be64_to_cpu(fdm_active->rmr_region.destination_address) + be64_to_cpu(fdm_active->rmr_region.source_len); pr_debug("fadumphdr_addr = %pa\n", &fw_dump.fadumphdr_addr); - fw_dump.reserve_dump_area_start = base; - fw_dump.reserve_dump_area_size = size; - } else { - size = get_fadump_area_size(); + /* + * Start address of reserve dump area (permanent reservation) + * for re-registering FADump after dump capture. + */ + fw_dump.reserve_dump_area_start = + be64_to_cpu(fdm_active->cpu_state_data.destination_address); + } else { /* * Reserve memory at an offset closer to bottom of the RAM to * minimize the impact of memory hot-remove operation. We can't @@ -447,7 +452,6 @@ int __init fadump_reserve_mem(void) (unsigned long)(memblock_phys_mem_size() >> 20)); fw_dump.reserve_dump_area_start = base; - fw_dump.reserve_dump_area_size = size; return fadump_cma_init(); } return 1; @@ -1265,34 +1269,16 @@ static void fadump_release_memory(unsigned long begin, unsigned long end) static void fadump_invalidate_release_mem(void) { - unsigned long reserved_area_start, reserved_area_end; - unsigned long destination_address; - mutex_lock(&fadump_mutex); if (!fw_dump.dump_active) { mutex_unlock(&fadump_mutex); return; } - destination_address = be64_to_cpu(fdm_active->cpu_state_data.destination_address); fadump_cleanup(); mutex_unlock(&fadump_mutex); - /* - * Save the current reserved memory bounds we will require them - * later for releasing the memory for general use. - */ - reserved_area_start = fw_dump.reserve_dump_area_start; - reserved_area_end = reserved_area_start + - fw_dump.reserve_dump_area_size; - /* - * Setup reserve_dump_area_start and its size so that we can - * reuse this reserved memory for Re-registration. - */ - fw_dump.reserve_dump_area_start = destination_address; - fw_dump.reserve_dump_area_size = get_fadump_area_size(); - - fadump_release_memory(reserved_area_start, reserved_area_end); + fadump_release_memory(fw_dump.boot_memory_size, memblock_end_of_DRAM()); fadump_free_cpu_notes_buf(); /* Initialize the kernel dump memory structure for FAD registration. */ -- cgit v1.2.3 From f35120115b767c49ad8de56dd78c86540a14df5b Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:19:44 +0530 Subject: pseries/fadump: move out platform specific support from generic code Move code that supports processing the crash'ed kernel's memory preserved by firmware to platform specific callback functions. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821337690.5656.13050665924800177744.stgit@hbathini.in.ibm.com --- arch/powerpc/kernel/fadump.c | 329 ++------------------------- arch/powerpc/platforms/pseries/rtas-fadump.c | 263 ++++++++++++++++++++- 2 files changed, 274 insertions(+), 318 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index bff909151b44..9d9f7c384a71 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -28,15 +28,11 @@ #include #include #include -#include #include #include #include -#include "../platforms/pseries/rtas-fadump.h" - static struct fw_dump fw_dump; -static const struct rtas_fadump_mem_struct *fdm_active; static DEFINE_MUTEX(fadump_mutex); struct fad_crash_memory_ranges *crash_memory_ranges; @@ -108,22 +104,13 @@ static int __init fadump_cma_init(void) { return 1; } #endif /* CONFIG_CMA */ /* Scan the Firmware Assisted dump configuration details. */ -int __init early_init_dt_scan_fw_dump(unsigned long node, - const char *uname, int depth, void *data) +int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname, + int depth, void *data) { if (depth != 1 || strcmp(uname, "rtas") != 0) return 0; rtas_fadump_dt_scan(&fw_dump, node); - - /* - * The 'ibm,kernel-dump' rtas node is present only if there is - * dump data waiting for us. - */ - fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL); - if (fdm_active) - fw_dump.dump_active = 1; - return 1; } @@ -358,9 +345,7 @@ int __init fadump_reserve_mem(void) * If dump is active then we have already calculated the size during * first kernel. */ - if (fdm_active) - fw_dump.boot_memory_size = be64_to_cpu(fdm_active->rmr_region.source_len); - else { + if (!fw_dump.dump_active) { fw_dump.boot_memory_size = fadump_calculate_reserve_size(); #ifdef CONFIG_CMA if (!fw_dump.nocma) @@ -414,17 +399,9 @@ int __init fadump_reserve_mem(void) size = memory_boundary - base; fadump_reserve_crash_area(base, size); - fw_dump.fadumphdr_addr = - be64_to_cpu(fdm_active->rmr_region.destination_address) + - be64_to_cpu(fdm_active->rmr_region.source_len); - pr_debug("fadumphdr_addr = %pa\n", &fw_dump.fadumphdr_addr); - - /* - * Start address of reserve dump area (permanent reservation) - * for re-registering FADump after dump capture. - */ - fw_dump.reserve_dump_area_start = - be64_to_cpu(fdm_active->cpu_state_data.destination_address); + pr_debug("fadumphdr_addr = %#016lx\n", fw_dump.fadumphdr_addr); + pr_debug("Reserve dump area start address: 0x%lx\n", + fw_dump.reserve_dump_area_start); } else { /* * Reserve memory at an offset closer to bottom of the RAM to @@ -539,66 +516,6 @@ void crash_fadump(struct pt_regs *regs, const char *str) fw_dump.ops->fadump_trigger(fdh, str); } -#define GPR_MASK 0xffffff0000000000 -static inline int fadump_gpr_index(u64 id) -{ - int i = -1; - char str[3]; - - if ((id & GPR_MASK) == fadump_str_to_u64("GPR")) { - /* get the digits at the end */ - id &= ~GPR_MASK; - id >>= 24; - str[2] = '\0'; - str[1] = id & 0xff; - str[0] = (id >> 8) & 0xff; - sscanf(str, "%d", &i); - if (i > 31) - i = -1; - } - return i; -} - -static inline void fadump_set_regval(struct pt_regs *regs, u64 reg_id, - u64 reg_val) -{ - int i; - - i = fadump_gpr_index(reg_id); - if (i >= 0) - regs->gpr[i] = (unsigned long)reg_val; - else if (reg_id == fadump_str_to_u64("NIA")) - regs->nip = (unsigned long)reg_val; - else if (reg_id == fadump_str_to_u64("MSR")) - regs->msr = (unsigned long)reg_val; - else if (reg_id == fadump_str_to_u64("CTR")) - regs->ctr = (unsigned long)reg_val; - else if (reg_id == fadump_str_to_u64("LR")) - regs->link = (unsigned long)reg_val; - else if (reg_id == fadump_str_to_u64("XER")) - regs->xer = (unsigned long)reg_val; - else if (reg_id == fadump_str_to_u64("CR")) - regs->ccr = (unsigned long)reg_val; - else if (reg_id == fadump_str_to_u64("DAR")) - regs->dar = (unsigned long)reg_val; - else if (reg_id == fadump_str_to_u64("DSISR")) - regs->dsisr = (unsigned long)reg_val; -} - -static struct rtas_fadump_reg_entry* -fadump_read_registers(struct rtas_fadump_reg_entry *reg_entry, struct pt_regs *regs) -{ - memset(regs, 0, sizeof(struct pt_regs)); - - while (be64_to_cpu(reg_entry->reg_id) != fadump_str_to_u64("CPUEND")) { - fadump_set_regval(regs, be64_to_cpu(reg_entry->reg_id), - be64_to_cpu(reg_entry->reg_value)); - reg_entry++; - } - reg_entry++; - return reg_entry; -} - u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) { struct elf_prstatus prstatus; @@ -686,147 +603,6 @@ void fadump_free_cpu_notes_buf(void) fw_dump.cpu_notes_buf_size = 0; } -/* - * Read CPU state dump data and convert it into ELF notes. - * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be - * used to access the data to allow for additional fields to be added without - * affecting compatibility. Each list of registers for a CPU starts with - * "CPUSTRT" and ends with "CPUEND". Each register entry is of 16 bytes, - * 8 Byte ASCII identifier and 8 Byte register value. The register entry - * with identifier "CPUSTRT" and "CPUEND" contains 4 byte cpu id as part - * of register value. For more details refer to PAPR document. - * - * Only for the crashing cpu we ignore the CPU dump data and get exact - * state from fadump crash info structure populated by first kernel at the - * time of crash. - */ -static int __init fadump_build_cpu_notes(const struct rtas_fadump_mem_struct *fdm) -{ - struct rtas_fadump_reg_save_area_header *reg_header; - struct rtas_fadump_reg_entry *reg_entry; - struct fadump_crash_info_header *fdh = NULL; - void *vaddr; - unsigned long addr; - u32 num_cpus, *note_buf; - struct pt_regs regs; - int i, rc = 0, cpu = 0; - - if (!fdm->cpu_state_data.bytes_dumped) - return -EINVAL; - - addr = be64_to_cpu(fdm->cpu_state_data.destination_address); - vaddr = __va(addr); - - reg_header = vaddr; - if (be64_to_cpu(reg_header->magic_number) != - fadump_str_to_u64("REGSAVE")) { - printk(KERN_ERR "Unable to read register save area.\n"); - return -ENOENT; - } - pr_debug("--------CPU State Data------------\n"); - pr_debug("Magic Number: %llx\n", be64_to_cpu(reg_header->magic_number)); - pr_debug("NumCpuOffset: %x\n", be32_to_cpu(reg_header->num_cpu_offset)); - - vaddr += be32_to_cpu(reg_header->num_cpu_offset); - num_cpus = be32_to_cpu(*((__be32 *)(vaddr))); - pr_debug("NumCpus : %u\n", num_cpus); - vaddr += sizeof(u32); - reg_entry = (struct rtas_fadump_reg_entry *)vaddr; - - rc = fadump_setup_cpu_notes_buf(num_cpus); - if (rc != 0) - return rc; - - note_buf = (u32 *)fw_dump.cpu_notes_buf_vaddr; - - if (fw_dump.fadumphdr_addr) - fdh = __va(fw_dump.fadumphdr_addr); - - for (i = 0; i < num_cpus; i++) { - if (be64_to_cpu(reg_entry->reg_id) != fadump_str_to_u64("CPUSTRT")) { - printk(KERN_ERR "Unable to read CPU state data\n"); - rc = -ENOENT; - goto error_out; - } - /* Lower 4 bytes of reg_value contains logical cpu id */ - cpu = be64_to_cpu(reg_entry->reg_value) & RTAS_FADUMP_CPU_ID_MASK; - if (fdh && !cpumask_test_cpu(cpu, &fdh->online_mask)) { - RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry); - continue; - } - pr_debug("Reading register data for cpu %d...\n", cpu); - if (fdh && fdh->crashing_cpu == cpu) { - regs = fdh->regs; - note_buf = fadump_regs_to_elf_notes(note_buf, ®s); - RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry); - } else { - reg_entry++; - reg_entry = fadump_read_registers(reg_entry, ®s); - note_buf = fadump_regs_to_elf_notes(note_buf, ®s); - } - } - final_note(note_buf); - - if (fdh) { - pr_debug("Updating elfcore header (%llx) with cpu notes\n", - fdh->elfcorehdr_addr); - fadump_update_elfcore_header((char *)__va(fdh->elfcorehdr_addr)); - } - return 0; - -error_out: - fadump_free_cpu_notes_buf(); - return rc; - -} - -/* - * Validate and process the dump data stored by firmware before exporting - * it through '/proc/vmcore'. - */ -static int __init process_fadump(const struct rtas_fadump_mem_struct *fdm_active) -{ - struct fadump_crash_info_header *fdh; - int rc = 0; - - if (!fdm_active || !fw_dump.fadumphdr_addr) - return -EINVAL; - - /* Check if the dump data is valid. */ - if ((be16_to_cpu(fdm_active->header.dump_status_flag) == RTAS_FADUMP_ERROR_FLAG) || - (fdm_active->cpu_state_data.error_flags != 0) || - (fdm_active->rmr_region.error_flags != 0)) { - printk(KERN_ERR "Dump taken by platform is not valid\n"); - return -EINVAL; - } - if ((fdm_active->rmr_region.bytes_dumped != - fdm_active->rmr_region.source_len) || - !fdm_active->cpu_state_data.bytes_dumped) { - printk(KERN_ERR "Dump taken by platform is incomplete\n"); - return -EINVAL; - } - - /* Validate the fadump crash info header */ - fdh = __va(fw_dump.fadumphdr_addr); - if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) { - printk(KERN_ERR "Crash info header is not valid.\n"); - return -EINVAL; - } - - rc = fadump_build_cpu_notes(fdm_active); - if (rc) - return rc; - - /* - * We are done validating dump info and elfcore header is now ready - * to be exported. set elfcorehdr_addr so that vmcore module will - * export the elfcore header through '/proc/vmcore'. - */ - elfcorehdr_addr = fdh->elfcorehdr_addr; - - return 0; -} - static void free_crash_memory_ranges(void) { kfree(crash_memory_ranges); @@ -1116,7 +892,6 @@ static unsigned long init_fadump_header(unsigned long addr) if (!addr) return 0; - fw_dump.fadumphdr_addr = addr; fdh = __va(addr); addr += sizeof(struct fadump_crash_info_header); @@ -1160,39 +935,12 @@ static int register_fadump(void) return fw_dump.ops->fadump_register(&fw_dump); } -static int fadump_invalidate_dump(const struct rtas_fadump_mem_struct *fdm) -{ - int rc = 0; - unsigned int wait_time; - - pr_debug("Invalidating firmware-assisted dump registration\n"); - - /* TODO: Add upper time limit for the delay */ - do { - rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL, - FADUMP_INVALIDATE, fdm, - sizeof(struct rtas_fadump_mem_struct)); - - wait_time = rtas_busy_delay_time(rc); - if (wait_time) - mdelay(wait_time); - } while (wait_time); - - if (rc) { - pr_err("Failed to invalidate firmware-assisted dump registration. Unexpected error (%d).\n", rc); - return rc; - } - fw_dump.dump_active = 0; - fdm_active = NULL; - return 0; -} - void fadump_cleanup(void) { /* Invalidate the registration only if dump is active. */ if (fw_dump.dump_active) { - /* pass the same memory dump structure provided by platform */ - fadump_invalidate_dump(fdm_active); + pr_debug("Invalidating firmware-assisted dump registration\n"); + fw_dump.ops->fadump_invalidate(&fw_dump); } else if (fw_dump.dump_registered) { /* Un-register Firmware-assisted dump if it was registered. */ fw_dump.ops->fadump_unregister(&fw_dump); @@ -1333,7 +1081,7 @@ static ssize_t fadump_register_store(struct kobject *kobj, int ret = 0; int input = -1; - if (!fw_dump.fadump_enabled || fdm_active) + if (!fw_dump.fadump_enabled || fw_dump.dump_active) return -EPERM; if (kstrtoint(buf, 0, &input)) @@ -1346,6 +1094,7 @@ static ssize_t fadump_register_store(struct kobject *kobj, if (fw_dump.dump_registered == 0) { goto unlock_out; } + /* Un-register Firmware-assisted dump */ pr_debug("Un-register firmware-assisted dump\n"); fw_dump.ops->fadump_unregister(&fw_dump); @@ -1370,63 +1119,12 @@ unlock_out: static int fadump_region_show(struct seq_file *m, void *private) { - const struct rtas_fadump_mem_struct *fdm_ptr; - if (!fw_dump.fadump_enabled) return 0; mutex_lock(&fadump_mutex); - if (fdm_active) - fdm_ptr = fdm_active; - else { - mutex_unlock(&fadump_mutex); - fw_dump.ops->fadump_region_show(&fw_dump, m); - return 0; - } - - seq_printf(m, - "CPU : [%#016llx-%#016llx] %#llx bytes, " - "Dumped: %#llx\n", - be64_to_cpu(fdm_ptr->cpu_state_data.destination_address), - be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) + - be64_to_cpu(fdm_ptr->cpu_state_data.source_len) - 1, - be64_to_cpu(fdm_ptr->cpu_state_data.source_len), - be64_to_cpu(fdm_ptr->cpu_state_data.bytes_dumped)); - seq_printf(m, - "HPTE: [%#016llx-%#016llx] %#llx bytes, " - "Dumped: %#llx\n", - be64_to_cpu(fdm_ptr->hpte_region.destination_address), - be64_to_cpu(fdm_ptr->hpte_region.destination_address) + - be64_to_cpu(fdm_ptr->hpte_region.source_len) - 1, - be64_to_cpu(fdm_ptr->hpte_region.source_len), - be64_to_cpu(fdm_ptr->hpte_region.bytes_dumped)); - seq_printf(m, - "DUMP: [%#016llx-%#016llx] %#llx bytes, " - "Dumped: %#llx\n", - be64_to_cpu(fdm_ptr->rmr_region.destination_address), - be64_to_cpu(fdm_ptr->rmr_region.destination_address) + - be64_to_cpu(fdm_ptr->rmr_region.source_len) - 1, - be64_to_cpu(fdm_ptr->rmr_region.source_len), - be64_to_cpu(fdm_ptr->rmr_region.bytes_dumped)); - - if (!fdm_active || - (fw_dump.reserve_dump_area_start == - be64_to_cpu(fdm_ptr->cpu_state_data.destination_address))) - goto out; - - /* Dump is active. Show reserved memory region. */ - seq_printf(m, - " : [%#016llx-%#016llx] %#llx bytes, " - "Dumped: %#llx\n", - (unsigned long long)fw_dump.reserve_dump_area_start, - be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) - 1, - be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) - - fw_dump.reserve_dump_area_start, - be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) - - fw_dump.reserve_dump_area_start); -out: - if (fdm_active) - mutex_unlock(&fadump_mutex); + fw_dump.ops->fadump_region_show(&fw_dump, m); + mutex_unlock(&fadump_mutex); return 0; } @@ -1497,12 +1195,13 @@ int __init setup_fadump(void) * if dump process fails then invalidate the registration * and release memory before proceeding for re-registration. */ - if (process_fadump(fdm_active) < 0) + if (fw_dump.ops->fadump_process(&fw_dump) < 0) fadump_invalidate_release_mem(); } /* Initialize the kernel dump memory structure for FAD registration. */ else if (fw_dump.reserve_dump_area_size) fw_dump.ops->fadump_init_mem_struct(&fw_dump); + fadump_init_files(); return 1; diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c index a11e3c2458d2..bf1e8fc549b5 100644 --- a/arch/powerpc/platforms/pseries/rtas-fadump.c +++ b/arch/powerpc/platforms/pseries/rtas-fadump.c @@ -23,6 +23,7 @@ #include "rtas-fadump.h" static struct rtas_fadump_mem_struct fdm; +static const struct rtas_fadump_mem_struct *fdm_active; static void rtas_fadump_update_config(struct fw_dump *fadump_conf, const struct rtas_fadump_mem_struct *fdm) @@ -34,6 +35,25 @@ static void rtas_fadump_update_config(struct fw_dump *fadump_conf, fadump_conf->boot_memory_size); } +/* + * This function is called in the capture kernel to get configuration details + * setup in the first kernel and passed to the f/w. + */ +static void rtas_fadump_get_config(struct fw_dump *fadump_conf, + const struct rtas_fadump_mem_struct *fdm) +{ + fadump_conf->boot_memory_size = be64_to_cpu(fdm->rmr_region.source_len); + + /* + * Start address of reserve dump area (permanent reservation) for + * re-registering FADump after dump capture. + */ + fadump_conf->reserve_dump_area_start = + be64_to_cpu(fdm->cpu_state_data.destination_address); + + rtas_fadump_update_config(fadump_conf, fdm); +} + static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf) { u64 addr = fadump_conf->reserve_dump_area_start; @@ -171,7 +191,183 @@ static int rtas_fadump_unregister(struct fw_dump *fadump_conf) static int rtas_fadump_invalidate(struct fw_dump *fadump_conf) { - return -EIO; + unsigned int wait_time; + int rc; + + /* TODO: Add upper time limit for the delay */ + do { + rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1, + NULL, FADUMP_INVALIDATE, fdm_active, + sizeof(struct rtas_fadump_mem_struct)); + + wait_time = rtas_busy_delay_time(rc); + if (wait_time) + mdelay(wait_time); + } while (wait_time); + + if (rc) { + pr_err("Failed to invalidate - unexpected error (%d).\n", rc); + return -EIO; + } + + fadump_conf->dump_active = 0; + fdm_active = NULL; + return 0; +} + +#define RTAS_FADUMP_GPR_MASK 0xffffff0000000000 +static inline int rtas_fadump_gpr_index(u64 id) +{ + char str[3]; + int i = -1; + + if ((id & RTAS_FADUMP_GPR_MASK) == fadump_str_to_u64("GPR")) { + /* get the digits at the end */ + id &= ~RTAS_FADUMP_GPR_MASK; + id >>= 24; + str[2] = '\0'; + str[1] = id & 0xff; + str[0] = (id >> 8) & 0xff; + if (kstrtoint(str, 10, &i)) + i = -EINVAL; + if (i > 31) + i = -1; + } + return i; +} + +void rtas_fadump_set_regval(struct pt_regs *regs, u64 reg_id, u64 reg_val) +{ + int i; + + i = rtas_fadump_gpr_index(reg_id); + if (i >= 0) + regs->gpr[i] = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("NIA")) + regs->nip = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("MSR")) + regs->msr = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("CTR")) + regs->ctr = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("LR")) + regs->link = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("XER")) + regs->xer = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("CR")) + regs->ccr = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("DAR")) + regs->dar = (unsigned long)reg_val; + else if (reg_id == fadump_str_to_u64("DSISR")) + regs->dsisr = (unsigned long)reg_val; +} + +static struct rtas_fadump_reg_entry* +rtas_fadump_read_regs(struct rtas_fadump_reg_entry *reg_entry, + struct pt_regs *regs) +{ + memset(regs, 0, sizeof(struct pt_regs)); + + while (be64_to_cpu(reg_entry->reg_id) != fadump_str_to_u64("CPUEND")) { + rtas_fadump_set_regval(regs, be64_to_cpu(reg_entry->reg_id), + be64_to_cpu(reg_entry->reg_value)); + reg_entry++; + } + reg_entry++; + return reg_entry; +} + +/* + * Read CPU state dump data and convert it into ELF notes. + * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be + * used to access the data to allow for additional fields to be added without + * affecting compatibility. Each list of registers for a CPU starts with + * "CPUSTRT" and ends with "CPUEND". Each register entry is of 16 bytes, + * 8 Byte ASCII identifier and 8 Byte register value. The register entry + * with identifier "CPUSTRT" and "CPUEND" contains 4 byte cpu id as part + * of register value. For more details refer to PAPR document. + * + * Only for the crashing cpu we ignore the CPU dump data and get exact + * state from fadump crash info structure populated by first kernel at the + * time of crash. + */ +static int __init rtas_fadump_build_cpu_notes(struct fw_dump *fadump_conf) +{ + struct rtas_fadump_reg_save_area_header *reg_header; + struct fadump_crash_info_header *fdh = NULL; + struct rtas_fadump_reg_entry *reg_entry; + u32 num_cpus, *note_buf; + int i, rc = 0, cpu = 0; + struct pt_regs regs; + unsigned long addr; + void *vaddr; + + addr = be64_to_cpu(fdm_active->cpu_state_data.destination_address); + vaddr = __va(addr); + + reg_header = vaddr; + if (be64_to_cpu(reg_header->magic_number) != + fadump_str_to_u64("REGSAVE")) { + pr_err("Unable to read register save area.\n"); + return -ENOENT; + } + + pr_debug("--------CPU State Data------------\n"); + pr_debug("Magic Number: %llx\n", be64_to_cpu(reg_header->magic_number)); + pr_debug("NumCpuOffset: %x\n", be32_to_cpu(reg_header->num_cpu_offset)); + + vaddr += be32_to_cpu(reg_header->num_cpu_offset); + num_cpus = be32_to_cpu(*((__be32 *)(vaddr))); + pr_debug("NumCpus : %u\n", num_cpus); + vaddr += sizeof(u32); + reg_entry = (struct rtas_fadump_reg_entry *)vaddr; + + rc = fadump_setup_cpu_notes_buf(num_cpus); + if (rc != 0) + return rc; + + note_buf = (u32 *)fadump_conf->cpu_notes_buf_vaddr; + + if (fadump_conf->fadumphdr_addr) + fdh = __va(fadump_conf->fadumphdr_addr); + + for (i = 0; i < num_cpus; i++) { + if (be64_to_cpu(reg_entry->reg_id) != + fadump_str_to_u64("CPUSTRT")) { + pr_err("Unable to read CPU state data\n"); + rc = -ENOENT; + goto error_out; + } + /* Lower 4 bytes of reg_value contains logical cpu id */ + cpu = (be64_to_cpu(reg_entry->reg_value) & + RTAS_FADUMP_CPU_ID_MASK); + if (fdh && !cpumask_test_cpu(cpu, &fdh->online_mask)) { + RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry); + continue; + } + pr_debug("Reading register data for cpu %d...\n", cpu); + if (fdh && fdh->crashing_cpu == cpu) { + regs = fdh->regs; + note_buf = fadump_regs_to_elf_notes(note_buf, ®s); + RTAS_FADUMP_SKIP_TO_NEXT_CPU(reg_entry); + } else { + reg_entry++; + reg_entry = rtas_fadump_read_regs(reg_entry, ®s); + note_buf = fadump_regs_to_elf_notes(note_buf, ®s); + } + } + final_note(note_buf); + + if (fdh) { + pr_debug("Updating elfcore header (%llx) with cpu notes\n", + fdh->elfcorehdr_addr); + fadump_update_elfcore_header(__va(fdh->elfcorehdr_addr)); + } + return 0; + +error_out: + fadump_free_cpu_notes_buf(); + return rc; + } /* @@ -180,14 +376,58 @@ static int rtas_fadump_invalidate(struct fw_dump *fadump_conf) */ static int __init rtas_fadump_process(struct fw_dump *fadump_conf) { - return -EINVAL; + struct fadump_crash_info_header *fdh; + int rc = 0; + + if (!fdm_active || !fadump_conf->fadumphdr_addr) + return -EINVAL; + + /* Check if the dump data is valid. */ + if ((be16_to_cpu(fdm_active->header.dump_status_flag) == + RTAS_FADUMP_ERROR_FLAG) || + (fdm_active->cpu_state_data.error_flags != 0) || + (fdm_active->rmr_region.error_flags != 0)) { + pr_err("Dump taken by platform is not valid\n"); + return -EINVAL; + } + if ((fdm_active->rmr_region.bytes_dumped != + fdm_active->rmr_region.source_len) || + !fdm_active->cpu_state_data.bytes_dumped) { + pr_err("Dump taken by platform is incomplete\n"); + return -EINVAL; + } + + /* Validate the fadump crash info header */ + fdh = __va(fadump_conf->fadumphdr_addr); + if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) { + pr_err("Crash info header is not valid.\n"); + return -EINVAL; + } + + rc = rtas_fadump_build_cpu_notes(fadump_conf); + if (rc) + return rc; + + /* + * We are done validating dump info and elfcore header is now ready + * to be exported. set elfcorehdr_addr so that vmcore module will + * export the elfcore header through '/proc/vmcore'. + */ + elfcorehdr_addr = fdh->elfcorehdr_addr; + + return 0; } static void rtas_fadump_region_show(struct fw_dump *fadump_conf, struct seq_file *m) { - const struct rtas_fadump_mem_struct *fdm_ptr = &fdm; const struct rtas_fadump_section *cpu_data_section; + const struct rtas_fadump_mem_struct *fdm_ptr; + + if (fdm_active) + fdm_ptr = fdm_active; + else + fdm_ptr = &fdm; cpu_data_section = &(fdm_ptr->cpu_state_data); seq_printf(m, "CPU :[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n", @@ -210,6 +450,12 @@ static void rtas_fadump_region_show(struct fw_dump *fadump_conf, seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n", be64_to_cpu(fdm_ptr->rmr_region.source_len), be64_to_cpu(fdm_ptr->rmr_region.bytes_dumped)); + + /* Dump is active. Show reserved area start address. */ + if (fdm_active) { + seq_printf(m, "\nMemory above %#016lx is reserved for saving crash dump\n", + fadump_conf->reserve_dump_area_start); + } } static void rtas_fadump_trigger(struct fadump_crash_info_header *fdh, @@ -247,6 +493,17 @@ void __init rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) fadump_conf->ops = &rtas_fadump_ops; fadump_conf->fadump_supported = 1; + /* + * The 'ibm,kernel-dump' rtas node is present only if there is + * dump data waiting for us. + */ + fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL); + if (fdm_active) { + pr_info("Firmware-assisted dump is active.\n"); + fadump_conf->dump_active = 1; + rtas_fadump_get_config(fadump_conf, (void *)__pa(fdm_active)); + } + /* Get the sizes required to store dump data for the firmware provided * dump sections. * For each dump section type supported, a 32bit cell which defines -- cgit v1.2.3 From 1679b96e69ec2e10ccd9d82b691ade0b16cb64d1 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:19:58 +0530 Subject: powerpc/fadump: use FADump instead of fadump for how it is pronounced fadump is pronounced f-a-dump. Update documentation accordingly. Also, update how fadump_region contents look like with recent changes. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821339317.5656.15852294223821732082.stgit@hbathini.in.ibm.com --- Documentation/powerpc/firmware-assisted-dump.rst | 55 +++++++++++++----------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/Documentation/powerpc/firmware-assisted-dump.rst b/Documentation/powerpc/firmware-assisted-dump.rst index 563c021df539..d912755067d2 100644 --- a/Documentation/powerpc/firmware-assisted-dump.rst +++ b/Documentation/powerpc/firmware-assisted-dump.rst @@ -9,18 +9,18 @@ a crashed system, and to do so from a fully-reset system, and to minimize the total elapsed time until the system is back in production use. -- Firmware assisted dump (fadump) infrastructure is intended to replace +- Firmware-Assisted Dump (FADump) infrastructure is intended to replace the existing phyp assisted dump. - Fadump uses the same firmware interfaces and memory reservation model as phyp assisted dump. -- Unlike phyp dump, fadump exports the memory dump through /proc/vmcore +- Unlike phyp dump, FADump exports the memory dump through /proc/vmcore in the ELF format in the same way as kdump. This helps us reuse the kdump infrastructure for dump capture and filtering. - Unlike phyp dump, userspace tool does not need to refer any sysfs interface while reading /proc/vmcore. -- Unlike phyp dump, fadump allows user to release all the memory reserved +- Unlike phyp dump, FADump allows user to release all the memory reserved for dump, with a single operation of echo 1 > /sys/kernel/fadump_release_mem. -- Once enabled through kernel boot parameter, fadump can be +- Once enabled through kernel boot parameter, FADump can be started/stopped through /sys/kernel/fadump_registered interface (see sysfs files section below) and can be easily integrated with kdump service start/stop init scripts. @@ -34,7 +34,7 @@ dump offers several strong, practical advantages: in a clean, consistent state. - Once the dump is copied out, the memory that held the dump is immediately available to the running kernel. And therefore, - unlike kdump, fadump doesn't need a 2nd reboot to get back + unlike kdump, FADump doesn't need a 2nd reboot to get back the system to the production configuration. The above can only be accomplished by coordination with, @@ -63,7 +63,7 @@ as follows: boot successfully. For syntax of crashkernel= parameter, refer to Documentation/admin-guide/kdump/kdump.rst. If any offset is provided in crashkernel= parameter, it will be ignored - as fadump uses a predefined offset to reserve memory + as FADump uses a predefined offset to reserve memory for boot memory dump preservation in case of a crash. - After the low memory (boot memory) area has been saved, the @@ -123,7 +123,7 @@ blocking this significant chunk of memory from production kernel. Hence, the implementation uses the Linux kernel's Contiguous Memory Allocator (CMA) for memory reservation if CMA is configured for kernel. With CMA reservation this memory will be available for applications to -use it, while kernel is prevented from using it. With this fadump will +use it, while kernel is prevented from using it. With this FADump will still be able to capture all of the kernel memory and most of the user space memory except the user pages that were present in CMA region:: @@ -173,14 +173,14 @@ KDump, as dump mechanism. The tools to examine the dump will be same as the ones used for kdump. -How to enable firmware-assisted dump (fadump): +How to enable firmware-assisted dump (FADump): ---------------------------------------------- 1. Set config option CONFIG_FA_DUMP=y and build kernel. 2. Boot into linux kernel with 'fadump=on' kernel cmdline option. - By default, fadump reserved memory will be initialized as CMA area. + By default, FADump reserved memory will be initialized as CMA area. Alternatively, user can boot linux kernel with 'fadump=nocma' to - prevent fadump to use CMA. + prevent FADump to use CMA. 3. Optionally, user can also set 'crashkernel=' kernel cmdline to specify size of the memory to reserve for boot memory dump preservation. @@ -206,29 +206,29 @@ the control files and debugfs file to display memory reserved region. Here is the list of files under kernel sysfs: /sys/kernel/fadump_enabled - This is used to display the fadump status. + This is used to display the FADump status. - - 0 = fadump is disabled - - 1 = fadump is enabled + - 0 = FADump is disabled + - 1 = FADump is enabled This interface can be used by kdump init scripts to identify if - fadump is enabled in the kernel and act accordingly. + FADump is enabled in the kernel and act accordingly. /sys/kernel/fadump_registered - This is used to display the fadump registration status as well - as to control (start/stop) the fadump registration. + This is used to display the FADump registration status as well + as to control (start/stop) the FADump registration. - - 0 = fadump is not registered. - - 1 = fadump is registered and ready to handle system crash. + - 0 = FADump is not registered. + - 1 = FADump is registered and ready to handle system crash. - To register fadump echo 1 > /sys/kernel/fadump_registered and + To register FADump echo 1 > /sys/kernel/fadump_registered and echo 0 > /sys/kernel/fadump_registered for un-register and stop the - fadump. Once the fadump is un-registered, the system crash will not + FADump. Once the FADump is un-registered, the system crash will not be handled and vmcore will not be captured. This interface can be easily integrated with kdump service start/stop. /sys/kernel/fadump_release_mem - This file is available only when fadump is active during + This file is available only when FADump is active during second kernel. This is used to release the reserved memory region that are held for saving crash dump. To release the reserved memory echo 1 to it:: @@ -246,21 +246,25 @@ Here is the list of files under powerpc debugfs: (Assuming debugfs is mounted on /sys/kernel/debug directory.) /sys/kernel/debug/powerpc/fadump_region - This file shows the reserved memory regions if fadump is + This file shows the reserved memory regions if FADump is enabled otherwise this file is empty. The output format is:: : [-] bytes, Dumped: + and for kernel DUMP region is: + + DUMP: Src: , Dest: , Size: , Dumped: # bytes + e.g. - Contents when fadump is registered during first kernel:: + Contents when FADump is registered during first kernel:: # cat /sys/kernel/debug/powerpc/fadump_region CPU : [0x0000006ffb0000-0x0000006fff001f] 0x40020 bytes, Dumped: 0x0 HPTE: [0x0000006fff0020-0x0000006fff101f] 0x1000 bytes, Dumped: 0x0 DUMP: [0x0000006fff1020-0x0000007fff101f] 0x10000000 bytes, Dumped: 0x0 - Contents when fadump is active during second kernel:: + Contents when FADump is active during second kernel:: # cat /sys/kernel/debug/powerpc/fadump_region CPU : [0x0000006ffb0000-0x0000006fff001f] 0x40020 bytes, Dumped: 0x40020 @@ -268,6 +272,7 @@ Here is the list of files under powerpc debugfs: DUMP: [0x0000006fff1020-0x0000007fff101f] 0x10000000 bytes, Dumped: 0x10000000 : [0x00000010000000-0x0000006ffaffff] 0x5ffb0000 bytes, Dumped: 0x5ffb0000 + NOTE: Please refer to Documentation/filesystems/debugfs.txt on how to mount the debugfs filesystem. @@ -278,7 +283,7 @@ TODO: - Need to come up with the better approach to find out more accurate boot memory size that is required for a kernel to boot successfully when booted with restricted memory. - - The fadump implementation introduces a fadump crash info structure + - The FADump implementation introduces a FADump crash info structure in the scratch area before the ELF core header. The idea of introducing this structure is to pass some important crash info data to the second kernel which will help second kernel to populate ELF core header with -- cgit v1.2.3 From 6f5f193e84d3d7b55d406ecc7ac4ea1ef1d1876f Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:20:12 +0530 Subject: powerpc/opal: add MPIPL interface definitions MPIPL is Memory Preserving IPL supported from POWER9. This enables the kernel to reset the system with memory 'preserved'. Also, it supports copying memory from a source address to some destination address during MPIPL boot. Add MPIPL interface definitions here to leverage these f/w features in adding FADump support for PowerNV platform. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821340710.5656.10071829040515662624.stgit@hbathini.in.ibm.com --- arch/powerpc/include/asm/opal-api.h | 44 +++++++++++++++++++++++++++++- arch/powerpc/include/asm/opal.h | 5 ++++ arch/powerpc/platforms/powernv/opal-call.c | 3 ++ 3 files changed, 51 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 1cad413e1e0e..378e3997845a 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -208,7 +208,10 @@ #define OPAL_HANDLE_HMI2 166 #define OPAL_NX_COPROC_INIT 167 #define OPAL_XIVE_GET_VP_STATE 170 -#define OPAL_LAST 170 +#define OPAL_MPIPL_UPDATE 173 +#define OPAL_MPIPL_REGISTER_TAG 174 +#define OPAL_MPIPL_QUERY_TAG 175 +#define OPAL_LAST 175 #define QUIESCE_HOLD 1 /* Spin all calls at entry */ #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */ @@ -1060,6 +1063,7 @@ enum { OPAL_REBOOT_NORMAL = 0, OPAL_REBOOT_PLATFORM_ERROR = 1, OPAL_REBOOT_FULL_IPL = 2, + OPAL_REBOOT_MPIPL = 3, }; /* Argument to OPAL_PCI_TCE_KILL */ @@ -1136,6 +1140,44 @@ enum { #define OPAL_PCI_P2P_LOAD 0x2 #define OPAL_PCI_P2P_STORE 0x4 +/* MPIPL update operations */ +enum opal_mpipl_ops { + OPAL_MPIPL_ADD_RANGE = 0, + OPAL_MPIPL_REMOVE_RANGE = 1, + OPAL_MPIPL_REMOVE_ALL = 2, + OPAL_MPIPL_FREE_PRESERVED_MEMORY = 3, +}; + +/* Tag will point to various metadata area. Kernel will + * use tag to get metadata value. + */ +enum opal_mpipl_tags { + OPAL_MPIPL_TAG_CPU = 0, + OPAL_MPIPL_TAG_OPAL = 1, + OPAL_MPIPL_TAG_KERNEL = 2, + OPAL_MPIPL_TAG_BOOT_MEM = 3, +}; + +/* Preserved memory details */ +struct opal_mpipl_region { + __be64 src; + __be64 dest; + __be64 size; +}; + +/* Structure version */ +#define OPAL_MPIPL_VERSION 0x01 + +struct opal_mpipl_fadump { + u8 version; + u8 reserved[7]; + __be32 crashing_pir; /* OPAL crashing CPU PIR */ + __be32 cpu_data_version; + __be32 cpu_data_size; + __be32 region_cnt; + struct opal_mpipl_region region[]; +} __packed; + #endif /* __ASSEMBLY__ */ #endif /* __OPAL_API_H */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index d5a0807d21db..a0cf8fba4d12 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -39,6 +39,7 @@ int64_t opal_npu_spa_clear_cache(uint64_t phb_id, uint32_t bdfn, uint64_t PE_handle); int64_t opal_npu_tl_set(uint64_t phb_id, uint32_t bdfn, long cap, uint64_t rate_phys, uint32_t size); + int64_t opal_console_write(int64_t term_number, __be64 *length, const uint8_t *buffer); int64_t opal_console_read(int64_t term_number, __be64 *length, @@ -297,6 +298,10 @@ int opal_sensor_group_clear(u32 group_hndl, int token); int opal_sensor_group_enable(u32 group_hndl, int token, bool enable); int opal_nx_coproc_init(uint32_t chip_id, uint32_t ct); +s64 opal_mpipl_update(enum opal_mpipl_ops op, u64 src, u64 dest, u64 size); +s64 opal_mpipl_register_tag(enum opal_mpipl_tags tag, u64 addr); +s64 opal_mpipl_query_tag(enum opal_mpipl_tags tag, u64 *addr); + s64 opal_signal_system_reset(s32 cpu); s64 opal_quiesce(u64 shutdown_type, s32 cpu); diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c index dccdc9df5213..a2aa5e433ac8 100644 --- a/arch/powerpc/platforms/powernv/opal-call.c +++ b/arch/powerpc/platforms/powernv/opal-call.c @@ -287,3 +287,6 @@ OPAL_CALL(opal_pci_set_pbcq_tunnel_bar, OPAL_PCI_SET_PBCQ_TUNNEL_BAR); OPAL_CALL(opal_sensor_read_u64, OPAL_SENSOR_READ_U64); OPAL_CALL(opal_sensor_group_enable, OPAL_SENSOR_GROUP_ENABLE); OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT); +OPAL_CALL(opal_mpipl_update, OPAL_MPIPL_UPDATE); +OPAL_CALL(opal_mpipl_register_tag, OPAL_MPIPL_REGISTER_TAG); +OPAL_CALL(opal_mpipl_query_tag, OPAL_MPIPL_QUERY_TAG); -- cgit v1.2.3 From 41df5928721ff4b5f83767cd5e8b77862fc62bb3 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:20:26 +0530 Subject: powerpc/fadump: add fadump support on powernv Add basic callback functions for FADump on PowerNV platform. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821342072.5656.4346362203141486452.stgit@hbathini.in.ibm.com --- arch/powerpc/Kconfig | 5 +- arch/powerpc/include/asm/fadump-internal.h | 7 +++ arch/powerpc/kernel/fadump.c | 15 ++++- arch/powerpc/platforms/powernv/Makefile | 1 + arch/powerpc/platforms/powernv/opal-fadump.c | 92 ++++++++++++++++++++++++++++ 5 files changed, 115 insertions(+), 5 deletions(-) create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.c diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index edc19363a729..375ff893fa60 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -569,7 +569,7 @@ config CRASH_DUMP config FA_DUMP bool "Firmware-assisted dump" - depends on PPC64 && PPC_RTAS + depends on PPC64 && (PPC_RTAS || PPC_POWERNV) select CRASH_CORE select CRASH_DUMP help @@ -580,7 +580,8 @@ config FA_DUMP is meant to be a kdump replacement offering robustness and speed not possible without system firmware assistance. - If unsure, say "N" + If unsure, say "y". Only special kernels like petitboot may + need to say "N" here. config IRQ_ALL_CPUS bool "Distribute interrupts on all CPUs by default" diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index 8c0eb0fc09d0..8046fe0b742e 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -135,4 +135,11 @@ static inline void rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) { } #endif +#ifdef CONFIG_PPC_POWERNV +extern void opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node); +#else +static inline void +opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) { } +#endif + #endif /* _ASM_POWERPC_FADUMP_INTERNAL_H */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 9d9f7c384a71..b17673d8d50b 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -107,11 +107,20 @@ static int __init fadump_cma_init(void) { return 1; } int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data) { - if (depth != 1 || strcmp(uname, "rtas") != 0) + if (depth != 1) return 0; - rtas_fadump_dt_scan(&fw_dump, node); - return 1; + if (strcmp(uname, "rtas") == 0) { + rtas_fadump_dt_scan(&fw_dump, node); + return 1; + } + + if (strcmp(uname, "ibm,opal") == 0) { + opal_fadump_dt_scan(&fw_dump, node); + return 1; + } + + return 0; } /* diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 49186f0985a4..3e6d6d6d1ec1 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -7,6 +7,7 @@ obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o obj-y += ultravisor.o obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o +obj-$(CONFIG_FA_DUMP) += opal-fadump.o obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o obj-$(CONFIG_CXL_BASE) += pci-cxl.o obj-$(CONFIG_EEH) += eeh-powernv.o diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c new file mode 100644 index 000000000000..bbc5356bdc77 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Firmware-Assisted Dump support on POWER platform (OPAL). + * + * Copyright 2019, Hari Bathini, IBM Corporation. + */ + +#define pr_fmt(fmt) "opal fadump: " fmt + +#include +#include +#include +#include + +#include +#include + +static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf) +{ + return fadump_conf->reserve_dump_area_start; +} + +static int opal_fadump_register(struct fw_dump *fadump_conf) +{ + return -EIO; +} + +static int opal_fadump_unregister(struct fw_dump *fadump_conf) +{ + return -EIO; +} + +static int opal_fadump_invalidate(struct fw_dump *fadump_conf) +{ + return -EIO; +} + +static int __init opal_fadump_process(struct fw_dump *fadump_conf) +{ + return -EINVAL; +} + +static void opal_fadump_region_show(struct fw_dump *fadump_conf, + struct seq_file *m) +{ +} + +static void opal_fadump_trigger(struct fadump_crash_info_header *fdh, + const char *msg) +{ + int rc; + + rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, msg); + if (rc == OPAL_UNSUPPORTED) { + pr_emerg("Reboot type %d not supported.\n", + OPAL_REBOOT_MPIPL); + } else if (rc == OPAL_HARDWARE) + pr_emerg("No backend support for MPIPL!\n"); +} + +static struct fadump_ops opal_fadump_ops = { + .fadump_init_mem_struct = opal_fadump_init_mem_struct, + .fadump_register = opal_fadump_register, + .fadump_unregister = opal_fadump_unregister, + .fadump_invalidate = opal_fadump_invalidate, + .fadump_process = opal_fadump_process, + .fadump_region_show = opal_fadump_region_show, + .fadump_trigger = opal_fadump_trigger, +}; + +void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) +{ + unsigned long dn; + + /* + * Check if Firmware-Assisted Dump is supported. if yes, check + * if dump has been initiated on last reboot. + */ + dn = of_get_flat_dt_subnode_by_name(node, "dump"); + if (dn == -FDT_ERR_NOTFOUND) { + pr_debug("FADump support is missing!\n"); + return; + } + + if (!of_flat_dt_is_compatible(dn, "ibm,opal-dump")) { + pr_err("Support missing for this f/w version!\n"); + return; + } + + fadump_conf->ops = &opal_fadump_ops; + fadump_conf->fadump_supported = 1; +} -- cgit v1.2.3 From 6abec12c65e8870d8cafe154a86240fe0bcdd4f7 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:20:41 +0530 Subject: powerpc/fadump: improve fadump_reserve_mem() Some code clean-up like using minimal assignments and updating printk messages. Also, add an 'error_out' label for handling error cleanup at one place. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821343485.5656.10202857091553646948.stgit@hbathini.in.ibm.com --- arch/powerpc/kernel/fadump.c | 48 +++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index b17673d8d50b..7d47d4bb7d6e 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -338,16 +338,15 @@ static void __init fadump_reserve_crash_area(unsigned long base, int __init fadump_reserve_mem(void) { - unsigned long base, size, memory_boundary; + u64 base, size, mem_boundary; + int ret = 1; if (!fw_dump.fadump_enabled) return 0; if (!fw_dump.fadump_supported) { - printk(KERN_INFO "Firmware-assisted dump is not supported on" - " this hardware\n"); - fw_dump.fadump_enabled = 0; - return 0; + pr_info("Firmware-Assisted Dump is not supported on this hardware\n"); + goto error_out; } /* * Initialize boot memory size @@ -355,7 +354,8 @@ int __init fadump_reserve_mem(void) * first kernel. */ if (!fw_dump.dump_active) { - fw_dump.boot_memory_size = fadump_calculate_reserve_size(); + fw_dump.boot_memory_size = + PAGE_ALIGN(fadump_calculate_reserve_size()); #ifdef CONFIG_CMA if (!fw_dump.nocma) fw_dump.boot_memory_size = @@ -381,10 +381,11 @@ int __init fadump_reserve_mem(void) " dump, now %#016llx\n", memory_limit); } if (memory_limit) - memory_boundary = memory_limit; + mem_boundary = memory_limit; else - memory_boundary = memblock_end_of_DRAM(); + mem_boundary = memblock_end_of_DRAM(); + base = fw_dump.boot_memory_size; size = get_fadump_area_size(); fw_dump.reserve_dump_area_size = size; if (fw_dump.dump_active) { @@ -404,8 +405,7 @@ int __init fadump_reserve_mem(void) * dump is written to disk by userspace tool. This memory * will be released for general use once the dump is saved. */ - base = fw_dump.boot_memory_size; - size = memory_boundary - base; + size = mem_boundary - base; fadump_reserve_crash_area(base, size); pr_debug("fadumphdr_addr = %#016lx\n", fw_dump.fadumphdr_addr); @@ -418,29 +418,31 @@ int __init fadump_reserve_mem(void) * use memblock_find_in_range() here since it doesn't allocate * from bottom to top. */ - for (base = fw_dump.boot_memory_size; - base <= (memory_boundary - size); - base += size) { + while (base <= (mem_boundary - size)) { if (memblock_is_region_memory(base, size) && !memblock_is_region_reserved(base, size)) break; + + base += size; } - if ((base > (memory_boundary - size)) || + + if ((base > (mem_boundary - size)) || memblock_reserve(base, size)) { - pr_err("Failed to reserve memory\n"); - return 0; + pr_err("Failed to reserve memory!\n"); + goto error_out; } - pr_info("Reserved %ldMB of memory at %ldMB for firmware-" - "assisted dump (System RAM: %ldMB)\n", - (unsigned long)(size >> 20), - (unsigned long)(base >> 20), - (unsigned long)(memblock_phys_mem_size() >> 20)); + pr_info("Reserved %lldMB of memory at %#016llx (System RAM: %lldMB)\n", + (size >> 20), base, (memblock_phys_mem_size() >> 20)); fw_dump.reserve_dump_area_start = base; - return fadump_cma_init(); + ret = fadump_cma_init(); } - return 1; + + return ret; +error_out: + fw_dump.fadump_enabled = 0; + return 0; } unsigned long __init arch_reserved_kernel_pages(void) -- cgit v1.2.3 From 742a265accd3e3afcc8e7b17f409c93c1de8be85 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:20:57 +0530 Subject: powerpc/fadump: register kernel metadata address with opal OPAL allows registering address with it in the first kernel and retrieving it after MPIPL. Setup kernel metadata and register its address with OPAL to use it for processing the crash dump. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821345011.5656.13567765019032928471.stgit@hbathini.in.ibm.com --- arch/powerpc/include/asm/fadump-internal.h | 4 ++ arch/powerpc/kernel/fadump.c | 23 +++++++- arch/powerpc/platforms/powernv/opal-fadump.c | 84 +++++++++++++++++++++++++++- arch/powerpc/platforms/powernv/opal-fadump.h | 39 +++++++++++++ 4 files changed, 146 insertions(+), 4 deletions(-) create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.h diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index 8046fe0b742e..5262c764c5fc 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -97,6 +97,8 @@ struct fw_dump { unsigned long cpu_notes_buf_vaddr; unsigned long cpu_notes_buf_size; + u64 kernel_metadata; + int ibm_configure_kernel_dump; unsigned long fadump_enabled:1; @@ -110,6 +112,8 @@ struct fw_dump { struct fadump_ops { u64 (*fadump_init_mem_struct)(struct fw_dump *fadump_conf); + u64 (*fadump_get_metadata_size)(void); + int (*fadump_setup_metadata)(struct fw_dump *fadump_conf); int (*fadump_register)(struct fw_dump *fadump_conf); int (*fadump_unregister)(struct fw_dump *fadump_conf); int (*fadump_invalidate)(struct fw_dump *fadump_conf); diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 7d47d4bb7d6e..7e7056382d98 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -313,6 +313,10 @@ static unsigned long get_fadump_area_size(void) size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2); size = PAGE_ALIGN(size); + + /* This is to hold kernel metadata on platforms that support it */ + size += (fw_dump.ops->fadump_get_metadata_size ? + fw_dump.ops->fadump_get_metadata_size() : 0); return size; } @@ -348,6 +352,7 @@ int __init fadump_reserve_mem(void) pr_info("Firmware-Assisted Dump is not supported on this hardware\n"); goto error_out; } + /* * Initialize boot memory size * If dump is active then we have already calculated the size during @@ -426,8 +431,21 @@ int __init fadump_reserve_mem(void) base += size; } - if ((base > (mem_boundary - size)) || - memblock_reserve(base, size)) { + if (base > (mem_boundary - size)) { + pr_err("Failed to find memory chunk for reservation!\n"); + goto error_out; + } + fw_dump.reserve_dump_area_start = base; + + /* + * Calculate the kernel metadata address and register it with + * f/w if the platform supports. + */ + if (fw_dump.ops->fadump_setup_metadata && + (fw_dump.ops->fadump_setup_metadata(&fw_dump) < 0)) + goto error_out; + + if (memblock_reserve(base, size)) { pr_err("Failed to reserve memory!\n"); goto error_out; } @@ -435,7 +453,6 @@ int __init fadump_reserve_mem(void) pr_info("Reserved %lldMB of memory at %#016llx (System RAM: %lldMB)\n", (size >> 20), base, (memblock_phys_mem_size() >> 20)); - fw_dump.reserve_dump_area_start = base; ret = fadump_cma_init(); } diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c index bbc5356bdc77..21de832f2ba4 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.c +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -11,13 +11,83 @@ #include #include #include +#include +#include #include #include +#include "opal-fadump.h" + +static struct opal_fadump_mem_struct *opal_fdm; + +/* Initialize kernel metadata */ +static void opal_fadump_init_metadata(struct opal_fadump_mem_struct *fdm) +{ + fdm->version = OPAL_FADUMP_VERSION; + fdm->region_cnt = 0; + fdm->registered_regions = 0; + fdm->fadumphdr_addr = 0; +} + static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf) { - return fadump_conf->reserve_dump_area_start; + u64 addr = fadump_conf->reserve_dump_area_start; + + opal_fdm = __va(fadump_conf->kernel_metadata); + opal_fadump_init_metadata(opal_fdm); + + opal_fdm->region_cnt = 1; + opal_fdm->rgn[0].src = 0; + opal_fdm->rgn[0].dest = addr; + opal_fdm->rgn[0].size = fadump_conf->boot_memory_size; + addr += fadump_conf->boot_memory_size; + + /* + * Kernel metadata is passed to f/w and retrieved in capture kerenl. + * So, use it to save fadump header address instead of calculating it. + */ + opal_fdm->fadumphdr_addr = (opal_fdm->rgn[0].dest + + fadump_conf->boot_memory_size); + + return addr; +} + +static u64 opal_fadump_get_metadata_size(void) +{ + return PAGE_ALIGN(sizeof(struct opal_fadump_mem_struct)); +} + +static int opal_fadump_setup_metadata(struct fw_dump *fadump_conf) +{ + int err = 0; + s64 ret; + + /* + * Use the last page(s) in FADump memory reservation for + * kernel metadata. + */ + fadump_conf->kernel_metadata = (fadump_conf->reserve_dump_area_start + + fadump_conf->reserve_dump_area_size - + opal_fadump_get_metadata_size()); + pr_info("Kernel metadata addr: %llx\n", fadump_conf->kernel_metadata); + + /* Initialize kernel metadata before registering the address with f/w */ + opal_fdm = __va(fadump_conf->kernel_metadata); + opal_fadump_init_metadata(opal_fdm); + + /* + * Register metadata address with f/w. Can be retrieved in + * the capture kernel. + */ + ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL, + fadump_conf->kernel_metadata); + if (ret != OPAL_SUCCESS) { + pr_err("Failed to set kernel metadata tag!\n"); + err = -EPERM; + } + + return err; } static int opal_fadump_register(struct fw_dump *fadump_conf) @@ -43,6 +113,16 @@ static int __init opal_fadump_process(struct fw_dump *fadump_conf) static void opal_fadump_region_show(struct fw_dump *fadump_conf, struct seq_file *m) { + const struct opal_fadump_mem_struct *fdm_ptr = opal_fdm; + u64 dumped_bytes = 0; + int i; + + for (i = 0; i < fdm_ptr->region_cnt; i++) { + seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ", + fdm_ptr->rgn[i].src, fdm_ptr->rgn[i].dest); + seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n", + fdm_ptr->rgn[i].size, dumped_bytes); + } } static void opal_fadump_trigger(struct fadump_crash_info_header *fdh, @@ -60,6 +140,8 @@ static void opal_fadump_trigger(struct fadump_crash_info_header *fdh, static struct fadump_ops opal_fadump_ops = { .fadump_init_mem_struct = opal_fadump_init_mem_struct, + .fadump_get_metadata_size = opal_fadump_get_metadata_size, + .fadump_setup_metadata = opal_fadump_setup_metadata, .fadump_register = opal_fadump_register, .fadump_unregister = opal_fadump_unregister, .fadump_invalidate = opal_fadump_invalidate, diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h new file mode 100644 index 000000000000..0b83d895485c --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-fadump.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Firmware-Assisted Dump support on POWER platform (OPAL). + * + * Copyright 2019, Hari Bathini, IBM Corporation. + */ + +#ifndef _POWERNV_OPAL_FADUMP_H +#define _POWERNV_OPAL_FADUMP_H + +/* + * OPAL FADump metadata structure format version + * + * OPAL FADump kernel metadata structure stores kernel metadata needed to + * register-for/process crash dump. Format version is used to keep a tab on + * the changes in the structure format. The changes, if any, to the format + * are expected to be minimal and backward compatible. + */ +#define OPAL_FADUMP_VERSION 0x1 + +/* Maximum number of memory regions kernel supports */ +#define OPAL_FADUMP_MAX_MEM_REGS 128 + +/* + * OPAL FADump kernel metadata + * + * The address of this structure will be registered with f/w for retrieving + * and processing during crash dump. + */ +struct opal_fadump_mem_struct { + u8 version; + u8 reserved[3]; + u16 region_cnt; /* number of regions */ + u16 registered_regions; /* Regions registered for MPIPL */ + u64 fadumphdr_addr; + struct opal_mpipl_region rgn[OPAL_FADUMP_MAX_MEM_REGS]; +} __packed; + +#endif /* _POWERNV_OPAL_FADUMP_H */ -- cgit v1.2.3 From 2790d01d1e1d22735d848eec55668f7d44417e22 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:21:16 +0530 Subject: powerpc/fadump: reset metadata address during clean up During kexec boot, metadata address needs to be reset to avoid running into errors interpreting stale metadata address, in case the kexec'ed kernel crashes before metadata address could be setup again. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821346629.5656.10783321582005237813.stgit@hbathini.in.ibm.com --- arch/powerpc/include/asm/fadump-internal.h | 1 + arch/powerpc/kernel/fadump.c | 6 ++++++ arch/powerpc/platforms/powernv/opal-fadump.c | 10 ++++++++++ 3 files changed, 17 insertions(+) diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index 5262c764c5fc..0c2c4f58d365 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -117,6 +117,7 @@ struct fadump_ops { int (*fadump_register)(struct fw_dump *fadump_conf); int (*fadump_unregister)(struct fw_dump *fadump_conf); int (*fadump_invalidate)(struct fw_dump *fadump_conf); + void (*fadump_cleanup)(struct fw_dump *fadump_conf); int (*fadump_process)(struct fw_dump *fadump_conf); void (*fadump_region_show)(struct fw_dump *fadump_conf, struct seq_file *m); diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 7e7056382d98..aab9b4db0363 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -965,6 +965,9 @@ static int register_fadump(void) void fadump_cleanup(void) { + if (!fw_dump.fadump_supported) + return; + /* Invalidate the registration only if dump is active. */ if (fw_dump.dump_active) { pr_debug("Invalidating firmware-assisted dump registration\n"); @@ -974,6 +977,9 @@ void fadump_cleanup(void) fw_dump.ops->fadump_unregister(&fw_dump); free_crash_memory_ranges(); } + + if (fw_dump.ops->fadump_cleanup) + fw_dump.ops->fadump_cleanup(&fw_dump); } static void fadump_free_reserved_memory(unsigned long start_pfn, diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c index 21de832f2ba4..871a129c73d3 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.c +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -105,6 +105,15 @@ static int opal_fadump_invalidate(struct fw_dump *fadump_conf) return -EIO; } +static void opal_fadump_cleanup(struct fw_dump *fadump_conf) +{ + s64 ret; + + ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL, 0); + if (ret != OPAL_SUCCESS) + pr_warn("Could not reset (%llu) kernel metadata tag!\n", ret); +} + static int __init opal_fadump_process(struct fw_dump *fadump_conf) { return -EINVAL; @@ -145,6 +154,7 @@ static struct fadump_ops opal_fadump_ops = { .fadump_register = opal_fadump_register, .fadump_unregister = opal_fadump_unregister, .fadump_invalidate = opal_fadump_invalidate, + .fadump_cleanup = opal_fadump_cleanup, .fadump_process = opal_fadump_process, .fadump_region_show = opal_fadump_region_show, .fadump_trigger = opal_fadump_trigger, -- cgit v1.2.3 From a20a8fa42def548f46c7e0401a94f62b8e595883 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:21:33 +0530 Subject: powerpc/fadump: define OPAL register/un-register callback functions Make OPAL calls to register and un-register with firmware for MPIPL. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821348482.5656.13646250851483648241.stgit@hbathini.in.ibm.com --- arch/powerpc/platforms/powernv/opal-fadump.c | 79 +++++++++++++++++++++++++++- 1 file changed, 77 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c index 871a129c73d3..b53007e82190 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.c +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -21,6 +21,22 @@ static struct opal_fadump_mem_struct *opal_fdm; +static int opal_fadump_unregister(struct fw_dump *fadump_conf); + +static void opal_fadump_update_config(struct fw_dump *fadump_conf, + const struct opal_fadump_mem_struct *fdm) +{ + /* + * The destination address of the first boot memory region is the + * destination address of boot memory regions. + */ + fadump_conf->boot_mem_dest_addr = fdm->rgn[0].dest; + pr_debug("Destination address of boot memory regions: %#016llx\n", + fadump_conf->boot_mem_dest_addr); + + fadump_conf->fadumphdr_addr = fdm->fadumphdr_addr; +} + /* Initialize kernel metadata */ static void opal_fadump_init_metadata(struct opal_fadump_mem_struct *fdm) { @@ -50,6 +66,8 @@ static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf) opal_fdm->fadumphdr_addr = (opal_fdm->rgn[0].dest + fadump_conf->boot_memory_size); + opal_fadump_update_config(fadump_conf, opal_fdm); + return addr; } @@ -92,12 +110,69 @@ static int opal_fadump_setup_metadata(struct fw_dump *fadump_conf) static int opal_fadump_register(struct fw_dump *fadump_conf) { - return -EIO; + s64 rc = OPAL_PARAMETER; + int i, err = -EIO; + + for (i = 0; i < opal_fdm->region_cnt; i++) { + rc = opal_mpipl_update(OPAL_MPIPL_ADD_RANGE, + opal_fdm->rgn[i].src, + opal_fdm->rgn[i].dest, + opal_fdm->rgn[i].size); + if (rc != OPAL_SUCCESS) + break; + + opal_fdm->registered_regions++; + } + + switch (rc) { + case OPAL_SUCCESS: + pr_info("Registration is successful!\n"); + fadump_conf->dump_registered = 1; + err = 0; + break; + case OPAL_RESOURCE: + /* If MAX regions limit in f/w is hit, warn and proceed. */ + pr_warn("%d regions could not be registered for MPIPL as MAX limit is reached!\n", + (opal_fdm->region_cnt - opal_fdm->registered_regions)); + fadump_conf->dump_registered = 1; + err = 0; + break; + case OPAL_PARAMETER: + pr_err("Failed to register. Parameter Error(%lld).\n", rc); + break; + case OPAL_HARDWARE: + pr_err("Support not available.\n"); + fadump_conf->fadump_supported = 0; + fadump_conf->fadump_enabled = 0; + break; + default: + pr_err("Failed to register. Unknown Error(%lld).\n", rc); + break; + } + + /* + * If some regions were registered before OPAL_MPIPL_ADD_RANGE + * OPAL call failed, unregister all regions. + */ + if ((err < 0) && (opal_fdm->registered_regions > 0)) + opal_fadump_unregister(fadump_conf); + + return err; } static int opal_fadump_unregister(struct fw_dump *fadump_conf) { - return -EIO; + s64 rc; + + rc = opal_mpipl_update(OPAL_MPIPL_REMOVE_ALL, 0, 0, 0); + if (rc) { + pr_err("Failed to un-register - unexpected Error(%lld).\n", rc); + return -EIO; + } + + opal_fdm->registered_regions = 0; + fadump_conf->dump_registered = 0; + return 0; } static int opal_fadump_invalidate(struct fw_dump *fadump_conf) -- cgit v1.2.3 From 51bba8edef90cf579dba16de912d3ef809fe1d77 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:21:46 +0530 Subject: powerpc/fadump: support copying multiple kernel boot memory regions Firmware uses a 32-bit field for size while copying/backing-up memory during MPIPL. So, the maximum value that could be represented with a PAGE_SIZE aligned 32-bit field will be the maximum copy size for a region but FADump capture kernel usually needs more memory than that to be preserved to avoid running into out of memory errors. So, request firmware to copy multiple kernel boot memory regions instead of just one (which worked fine for pseries as 64-bit field was used for size there). Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821350193.5656.3664853158523582019.stgit@hbathini.in.ibm.com --- arch/powerpc/platforms/powernv/opal-fadump.c | 34 ++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c index b53007e82190..a4b96a7a5d6c 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.c +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -26,6 +26,8 @@ static int opal_fadump_unregister(struct fw_dump *fadump_conf); static void opal_fadump_update_config(struct fw_dump *fadump_conf, const struct opal_fadump_mem_struct *fdm) { + pr_debug("Boot memory regions count: %d\n", fdm->region_cnt); + /* * The destination address of the first boot memory region is the * destination address of boot memory regions. @@ -48,16 +50,34 @@ static void opal_fadump_init_metadata(struct opal_fadump_mem_struct *fdm) static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf) { - u64 addr = fadump_conf->reserve_dump_area_start; + int max_copy_size, cur_size, size; + u64 src_addr, dest_addr; opal_fdm = __va(fadump_conf->kernel_metadata); opal_fadump_init_metadata(opal_fdm); - opal_fdm->region_cnt = 1; - opal_fdm->rgn[0].src = 0; - opal_fdm->rgn[0].dest = addr; - opal_fdm->rgn[0].size = fadump_conf->boot_memory_size; - addr += fadump_conf->boot_memory_size; + /* + * Firmware supports 32-bit field for size. Align it to PAGE_SIZE + * and request firmware to copy multiple kernel boot memory regions. + */ + max_copy_size = _ALIGN_DOWN(U32_MAX, PAGE_SIZE); + + /* Boot memory regions */ + src_addr = 0; + dest_addr = fadump_conf->reserve_dump_area_start; + size = fadump_conf->boot_memory_size; + while (size) { + cur_size = size > max_copy_size ? max_copy_size : size; + + opal_fdm->rgn[opal_fdm->region_cnt].src = src_addr; + opal_fdm->rgn[opal_fdm->region_cnt].dest = dest_addr; + opal_fdm->rgn[opal_fdm->region_cnt].size = cur_size; + + opal_fdm->region_cnt++; + dest_addr += cur_size; + src_addr += cur_size; + size -= cur_size; + } /* * Kernel metadata is passed to f/w and retrieved in capture kerenl. @@ -68,7 +88,7 @@ static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf) opal_fadump_update_config(fadump_conf, opal_fdm); - return addr; + return dest_addr; } static u64 opal_fadump_get_metadata_size(void) -- cgit v1.2.3 From 2a1b06dd3a17ac278494da3c15cac84684346d22 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:21:59 +0530 Subject: powerpc/fadump: process the crashdump by exporting it as /proc/vmcore Add support in the kernel to process the crash'ed kernel's memory preserved during MPIPL and export it as /proc/vmcore file for the userland scripts to filter and analyze it later. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821351482.5656.6255805804744333073.stgit@hbathini.in.ibm.com --- arch/powerpc/platforms/powernv/opal-fadump.c | 147 ++++++++++++++++++++++++++- 1 file changed, 145 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c index a4b96a7a5d6c..5ace7c4d6bc1 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.c +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -19,6 +20,7 @@ #include "opal-fadump.h" +static const struct opal_fadump_mem_struct *opal_fdm_active; static struct opal_fadump_mem_struct *opal_fdm; static int opal_fadump_unregister(struct fw_dump *fadump_conf); @@ -39,6 +41,37 @@ static void opal_fadump_update_config(struct fw_dump *fadump_conf, fadump_conf->fadumphdr_addr = fdm->fadumphdr_addr; } +/* + * This function is called in the capture kernel to get configuration details + * from metadata setup by the first kernel. + */ +static void opal_fadump_get_config(struct fw_dump *fadump_conf, + const struct opal_fadump_mem_struct *fdm) +{ + int i; + + if (!fadump_conf->dump_active) + return; + + fadump_conf->boot_memory_size = 0; + + pr_debug("Boot memory regions:\n"); + for (i = 0; i < fdm->region_cnt; i++) { + pr_debug("\t%d. base: 0x%llx, size: 0x%llx\n", + (i + 1), fdm->rgn[i].src, fdm->rgn[i].size); + + fadump_conf->boot_memory_size += fdm->rgn[i].size; + } + + /* + * Start address of reserve dump area (permanent reservation) for + * re-registering FADump after dump capture. + */ + fadump_conf->reserve_dump_area_start = fdm->rgn[0].dest; + + opal_fadump_update_config(fadump_conf, fdm); +} + /* Initialize kernel metadata */ static void opal_fadump_init_metadata(struct opal_fadump_mem_struct *fdm) { @@ -209,24 +242,97 @@ static void opal_fadump_cleanup(struct fw_dump *fadump_conf) pr_warn("Could not reset (%llu) kernel metadata tag!\n", ret); } +/* + * Convert CPU state data saved at the time of crash into ELF notes. + * + * Append crashing CPU's register data saved by the kernel in the PT_NOTE. + */ +static int __init +opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf, + struct fadump_crash_info_header *fdh) +{ + u32 num_cpus = 1, *note_buf; + int rc; + + if (fdh->crashing_cpu == FADUMP_CPU_UNKNOWN) + return -ENODEV; + + /* Allocate CPU notes buffer to hold crashing cpu notes. */ + rc = fadump_setup_cpu_notes_buf(num_cpus); + if (rc != 0) + return rc; + + note_buf = (u32 *)fadump_conf->cpu_notes_buf_vaddr; + note_buf = fadump_regs_to_elf_notes(note_buf, &(fdh->regs)); + final_note(note_buf); + + pr_debug("Updating elfcore header (%llx) with cpu notes\n", + fdh->elfcorehdr_addr); + fadump_update_elfcore_header(__va(fdh->elfcorehdr_addr)); + return 0; +} + static int __init opal_fadump_process(struct fw_dump *fadump_conf) { - return -EINVAL; + struct fadump_crash_info_header *fdh; + int rc = -EINVAL; + + if (!opal_fdm_active || !fadump_conf->fadumphdr_addr) + return rc; + + /* Validate the fadump crash info header */ + fdh = __va(fadump_conf->fadumphdr_addr); + if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) { + pr_err("Crash info header is not valid.\n"); + return rc; + } + + rc = opal_fadump_build_cpu_notes(fadump_conf, fdh); + if (rc) + return rc; + + /* + * We are done validating dump info and elfcore header is now ready + * to be exported. set elfcorehdr_addr so that vmcore module will + * export the elfcore header through '/proc/vmcore'. + */ + elfcorehdr_addr = fdh->elfcorehdr_addr; + + return rc; } static void opal_fadump_region_show(struct fw_dump *fadump_conf, struct seq_file *m) { - const struct opal_fadump_mem_struct *fdm_ptr = opal_fdm; + const struct opal_fadump_mem_struct *fdm_ptr; u64 dumped_bytes = 0; int i; + if (fadump_conf->dump_active) + fdm_ptr = opal_fdm_active; + else + fdm_ptr = opal_fdm; + for (i = 0; i < fdm_ptr->region_cnt; i++) { + /* + * Only regions that are registered for MPIPL + * would have dump data. + */ + if ((fadump_conf->dump_active) && + (i < fdm_ptr->registered_regions)) + dumped_bytes = fdm_ptr->rgn[i].size; + seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ", fdm_ptr->rgn[i].src, fdm_ptr->rgn[i].dest); seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n", fdm_ptr->rgn[i].size, dumped_bytes); } + + /* Dump is active. Show reserved area start address. */ + if (fadump_conf->dump_active) { + seq_printf(m, "\nMemory above %#016lx is reserved for saving crash dump\n", + fadump_conf->reserve_dump_area_start); + } } static void opal_fadump_trigger(struct fadump_crash_info_header *fdh, @@ -257,7 +363,11 @@ static struct fadump_ops opal_fadump_ops = { void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) { + const __be32 *prop; unsigned long dn; + u64 addr = 0; + s64 ret; + /* * Check if Firmware-Assisted Dump is supported. if yes, check @@ -276,4 +386,37 @@ void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) fadump_conf->ops = &opal_fadump_ops; fadump_conf->fadump_supported = 1; + + /* + * Check if dump has been initiated on last reboot. + */ + prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL); + if (!prop) + return; + + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_err("Failed to get Kernel metadata (%lld)\n", ret); + return; + } + + addr = be64_to_cpu(addr); + pr_debug("Kernel metadata addr: %llx\n", addr); + + opal_fdm_active = __va(addr); + if (opal_fdm_active->version != OPAL_FADUMP_VERSION) { + pr_warn("Supported kernel metadata version: %u, found: %d!\n", + OPAL_FADUMP_VERSION, opal_fdm_active->version); + pr_warn("WARNING: Kernel metadata format mismatch identified! Core file maybe corrupted..\n"); + } + + /* Kernel regions not registered with f/w for MPIPL */ + if (opal_fdm_active->registered_regions == 0) { + opal_fdm_active = NULL; + return; + } + + pr_info("Firmware-assisted dump is active.\n"); + fadump_conf->dump_active = 1; + opal_fadump_get_config(fadump_conf, opal_fdm_active); } -- cgit v1.2.3 From 6071e8f9d5ac960f0d35495f070d4d0b7ae5fc76 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:22:32 +0530 Subject: powerpc/fadump: Warn before processing partial crashdump If all kernel boot memory regions are not registered for MPIPL before system crashes, try processing the partial crashdump but warn the user before proceeding. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821352793.5656.1734051341024721407.stgit@hbathini.in.ibm.com --- arch/powerpc/platforms/powernv/opal-fadump.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c index 5ace7c4d6bc1..533adae545d8 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.c +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -69,6 +69,30 @@ static void opal_fadump_get_config(struct fw_dump *fadump_conf, */ fadump_conf->reserve_dump_area_start = fdm->rgn[0].dest; + /* + * Rarely, but it can so happen that system crashes before all + * boot memory regions are registered for MPIPL. In such + * cases, warn that the vmcore may not be accurate and proceed + * anyway as that is the best bet considering free pages, cache + * pages, user pages, etc are usually filtered out. + * + * Hope the memory that could not be preserved only has pages + * that are usually filtered out while saving the vmcore. + */ + if (fdm->region_cnt > fdm->registered_regions) { + pr_warn("Not all memory regions were saved!!!\n"); + pr_warn(" Unsaved memory regions:\n"); + i = fdm->registered_regions; + while (i < fdm->region_cnt) { + pr_warn("\t[%03d] base: 0x%llx, size: 0x%llx\n", + i, fdm->rgn[i].src, fdm->rgn[i].size); + i++; + } + + pr_warn("If the unsaved regions only contain pages that are filtered out (eg. free/user pages), the vmcore should still be usable.\n"); + pr_warn("WARNING: If the unsaved regions contain kernel pages, the vmcore will be corrupted.\n"); + } + opal_fadump_update_config(fadump_conf, fdm); } -- cgit v1.2.3 From a4e2e2ca2f7bddf6d5d788033cc56f40af6e9c5a Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:23:28 +0530 Subject: powerpc/fadump: handle invalidation of crashdump and re-registraion Make OPAL call to indicate that the dump is processed and the metadata area in OPAL can be cleared/released. Also, setup/initialize FADump for re-registration. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821356046.5656.12270927048195494911.stgit@hbathini.in.ibm.com --- arch/powerpc/kernel/fadump.c | 8 +++++++- arch/powerpc/platforms/powernv/opal-fadump.c | 12 +++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index aab9b4db0363..852ac4761e90 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -1063,7 +1063,13 @@ static void fadump_invalidate_release_mem(void) fadump_release_memory(fw_dump.boot_memory_size, memblock_end_of_DRAM()); fadump_free_cpu_notes_buf(); - /* Initialize the kernel dump memory structure for FAD registration. */ + /* + * Setup kernel metadata and initialize the kernel dump + * memory structure for FADump re-registration. + */ + if (fw_dump.ops->fadump_setup_metadata && + (fw_dump.ops->fadump_setup_metadata(&fw_dump) < 0)) + pr_warn("Failed to setup kernel metadata!\n"); fw_dump.ops->fadump_init_mem_struct(&fw_dump); } diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c index 533adae545d8..ef6fb1fc66cb 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.c +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -254,7 +254,17 @@ static int opal_fadump_unregister(struct fw_dump *fadump_conf) static int opal_fadump_invalidate(struct fw_dump *fadump_conf) { - return -EIO; + s64 rc; + + rc = opal_mpipl_update(OPAL_MPIPL_FREE_PRESERVED_MEMORY, 0, 0, 0); + if (rc) { + pr_err("Failed to invalidate - unexpected Error(%lld).\n", rc); + return -EIO; + } + + fadump_conf->dump_active = 0; + opal_fdm_active = NULL; + return 0; } static void opal_fadump_cleanup(struct fw_dump *fadump_conf) -- cgit v1.2.3 From fbcafdaea2e234d3c6d79e7f5605a2e8373f6678 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:23:53 +0530 Subject: powerpc/fadump: Update documentation about OPAL platform support With FADump support now available on both pseries and OPAL platforms, update FADump documentation with these details. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821361692.5656.11377757995827253404.stgit@hbathini.in.ibm.com --- Documentation/powerpc/firmware-assisted-dump.rst | 122 +++++++++++++---------- 1 file changed, 72 insertions(+), 50 deletions(-) diff --git a/Documentation/powerpc/firmware-assisted-dump.rst b/Documentation/powerpc/firmware-assisted-dump.rst index d912755067d2..ab578ac64c09 100644 --- a/Documentation/powerpc/firmware-assisted-dump.rst +++ b/Documentation/powerpc/firmware-assisted-dump.rst @@ -46,10 +46,9 @@ as follows: These registered sections of memory are reserved by the first kernel during early boot. -- When a system crashes, the Power firmware will save - the low memory (boot memory of size larger of 5% of system RAM - or 256MB) of RAM to the previous registered region. It will - also save system registers, and hardware PTE's. +- When system crashes, the Power firmware will copy the registered + low memory regions (boot memory) from source to destination area. + It will also save hardware PTE's. NOTE: The term 'boot memory' means size of the low memory chunk @@ -61,9 +60,9 @@ as follows: the default calculated size. Use this option if default boot memory size is not sufficient for second kernel to boot successfully. For syntax of crashkernel= parameter, - refer to Documentation/admin-guide/kdump/kdump.rst. If any offset is - provided in crashkernel= parameter, it will be ignored - as FADump uses a predefined offset to reserve memory + refer to Documentation/admin-guide/kdump/kdump.rst. If any + offset is provided in crashkernel= parameter, it will be + ignored as FADump uses a predefined offset to reserve memory for boot memory dump preservation in case of a crash. - After the low memory (boot memory) area has been saved, the @@ -71,8 +70,9 @@ as follows: *not* clear the RAM. It will then launch the bootloader, as normal. -- The freshly booted kernel will notice that there is a new - node (ibm,dump-kernel) in the device tree, indicating that +- The freshly booted kernel will notice that there is a new node + (rtas/ibm,kernel-dump on pSeries or ibm,opal/dump/mpipl-boot + on OPAL platform) in the device tree, indicating that there is crash data available from a previous boot. During the early boot OS will reserve rest of the memory above boot memory size effectively booting with restricted memory @@ -95,8 +95,11 @@ as follows: # echo 1 > /sys/kernel/fadump_release_mem Please note that the firmware-assisted dump feature -is only available on Power6 and above systems with recent -firmware versions. +is only available on POWER6 and above systems on pSeries +(PowerVM) platform and POWER9 and above systems with OP940 +or later firmware versions on PowerNV (OPAL) platform. +Note that, OPAL firmware exports ibm,opal/dump node when +FADump is supported on PowerNV platform. Implementation details: ----------------------- @@ -111,57 +114,76 @@ that are run. If there is dump data, then the /sys/kernel/fadump_release_mem file is created, and the reserved memory is held. -If there is no waiting dump data, then only the memory required -to hold CPU state, HPTE region, boot memory dump and elfcore -header, is usually reserved at an offset greater than boot memory -size (see Fig. 1). This area is *not* released: this region will -be kept permanently reserved, so that it can act as a receptacle -for a copy of the boot memory content in addition to CPU state -and HPTE region, in the case a crash does occur. Since this reserved -memory area is used only after the system crash, there is no point in -blocking this significant chunk of memory from production kernel. -Hence, the implementation uses the Linux kernel's Contiguous Memory -Allocator (CMA) for memory reservation if CMA is configured for kernel. -With CMA reservation this memory will be available for applications to -use it, while kernel is prevented from using it. With this FADump will -still be able to capture all of the kernel memory and most of the user -space memory except the user pages that were present in CMA region:: +If there is no waiting dump data, then only the memory required to +hold CPU state, HPTE region, boot memory dump, FADump header and +elfcore header, is usually reserved at an offset greater than boot +memory size (see Fig. 1). This area is *not* released: this region +will be kept permanently reserved, so that it can act as a receptacle +for a copy of the boot memory content in addition to CPU state and +HPTE region, in the case a crash does occur. + +Since this reserved memory area is used only after the system crash, +there is no point in blocking this significant chunk of memory from +production kernel. Hence, the implementation uses the Linux kernel's +Contiguous Memory Allocator (CMA) for memory reservation if CMA is +configured for kernel. With CMA reservation this memory will be +available for applications to use it, while kernel is prevented from +using it. With this FADump will still be able to capture all of the +kernel memory and most of the user space memory except the user pages +that were present in CMA region:: o Memory Reservation during first kernel - Low memory Top of memory - 0 boot memory size |<--Reserved dump area --->| | - | | | (Permanent Reservation) | | - V V | | V - +-----------+----------/ /---+---+----+--------+---+----+------+ - | | |CPU|HPTE| DUMP |HDR|ELF | | - +-----------+----------/ /---+---+----+--------+---+----+------+ - | ^ ^ - | | | - \ / | - ----------------------------------- FADump Header - Boot memory content gets transferred (meta area) - to reserved area by firmware at the - time of crash + Low memory Top of memory + 0 boot memory size |<--- Reserved dump area --->| | + | | | Permanent Reservation | | + V V | | V + +-----------+-----/ /---+---+----+-------+-----+-----+----+--+ + | | |///|////| DUMP | HDR | ELF |////| | + +-----------+-----/ /---+---+----+-------+-----+-----+----+--+ + | ^ ^ ^ ^ ^ + | | | | | | + \ CPU HPTE / | | + ------------------------------ | | + Boot memory content gets transferred | | + to reserved area by firmware at the | | + time of crash. | | + FADump Header | + (meta area) | + | + | + Metadata: This area holds a metadata struture whose + address is registered with f/w and retrieved in the + second kernel after crash, on platforms that support + tags (OPAL). Having such structure with info needed + to process the crashdump eases dump capture process. Fig. 1 o Memory Reservation during second kernel after crash - Low memory Top of memory - 0 boot memory size | - | |<----------- Crash preserved area --------------->| - V V |<-- Reserved dump area -->| V - +-----------+----------/ /---+---+----+--------+---+----+------+ - | | |CPU|HPTE| DUMP |HDR|ELF | | - +-----------+----------/ /---+---+----+--------+---+----+------+ - | | - V V - Used by second /proc/vmcore + Low memory Top of memory + 0 boot memory size | + | |<------------ Crash preserved area ------------>| + V V |<--- Reserved dump area --->| | + +-----------+-----/ /---+---+----+-------+-----+-----+----+--+ + | | |///|////| DUMP | HDR | ELF |////| | + +-----------+-----/ /---+---+----+-------+-----+-----+----+--+ + | | + V V + Used by second /proc/vmcore kernel to boot + + +---+ + |///| -> Regions (CPU, HPTE & Metadata) marked like this in the above + +---+ figures are not always present. For example, OPAL platform + does not have CPU & HPTE regions while Metadata region is + not supported on pSeries currently. + Fig. 2 + Currently the dump will be copied from /proc/vmcore to a new file upon user intervention. The dump data available through /proc/vmcore will be in ELF format. Hence the existing kdump infrastructure (kdump scripts) -- cgit v1.2.3 From 579ca1a27675485a99da50cd7fedc14232f817c3 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:24:28 +0530 Subject: powerpc/fadump: make use of memblock's bottom up allocation mode Earlier, memblock_find_in_range() was not used to find the memory to be reserved for FADump as bottom up allocation mode was not supported. But since commit 79442ed189acb8b ("mm/memblock.c: introduce bottom-up allocation mode") bottom up allocation mode is supported for memblock. So, use it to find the memory to be reserved for FADump. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821364211.5656.14336025460336135194.stgit@hbathini.in.ibm.com --- arch/powerpc/kernel/fadump.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 852ac4761e90..da751402c649 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -342,7 +342,8 @@ static void __init fadump_reserve_crash_area(unsigned long base, int __init fadump_reserve_mem(void) { - u64 base, size, mem_boundary; + bool is_memblock_bottom_up = memblock_bottom_up(); + u64 base, size, mem_boundary, align = PAGE_SIZE; int ret = 1; if (!fw_dump.fadump_enabled) @@ -362,10 +363,11 @@ int __init fadump_reserve_mem(void) fw_dump.boot_memory_size = PAGE_ALIGN(fadump_calculate_reserve_size()); #ifdef CONFIG_CMA - if (!fw_dump.nocma) + if (!fw_dump.nocma) { + align = FADUMP_CMA_ALIGNMENT; fw_dump.boot_memory_size = - ALIGN(fw_dump.boot_memory_size, - FADUMP_CMA_ALIGNMENT); + ALIGN(fw_dump.boot_memory_size, align); + } #endif } @@ -419,19 +421,15 @@ int __init fadump_reserve_mem(void) } else { /* * Reserve memory at an offset closer to bottom of the RAM to - * minimize the impact of memory hot-remove operation. We can't - * use memblock_find_in_range() here since it doesn't allocate - * from bottom to top. + * minimize the impact of memory hot-remove operation. */ - while (base <= (mem_boundary - size)) { - if (memblock_is_region_memory(base, size) && - !memblock_is_region_reserved(base, size)) - break; + memblock_set_bottom_up(true); + base = memblock_find_in_range(base, mem_boundary, size, align); - base += size; - } + /* Restore the previous allocation mode */ + memblock_set_bottom_up(is_memblock_bottom_up); - if (base > (mem_boundary - size)) { + if (!base) { pr_err("Failed to find memory chunk for reservation!\n"); goto error_out; } -- cgit v1.2.3 From 5000a17afbd56b4e58b7f72ac77eabb92c418412 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:24:50 +0530 Subject: powerpc/fadump: process architected register state data provided by firmware Firmware provides architected register state data at the time of crash. Process this data and build CPU notes to append to ELF core. In case this data is missing or in unsupported format, at least append crashing CPU's register data, to have something to work with in the vmcore file. Signed-off-by: Hari Bathini Signed-off-by: Vasant Hegde Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821367702.5656.5546683836236508389.stgit@hbathini.in.ibm.com --- arch/powerpc/include/asm/fadump-internal.h | 4 + arch/powerpc/platforms/powernv/opal-fadump.c | 209 ++++++++++++++++++++++++++- arch/powerpc/platforms/powernv/opal-fadump.h | 41 ++++++ 3 files changed, 247 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index 0c2c4f58d365..2c72685dfff5 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -88,6 +88,10 @@ struct fw_dump { unsigned long reserve_bootvar; unsigned long cpu_state_data_size; + u64 cpu_state_dest_vaddr; + u32 cpu_state_data_version; + u32 cpu_state_entry_size; + unsigned long hpte_region_size; unsigned long boot_memory_size; diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c index ef6fb1fc66cb..ad4c18bd8809 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.c +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -21,6 +21,7 @@ #include "opal-fadump.h" static const struct opal_fadump_mem_struct *opal_fdm_active; +static const struct opal_mpipl_fadump *opal_cpu_metadata; static struct opal_fadump_mem_struct *opal_fdm; static int opal_fadump_unregister(struct fw_dump *fadump_conf); @@ -276,28 +277,207 @@ static void opal_fadump_cleanup(struct fw_dump *fadump_conf) pr_warn("Could not reset (%llu) kernel metadata tag!\n", ret); } +static inline void opal_fadump_set_regval_regnum(struct pt_regs *regs, + u32 reg_type, u32 reg_num, + u64 reg_val) +{ + if (reg_type == HDAT_FADUMP_REG_TYPE_GPR) { + if (reg_num < 32) + regs->gpr[reg_num] = reg_val; + return; + } + + switch (reg_num) { + case SPRN_CTR: + regs->ctr = reg_val; + break; + case SPRN_LR: + regs->link = reg_val; + break; + case SPRN_XER: + regs->xer = reg_val; + break; + case SPRN_DAR: + regs->dar = reg_val; + break; + case SPRN_DSISR: + regs->dsisr = reg_val; + break; + case HDAT_FADUMP_REG_ID_NIP: + regs->nip = reg_val; + break; + case HDAT_FADUMP_REG_ID_MSR: + regs->msr = reg_val; + break; + case HDAT_FADUMP_REG_ID_CCR: + regs->ccr = reg_val; + break; + } +} + +static inline void opal_fadump_read_regs(char *bufp, unsigned int regs_cnt, + unsigned int reg_entry_size, + struct pt_regs *regs) +{ + struct hdat_fadump_reg_entry *reg_entry; + int i; + + memset(regs, 0, sizeof(struct pt_regs)); + + for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) { + reg_entry = (struct hdat_fadump_reg_entry *)bufp; + opal_fadump_set_regval_regnum(regs, + be32_to_cpu(reg_entry->reg_type), + be32_to_cpu(reg_entry->reg_num), + be64_to_cpu(reg_entry->reg_val)); + } +} + +/* + * Verify if CPU state data is available. If available, do a bit of sanity + * checking before processing this data. + */ +static bool __init is_opal_fadump_cpu_data_valid(struct fw_dump *fadump_conf) +{ + if (!opal_cpu_metadata) + return false; + + fadump_conf->cpu_state_data_version = + be32_to_cpu(opal_cpu_metadata->cpu_data_version); + fadump_conf->cpu_state_entry_size = + be32_to_cpu(opal_cpu_metadata->cpu_data_size); + fadump_conf->cpu_state_dest_vaddr = + (u64)__va(be64_to_cpu(opal_cpu_metadata->region[0].dest)); + fadump_conf->cpu_state_data_size = + be64_to_cpu(opal_cpu_metadata->region[0].size); + + if (fadump_conf->cpu_state_data_version != HDAT_FADUMP_CPU_DATA_VER) { + pr_warn("Supported CPU state data version: %u, found: %d!\n", + HDAT_FADUMP_CPU_DATA_VER, + fadump_conf->cpu_state_data_version); + pr_warn("WARNING: F/W using newer CPU state data format!!\n"); + } + + if ((fadump_conf->cpu_state_dest_vaddr == 0) || + (fadump_conf->cpu_state_entry_size == 0) || + (fadump_conf->cpu_state_entry_size > + fadump_conf->cpu_state_data_size)) { + pr_err("CPU state data is invalid. Ignoring!\n"); + return false; + } + + return true; +} + /* * Convert CPU state data saved at the time of crash into ELF notes. * - * Append crashing CPU's register data saved by the kernel in the PT_NOTE. + * While the crashing CPU's register data is saved by the kernel, CPU state + * data for all CPUs is saved by f/w. In CPU state data provided by f/w, + * each register entry is of 16 bytes, a numerical identifier along with + * a GPR/SPR flag in the first 8 bytes and the register value in the next + * 8 bytes. For more details refer to F/W documentation. If this data is + * missing or in unsupported format, append crashing CPU's register data + * saved by the kernel in the PT_NOTE, to have something to work with in + * the vmcore file. */ static int __init opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf, struct fadump_crash_info_header *fdh) { + u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize; + struct hdat_fadump_thread_hdr *thdr; + bool is_cpu_data_valid = false; u32 num_cpus = 1, *note_buf; - int rc; - - if (fdh->crashing_cpu == FADUMP_CPU_UNKNOWN) - return -ENODEV; + struct pt_regs regs; + char *bufp; + int rc, i; + + if (is_opal_fadump_cpu_data_valid(fadump_conf)) { + size_per_thread = fadump_conf->cpu_state_entry_size; + num_cpus = (fadump_conf->cpu_state_data_size / size_per_thread); + bufp = __va(fadump_conf->cpu_state_dest_vaddr); + is_cpu_data_valid = true; + } - /* Allocate CPU notes buffer to hold crashing cpu notes. */ rc = fadump_setup_cpu_notes_buf(num_cpus); if (rc != 0) return rc; note_buf = (u32 *)fadump_conf->cpu_notes_buf_vaddr; - note_buf = fadump_regs_to_elf_notes(note_buf, &(fdh->regs)); + if (!is_cpu_data_valid) + goto out; + + /* + * Offset for register entries, entry size and registers count is + * duplicated in every thread header in keeping with HDAT format. + * Use these values from the first thread header. + */ + thdr = (struct hdat_fadump_thread_hdr *)bufp; + regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) + + be32_to_cpu(thdr->offset)); + reg_esize = be32_to_cpu(thdr->esize); + regs_cnt = be32_to_cpu(thdr->ecnt); + + pr_debug("--------CPU State Data------------\n"); + pr_debug("NumCpus : %u\n", num_cpus); + pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n", + regs_offset, reg_esize, regs_cnt); + + for (i = 0; i < num_cpus; i++, bufp += size_per_thread) { + thdr = (struct hdat_fadump_thread_hdr *)bufp; + + thread_pir = be32_to_cpu(thdr->pir); + pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n", + i, thread_pir, thdr->core_state); + + /* + * If this is kernel initiated crash, crashing_cpu would be set + * appropriately and register data of the crashing CPU saved by + * crashing kernel. Add this saved register data of crashing CPU + * to elf notes and populate the pt_regs for the remaining CPUs + * from register state data provided by firmware. + */ + if (fdh->crashing_cpu == thread_pir) { + note_buf = fadump_regs_to_elf_notes(note_buf, + &fdh->regs); + pr_debug("Crashing CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n", + fdh->crashing_cpu, fdh->regs.gpr[1], + fdh->regs.nip); + continue; + } + + /* + * Register state data of MAX cores is provided by firmware, + * but some of this cores may not be active. So, while + * processing register state data, check core state and + * skip threads that belong to inactive cores. + */ + if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE) + continue; + + opal_fadump_read_regs((bufp + regs_offset), regs_cnt, + reg_esize, ®s); + note_buf = fadump_regs_to_elf_notes(note_buf, ®s); + pr_debug("CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n", + thread_pir, regs.gpr[1], regs.nip); + } + +out: + /* + * CPU state data is invalid/unsupported. Try appending crashing CPU's + * register data, if it is saved by the kernel. + */ + if (fadump_conf->cpu_notes_buf_vaddr == (u64)note_buf) { + if (fdh->crashing_cpu == FADUMP_CPU_UNKNOWN) { + fadump_free_cpu_notes_buf(); + return -ENODEV; + } + + pr_warn("WARNING: appending only crashing CPU's register data\n"); + note_buf = fadump_regs_to_elf_notes(note_buf, &(fdh->regs)); + } + final_note(note_buf); pr_debug("Updating elfcore header (%llx) with cpu notes\n", @@ -374,6 +554,14 @@ static void opal_fadump_trigger(struct fadump_crash_info_header *fdh, { int rc; + /* + * Unlike on pSeries platform, logical CPU number is not provided + * with architected register state data. So, store the crashing + * CPU's PIR instead to plug the appropriate register data for + * crashing CPU in the vmcore file. + */ + fdh->crashing_cpu = (u32)mfspr(SPRN_PIR); + rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, msg); if (rc == OPAL_UNSUPPORTED) { pr_emerg("Reboot type %d not supported.\n", @@ -450,6 +638,13 @@ void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) return; } + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr); + if (addr) { + addr = be64_to_cpu(addr); + pr_debug("CPU metadata addr: %llx\n", addr); + opal_cpu_metadata = __va(addr); + } + pr_info("Firmware-assisted dump is active.\n"); fadump_conf->dump_active = 1; opal_fadump_get_config(fadump_conf, opal_fdm_active); diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h index 0b83d895485c..c0b5d6e42ba6 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.h +++ b/arch/powerpc/platforms/powernv/opal-fadump.h @@ -36,4 +36,45 @@ struct opal_fadump_mem_struct { struct opal_mpipl_region rgn[OPAL_FADUMP_MAX_MEM_REGS]; } __packed; +/* + * CPU state data + * + * CPU state data information is provided by f/w. The format for this data + * is defined in the HDAT spec. Version is used to keep a tab on the changes + * in this CPU state data format. Changes to this format are unlikely, but + * if there are any changes, please refer to latest HDAT specification. + */ +#define HDAT_FADUMP_CPU_DATA_VER 1 + +#define HDAT_FADUMP_CORE_INACTIVE (0x0F) + +/* HDAT thread header for register entries */ +struct hdat_fadump_thread_hdr { + __be32 pir; + /* 0x00 - 0x0F - The corresponding stop state of the core */ + u8 core_state; + u8 reserved[3]; + + __be32 offset; /* Offset to Register Entries array */ + __be32 ecnt; /* Number of entries */ + __be32 esize; /* Alloc size of each array entry in bytes */ + __be32 eactsz; /* Actual size of each array entry in bytes */ +} __packed; + +/* Register types populated by f/w */ +#define HDAT_FADUMP_REG_TYPE_GPR 0x01 +#define HDAT_FADUMP_REG_TYPE_SPR 0x02 + +/* ID numbers used by f/w while populating certain registers */ +#define HDAT_FADUMP_REG_ID_NIP 0x7D0 +#define HDAT_FADUMP_REG_ID_MSR 0x7D1 +#define HDAT_FADUMP_REG_ID_CCR 0x7D2 + +/* HDAT register entry. */ +struct hdat_fadump_reg_entry { + __be32 reg_type; + __be32 reg_num; + __be64 reg_val; +} __packed; + #endif /* _POWERNV_OPAL_FADUMP_H */ -- cgit v1.2.3 From e4fc48fb4d34f7e7d42eb980a9c130bb93aba3b9 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:25:05 +0530 Subject: powerpc/fadump: make crash memory ranges array allocation generic Make allocate_crash_memory_ranges() and free_crash_memory_ranges() functions generic to reuse them for memory management of all types of dynamic memory range arrays. This change helps in memory management of reserved ranges array to be added later. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821369863.5656.4375667005352155892.stgit@hbathini.in.ibm.com --- arch/powerpc/include/asm/fadump-internal.h | 15 +++- arch/powerpc/kernel/fadump.c | 113 +++++++++++++++-------------- 2 files changed, 72 insertions(+), 56 deletions(-) diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index 2c72685dfff5..0f2ce85ee7b9 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -72,9 +72,18 @@ struct fadump_crash_info_header { struct cpumask online_mask; }; -struct fad_crash_memory_ranges { - unsigned long long base; - unsigned long long size; +struct fadump_memory_range { + u64 base; + u64 size; +}; + +/* fadump memory ranges info */ +struct fadump_mrange_info { + char name[16]; + struct fadump_memory_range *mem_ranges; + u32 mem_ranges_sz; + u32 mem_range_cnt; + u32 max_mem_ranges; }; /* Platform specific callback functions */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index da751402c649..f95ec1fd797a 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -35,10 +35,7 @@ static struct fw_dump fw_dump; static DEFINE_MUTEX(fadump_mutex); -struct fad_crash_memory_ranges *crash_memory_ranges; -int crash_memory_ranges_size; -int crash_mem_ranges; -int max_crash_mem_ranges; +struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0 }; #ifdef CONFIG_CMA static struct cma *fadump_cma; @@ -629,46 +626,48 @@ void fadump_free_cpu_notes_buf(void) fw_dump.cpu_notes_buf_size = 0; } -static void free_crash_memory_ranges(void) +static void fadump_free_mem_ranges(struct fadump_mrange_info *mrange_info) { - kfree(crash_memory_ranges); - crash_memory_ranges = NULL; - crash_memory_ranges_size = 0; - max_crash_mem_ranges = 0; + kfree(mrange_info->mem_ranges); + mrange_info->mem_ranges = NULL; + mrange_info->mem_ranges_sz = 0; + mrange_info->max_mem_ranges = 0; } /* - * Allocate or reallocate crash memory ranges array in incremental units + * Allocate or reallocate mem_ranges array in incremental units * of PAGE_SIZE. */ -static int allocate_crash_memory_ranges(void) +static int fadump_alloc_mem_ranges(struct fadump_mrange_info *mrange_info) { - struct fad_crash_memory_ranges *new_array; + struct fadump_memory_range *new_array; u64 new_size; - new_size = crash_memory_ranges_size + PAGE_SIZE; - pr_debug("Allocating %llu bytes of memory for crash memory ranges\n", - new_size); + new_size = mrange_info->mem_ranges_sz + PAGE_SIZE; + pr_debug("Allocating %llu bytes of memory for %s memory ranges\n", + new_size, mrange_info->name); - new_array = krealloc(crash_memory_ranges, new_size, GFP_KERNEL); + new_array = krealloc(mrange_info->mem_ranges, new_size, GFP_KERNEL); if (new_array == NULL) { - pr_err("Insufficient memory for setting up crash memory ranges\n"); - free_crash_memory_ranges(); + pr_err("Insufficient memory for setting up %s memory ranges\n", + mrange_info->name); + fadump_free_mem_ranges(mrange_info); return -ENOMEM; } - crash_memory_ranges = new_array; - crash_memory_ranges_size = new_size; - max_crash_mem_ranges = (new_size / - sizeof(struct fad_crash_memory_ranges)); + mrange_info->mem_ranges = new_array; + mrange_info->mem_ranges_sz = new_size; + mrange_info->max_mem_ranges = (new_size / + sizeof(struct fadump_memory_range)); return 0; } -static inline int fadump_add_crash_memory(unsigned long long base, - unsigned long long end) +static inline int fadump_add_mem_range(struct fadump_mrange_info *mrange_info, + u64 base, u64 end) { - u64 start, size; + struct fadump_memory_range *mem_ranges = mrange_info->mem_ranges; bool is_adjacent = false; + u64 start, size; if (base == end) return 0; @@ -677,38 +676,41 @@ static inline int fadump_add_crash_memory(unsigned long long base, * Fold adjacent memory ranges to bring down the memory ranges/ * PT_LOAD segments count. */ - if (crash_mem_ranges) { - start = crash_memory_ranges[crash_mem_ranges - 1].base; - size = crash_memory_ranges[crash_mem_ranges - 1].size; + if (mrange_info->mem_range_cnt) { + start = mem_ranges[mrange_info->mem_range_cnt - 1].base; + size = mem_ranges[mrange_info->mem_range_cnt - 1].size; if ((start + size) == base) is_adjacent = true; } if (!is_adjacent) { /* resize the array on reaching the limit */ - if (crash_mem_ranges == max_crash_mem_ranges) { + if (mrange_info->mem_range_cnt == mrange_info->max_mem_ranges) { int ret; - ret = allocate_crash_memory_ranges(); + ret = fadump_alloc_mem_ranges(mrange_info); if (ret) return ret; + + /* Update to the new resized array */ + mem_ranges = mrange_info->mem_ranges; } start = base; - crash_memory_ranges[crash_mem_ranges].base = start; - crash_mem_ranges++; + mem_ranges[mrange_info->mem_range_cnt].base = start; + mrange_info->mem_range_cnt++; } - crash_memory_ranges[crash_mem_ranges - 1].size = (end - start); - pr_debug("crash_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n", - (crash_mem_ranges - 1), start, end - 1, (end - start)); + mem_ranges[mrange_info->mem_range_cnt - 1].size = (end - start); + pr_debug("%s_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n", + mrange_info->name, (mrange_info->mem_range_cnt - 1), + start, end - 1, (end - start)); return 0; } -static int fadump_exclude_reserved_area(unsigned long long start, - unsigned long long end) +static int fadump_exclude_reserved_area(u64 start, u64 end) { - unsigned long long ra_start, ra_end; + u64 ra_start, ra_end; int ret = 0; ra_start = fw_dump.reserve_dump_area_start; @@ -716,18 +718,22 @@ static int fadump_exclude_reserved_area(unsigned long long start, if ((ra_start < end) && (ra_end > start)) { if ((start < ra_start) && (end > ra_end)) { - ret = fadump_add_crash_memory(start, ra_start); + ret = fadump_add_mem_range(&crash_mrange_info, + start, ra_start); if (ret) return ret; - ret = fadump_add_crash_memory(ra_end, end); + ret = fadump_add_mem_range(&crash_mrange_info, + ra_end, end); } else if (start < ra_start) { - ret = fadump_add_crash_memory(start, ra_start); + ret = fadump_add_mem_range(&crash_mrange_info, + start, ra_start); } else if (ra_end < end) { - ret = fadump_add_crash_memory(ra_end, end); + ret = fadump_add_mem_range(&crash_mrange_info, + ra_end, end); } } else - ret = fadump_add_crash_memory(start, end); + ret = fadump_add_mem_range(&crash_mrange_info, start, end); return ret; } @@ -772,11 +778,11 @@ static int fadump_init_elfcore_header(char *bufp) static int fadump_setup_crash_memory_ranges(void) { struct memblock_region *reg; - unsigned long long start, end; + u64 start, end; int ret; pr_debug("Setup crash memory ranges.\n"); - crash_mem_ranges = 0; + crash_mrange_info.mem_range_cnt = 0; /* * add the first memory chunk (RMA_START through boot_memory_size) as @@ -785,13 +791,14 @@ static int fadump_setup_crash_memory_ranges(void) * specified during fadump registration. We need to create a separate * program header for this chunk with the correct offset. */ - ret = fadump_add_crash_memory(RMA_START, fw_dump.boot_memory_size); + ret = fadump_add_mem_range(&crash_mrange_info, + RMA_START, fw_dump.boot_memory_size); if (ret) return ret; for_each_memblock(memory, reg) { - start = (unsigned long long)reg->base; - end = start + (unsigned long long)reg->size; + start = (u64)reg->base; + end = start + (u64)reg->size; /* * skip the first memory chunk that is already added (RMA_START @@ -876,11 +883,11 @@ static int fadump_create_elfcore_headers(char *bufp) /* setup PT_LOAD sections. */ - for (i = 0; i < crash_mem_ranges; i++) { - unsigned long long mbase, msize; - mbase = crash_memory_ranges[i].base; - msize = crash_memory_ranges[i].size; + for (i = 0; i < crash_mrange_info.mem_range_cnt; i++) { + u64 mbase, msize; + mbase = crash_mrange_info.mem_ranges[i].base; + msize = crash_mrange_info.mem_ranges[i].size; if (!msize) continue; @@ -973,7 +980,7 @@ void fadump_cleanup(void) } else if (fw_dump.dump_registered) { /* Un-register Firmware-assisted dump if it was registered. */ fw_dump.ops->fadump_unregister(&fw_dump); - free_crash_memory_ranges(); + fadump_free_mem_ranges(&crash_mrange_info); } if (fw_dump.ops->fadump_cleanup) -- cgit v1.2.3 From dda9dbfeeb7a855a75965b8ba7269f4edb35cde7 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:25:36 +0530 Subject: powerpc/fadump: consider reserved ranges while releasing memory Commit 0962e8004e97 ("powerpc/prom: Scan reserved-ranges node for memory reservations") enabled support to parse 'reserved-ranges' DT node to reserve kernel memory falling in these ranges for firmware purposes. Along with the preserved area memory, ensure memory in reserved ranges is not overlapped with memory released by capture kernel aftering saving vmcore. Also, fix the off-by-one error in fadump_release_reserved_area function while releasing memory. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821371358.5656.6061214942558818661.stgit@hbathini.in.ibm.com --- arch/powerpc/kernel/fadump.c | 167 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 146 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index f95ec1fd797a..502e49ab4b98 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -36,6 +36,7 @@ static struct fw_dump fw_dump; static DEFINE_MUTEX(fadump_mutex); struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0 }; +struct fadump_mrange_info reserved_mrange_info = { "reserved", NULL, 0, 0, 0 }; #ifdef CONFIG_CMA static struct cma *fadump_cma; @@ -1009,49 +1010,173 @@ static void fadump_free_reserved_memory(unsigned long start_pfn, /* * Skip memory holes and free memory that was actually reserved. */ -static void fadump_release_reserved_area(unsigned long start, unsigned long end) +static void fadump_release_reserved_area(u64 start, u64 end) { + u64 tstart, tend, spfn, epfn; struct memblock_region *reg; - unsigned long tstart, tend; - unsigned long start_pfn = PHYS_PFN(start); - unsigned long end_pfn = PHYS_PFN(end); + spfn = PHYS_PFN(start); + epfn = PHYS_PFN(end); for_each_memblock(memory, reg) { - tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); - tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); + tstart = max_t(u64, spfn, memblock_region_memory_base_pfn(reg)); + tend = min_t(u64, epfn, memblock_region_memory_end_pfn(reg)); if (tstart < tend) { fadump_free_reserved_memory(tstart, tend); - if (tend == end_pfn) + if (tend == epfn) break; - start_pfn = tend + 1; + spfn = tend; } } } /* - * Release the memory that was reserved in early boot to preserve the memory - * contents. The released memory will be available for general use. + * Sort the mem ranges in-place and merge adjacent ranges + * to minimize the memory ranges count. */ -static void fadump_release_memory(unsigned long begin, unsigned long end) +static void sort_and_merge_mem_ranges(struct fadump_mrange_info *mrange_info) { - unsigned long ra_start, ra_end; + struct fadump_memory_range *mem_ranges; + struct fadump_memory_range tmp_range; + u64 base, size; + int i, j, idx; + + if (!reserved_mrange_info.mem_range_cnt) + return; + + /* Sort the memory ranges */ + mem_ranges = mrange_info->mem_ranges; + for (i = 0; i < mrange_info->mem_range_cnt; i++) { + idx = i; + for (j = (i + 1); j < mrange_info->mem_range_cnt; j++) { + if (mem_ranges[idx].base > mem_ranges[j].base) + idx = j; + } + if (idx != i) { + tmp_range = mem_ranges[idx]; + mem_ranges[idx] = mem_ranges[i]; + mem_ranges[i] = tmp_range; + } + } + + /* Merge adjacent reserved ranges */ + idx = 0; + for (i = 1; i < mrange_info->mem_range_cnt; i++) { + base = mem_ranges[i-1].base; + size = mem_ranges[i-1].size; + if (mem_ranges[i].base == (base + size)) + mem_ranges[idx].size += mem_ranges[i].size; + else { + idx++; + if (i == idx) + continue; + + mem_ranges[idx] = mem_ranges[i]; + } + } + mrange_info->mem_range_cnt = idx + 1; +} + +/* + * Scan reserved-ranges to consider them while reserving/releasing + * memory for FADump. + */ +static inline int fadump_scan_reserved_mem_ranges(void) +{ + struct device_node *root; + const __be32 *prop; + int len, ret = -1; + unsigned long i; + + root = of_find_node_by_path("/"); + if (!root) + return ret; + + prop = of_get_property(root, "reserved-ranges", &len); + if (!prop) + return ret; + + /* + * Each reserved range is an (address,size) pair, 2 cells each, + * totalling 4 cells per range. + */ + for (i = 0; i < len / (sizeof(*prop) * 4); i++) { + u64 base, size; + + base = of_read_number(prop + (i * 4) + 0, 2); + size = of_read_number(prop + (i * 4) + 2, 2); + + if (size) { + ret = fadump_add_mem_range(&reserved_mrange_info, + base, base + size); + if (ret < 0) { + pr_warn("some reserved ranges are ignored!\n"); + break; + } + } + } + + return ret; +} + +/* + * Release the memory that was reserved during early boot to preserve the + * crash'ed kernel's memory contents except reserved dump area (permanent + * reservation) and reserved ranges used by F/W. The released memory will + * be available for general use. + */ +static void fadump_release_memory(u64 begin, u64 end) +{ + u64 ra_start, ra_end, tstart; + int i, ret; + + fadump_scan_reserved_mem_ranges(); ra_start = fw_dump.reserve_dump_area_start; ra_end = ra_start + fw_dump.reserve_dump_area_size; /* - * exclude the dump reserve area. Will reuse it for next - * fadump registration. + * Add reserved dump area to reserved ranges list + * and exclude all these ranges while releasing memory. */ - if (begin < ra_end && end > ra_start) { - if (begin < ra_start) - fadump_release_reserved_area(begin, ra_start); - if (end > ra_end) - fadump_release_reserved_area(ra_end, end); - } else - fadump_release_reserved_area(begin, end); + ret = fadump_add_mem_range(&reserved_mrange_info, ra_start, ra_end); + if (ret != 0) { + /* + * Not enough memory to setup reserved ranges but the system is + * running shortage of memory. So, release all the memory except + * Reserved dump area (reused for next fadump registration). + */ + if (begin < ra_end && end > ra_start) { + if (begin < ra_start) + fadump_release_reserved_area(begin, ra_start); + if (end > ra_end) + fadump_release_reserved_area(ra_end, end); + } else + fadump_release_reserved_area(begin, end); + + return; + } + + /* Get the reserved ranges list in order first. */ + sort_and_merge_mem_ranges(&reserved_mrange_info); + + /* Exclude reserved ranges and release remaining memory */ + tstart = begin; + for (i = 0; i < reserved_mrange_info.mem_range_cnt; i++) { + ra_start = reserved_mrange_info.mem_ranges[i].base; + ra_end = ra_start + reserved_mrange_info.mem_ranges[i].size; + + if (tstart >= ra_end) + continue; + + if (tstart < ra_start) + fadump_release_reserved_area(tstart, ra_start); + tstart = ra_end; + } + + if (tstart < end) + fadump_release_reserved_area(tstart, end); } static void fadump_invalidate_release_mem(void) -- cgit v1.2.3 From b2a815a554a34f0e6fab4526ae762d5528783600 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:25:49 +0530 Subject: powerpc/fadump: improve how crashed kernel's memory is reserved The size parameter to fadump_reserve_crash_area() function is not needed as all the memory above boot memory size must be preserved anyway. Update the function by dropping this redundant parameter. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821374440.5656.2945512543806951766.stgit@hbathini.in.ibm.com --- arch/powerpc/kernel/fadump.c | 53 ++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 502e49ab4b98..6f52a60bc212 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -34,6 +34,8 @@ static struct fw_dump fw_dump; +static void __init fadump_reserve_crash_area(u64 base); + static DEFINE_MUTEX(fadump_mutex); struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0 }; struct fadump_mrange_info reserved_mrange_info = { "reserved", NULL, 0, 0, 0 }; @@ -318,26 +320,6 @@ static unsigned long get_fadump_area_size(void) return size; } -static void __init fadump_reserve_crash_area(unsigned long base, - unsigned long size) -{ - struct memblock_region *reg; - unsigned long mstart, mend, msize; - - for_each_memblock(memory, reg) { - mstart = max_t(unsigned long, base, reg->base); - mend = reg->base + reg->size; - mend = min(base + size, mend); - - if (mstart < mend) { - msize = mend - mstart; - memblock_reserve(mstart, msize); - pr_info("Reserved %ldMB of memory at %#016lx for saving crash dump\n", - (msize >> 20), mstart); - } - } -} - int __init fadump_reserve_mem(void) { bool is_memblock_bottom_up = memblock_bottom_up(); @@ -406,12 +388,11 @@ int __init fadump_reserve_mem(void) #endif /* * If last boot has crashed then reserve all the memory - * above boot_memory_size so that we don't touch it until + * above boot memory size so that we don't touch it until * dump is written to disk by userspace tool. This memory - * will be released for general use once the dump is saved. + * can be released for general use by invalidating fadump. */ - size = mem_boundary - base; - fadump_reserve_crash_area(base, size); + fadump_reserve_crash_area(base); pr_debug("fadumphdr_addr = %#016lx\n", fw_dump.fadumphdr_addr); pr_debug("Reserve dump area start address: 0x%lx\n", @@ -1377,3 +1358,27 @@ int __init setup_fadump(void) return 1; } subsys_initcall(setup_fadump); + +/* Preserve everything above the base address */ +static void __init fadump_reserve_crash_area(u64 base) +{ + struct memblock_region *reg; + u64 mstart, msize; + + for_each_memblock(memory, reg) { + mstart = reg->base; + msize = reg->size; + + if ((mstart + msize) < base) + continue; + + if (mstart < base) { + msize -= (base - mstart); + mstart = base; + } + + pr_info("Reserving %lluMB of memory at %#016llx for preserving crash data", + (msize >> 20), mstart); + memblock_reserve(mstart, msize); + } +} -- cgit v1.2.3 From bec53196adf4791d466adf0e339b61186c7b5283 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:26:03 +0530 Subject: powerpc/fadump: add support to preserve crash data on FADUMP disabled kernel Add a new kernel config option, CONFIG_PRESERVE_FA_DUMP that ensures that crash data, from previously crash'ed kernel, is preserved. This helps in cases where FADump is not enabled but the subsequent memory preserving kernel boot is likely to process this crash data. One typical usecase for this config option is petitboot kernel. As OPAL allows registering address with it in the first kernel and retrieving it after MPIPL, use it to store the top of boot memory. A kernel that intends to preserve crash data retrieves it and avoids using memory beyond this address. Move arch_reserved_kernel_pages() function as it is needed for both FA_DUMP and PRESERVE_FA_DUMP configurations. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821375751.5656.11459483669542541602.stgit@hbathini.in.ibm.com --- arch/powerpc/Kconfig | 9 ++++ arch/powerpc/include/asm/fadump-internal.h | 11 +++++ arch/powerpc/include/asm/fadump.h | 9 ++-- arch/powerpc/kernel/Makefile | 4 +- arch/powerpc/kernel/fadump.c | 44 +++++++++++++++-- arch/powerpc/kernel/prom.c | 4 +- arch/powerpc/platforms/powernv/Makefile | 1 + arch/powerpc/platforms/powernv/opal-fadump.c | 73 ++++++++++++++++++++++++++++ 8 files changed, 144 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 375ff893fa60..62c483133953 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -583,6 +583,15 @@ config FA_DUMP If unsure, say "y". Only special kernels like petitboot may need to say "N" here. +config PRESERVE_FA_DUMP + bool "Preserve Firmware-assisted dump" + depends on PPC64 && PPC_POWERNV && !FA_DUMP + help + On a kernel with FA_DUMP disabled, this option helps to preserve + crash data from a previously crash'ed kernel. Useful when the next + memory preserving kernel boot would process this crash data. + Petitboot kernel is the typical usecase for this option. + config IRQ_ALL_CPUS bool "Distribute interrupts on all CPUs by default" depends on SMP diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index 0f2ce85ee7b9..6c2868d38bd3 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -9,6 +9,7 @@ #ifndef _ASM_POWERPC_FADUMP_INTERNAL_H #define _ASM_POWERPC_FADUMP_INTERNAL_H +#ifndef CONFIG_PRESERVE_FA_DUMP /* * The RMA region will be saved for later dumping when kernel crashes. * RMA is Real Mode Area, the first block of logical memory address owned @@ -146,6 +147,16 @@ void fadump_update_elfcore_header(char *bufp); bool is_fadump_boot_mem_contiguous(void); bool is_fadump_reserved_mem_contiguous(void); +#else /* !CONFIG_PRESERVE_FA_DUMP */ + +/* Firmware-assisted dump configuration details. */ +struct fw_dump { + u64 boot_mem_top; + u64 dump_active; +}; + +#endif /* CONFIG_PRESERVE_FA_DUMP */ + #ifdef CONFIG_PPC_PSERIES extern void rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node); #else diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index fd5002b4c18c..526a6a647312 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -14,9 +14,6 @@ extern int crashing_cpu; extern int is_fadump_memory_area(u64 addr, ulong size); -extern int early_init_dt_scan_fw_dump(unsigned long node, - const char *uname, int depth, void *data); -extern int fadump_reserve_mem(void); extern int setup_fadump(void); extern int is_fadump_active(void); extern int should_fadump_crash(void); @@ -29,4 +26,10 @@ static inline int should_fadump_crash(void) { return 0; } static inline void crash_fadump(struct pt_regs *regs, const char *str) { } static inline void fadump_cleanup(void) { } #endif /* !CONFIG_FA_DUMP */ + +#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP) +extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname, + int depth, void *data); +extern int fadump_reserve_mem(void); +#endif #endif /* _ASM_POWERPC_FADUMP_H */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 66c54443187d..21ab769e8530 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -79,7 +79,9 @@ obj-$(CONFIG_EEH) += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \ eeh_driver.o eeh_event.o eeh_sysfs.o obj-$(CONFIG_GENERIC_TBSYNC) += smp-tbsync.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o -obj-$(CONFIG_FA_DUMP) += fadump.o +ifneq ($(CONFIG_FA_DUMP)$(CONFIG_PRESERVE_FA_DUMP),) +obj-y += fadump.o +endif ifdef CONFIG_PPC32 obj-$(CONFIG_E500) += idle_e500.o endif diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 6f52a60bc212..645d9d4d9332 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -36,6 +36,7 @@ static struct fw_dump fw_dump; static void __init fadump_reserve_crash_area(u64 base); +#ifndef CONFIG_PRESERVE_FA_DUMP static DEFINE_MUTEX(fadump_mutex); struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0 }; struct fadump_mrange_info reserved_mrange_info = { "reserved", NULL, 0, 0, 0 }; @@ -439,11 +440,6 @@ error_out: return 0; } -unsigned long __init arch_reserved_kernel_pages(void) -{ - return memblock_reserved_size() / PAGE_SIZE; -} - /* Look for fadump= cmdline option. */ static int __init early_fadump_param(char *p) { @@ -1358,6 +1354,39 @@ int __init setup_fadump(void) return 1; } subsys_initcall(setup_fadump); +#else /* !CONFIG_PRESERVE_FA_DUMP */ + +/* Scan the Firmware Assisted dump configuration details. */ +int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname, + int depth, void *data) +{ + if ((depth != 1) || (strcmp(uname, "ibm,opal") != 0)) + return 0; + + opal_fadump_dt_scan(&fw_dump, node); + return 1; +} + +/* + * When dump is active but PRESERVE_FA_DUMP is enabled on the kernel, + * preserve crash data. The subsequent memory preserving kernel boot + * is likely to process this crash data. + */ +int __init fadump_reserve_mem(void) +{ + if (fw_dump.dump_active) { + /* + * If last boot has crashed then reserve all the memory + * above boot memory to preserve crash data. + */ + pr_info("Preserving crash data for processing in next boot.\n"); + fadump_reserve_crash_area(fw_dump.boot_mem_top); + } else + pr_debug("FADump-aware kernel..\n"); + + return 1; +} +#endif /* CONFIG_PRESERVE_FA_DUMP */ /* Preserve everything above the base address */ static void __init fadump_reserve_crash_area(u64 base) @@ -1382,3 +1411,8 @@ static void __init fadump_reserve_crash_area(u64 base) memblock_reserve(mstart, msize); } } + +unsigned long __init arch_reserved_kernel_pages(void) +{ + return memblock_reserved_size() / PAGE_SIZE; +} diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 5828f1c81dc9..6620f37abe73 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -708,7 +708,7 @@ void __init early_init_devtree(void *params) of_scan_flat_dt(early_init_dt_scan_ultravisor, NULL); #endif -#ifdef CONFIG_FA_DUMP +#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP) /* scan tree to see if dump is active during last boot */ of_scan_flat_dt(early_init_dt_scan_fw_dump, NULL); #endif @@ -735,7 +735,7 @@ void __init early_init_devtree(void *params) if (PHYSICAL_START > MEMORY_START) memblock_reserve(MEMORY_START, 0x8000); reserve_kdump_trampoline(); -#ifdef CONFIG_FA_DUMP +#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP) /* * If we fail to reserve memory for firmware-assisted dump then * fallback to kexec based kdump. diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 3e6d6d6d1ec1..2d13e6462339 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -8,6 +8,7 @@ obj-y += ultravisor.o obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o obj-$(CONFIG_FA_DUMP) += opal-fadump.o +obj-$(CONFIG_PRESERVE_FA_DUMP) += opal-fadump.o obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o obj-$(CONFIG_CXL_BASE) += pci-cxl.o obj-$(CONFIG_EEH) += eeh-powernv.o diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c index ad4c18bd8809..c66fa96915ae 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.c +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -20,6 +20,67 @@ #include "opal-fadump.h" + +#ifdef CONFIG_PRESERVE_FA_DUMP +/* + * When dump is active but PRESERVE_FA_DUMP is enabled on the kernel, + * ensure crash data is preserved in hope that the subsequent memory + * preserving kernel boot is going to process this crash data. + */ +void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) +{ + const struct opal_fadump_mem_struct *opal_fdm_active; + const __be32 *prop; + unsigned long dn; + u64 addr = 0; + s64 ret; + + dn = of_get_flat_dt_subnode_by_name(node, "dump"); + if (dn == -FDT_ERR_NOTFOUND) + return; + + /* + * Check if dump has been initiated on last reboot. + */ + prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL); + if (!prop) + return; + + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_debug("Could not get Kernel metadata (%lld)\n", ret); + return; + } + + /* + * Preserve memory only if kernel memory regions are registered + * with f/w for MPIPL. + */ + addr = be64_to_cpu(addr); + pr_debug("Kernel metadata addr: %llx\n", addr); + opal_fdm_active = (void *)addr; + if (opal_fdm_active->registered_regions == 0) + return; + + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_BOOT_MEM, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_err("Failed to get boot memory tag (%lld)\n", ret); + return; + } + + /* + * Memory below this address can be used for booting a + * capture kernel or petitboot kernel. Preserve everything + * above this address for processing crashdump. + */ + fadump_conf->boot_mem_top = be64_to_cpu(addr); + pr_debug("Preserve everything above %llx\n", fadump_conf->boot_mem_top); + + pr_info("Firmware-assisted dump is active.\n"); + fadump_conf->dump_active = 1; +} + +#else /* CONFIG_PRESERVE_FA_DUMP */ static const struct opal_fadump_mem_struct *opal_fdm_active; static const struct opal_mpipl_fadump *opal_cpu_metadata; static struct opal_fadump_mem_struct *opal_fdm; @@ -183,6 +244,17 @@ static int opal_fadump_setup_metadata(struct fw_dump *fadump_conf) err = -EPERM; } + /* + * Register boot memory top address with f/w. Should be retrieved + * by a kernel that intends to preserve crash'ed kernel's memory. + */ + ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_BOOT_MEM, + fadump_conf->boot_memory_size); + if (ret != OPAL_SUCCESS) { + pr_err("Failed to set boot memory tag!\n"); + err = -EPERM; + } + return err; } @@ -649,3 +721,4 @@ void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) fadump_conf->dump_active = 1; opal_fadump_get_config(fadump_conf, opal_fdm_active); } +#endif /* !CONFIG_PRESERVE_FA_DUMP */ -- cgit v1.2.3 From 58cf055df47bb115e7c1533eec2a8f66639b80e3 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:26:16 +0530 Subject: powerpc/fadump: update documentation about CONFIG_PRESERVE_FA_DUMP Kernel config option CONFIG_PRESERVE_FA_DUMP is introduced to ensure crash data, from previously crash'ed kernel, is preserved. Update documentation with this details. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821377195.5656.15840065629705958324.stgit@hbathini.in.ibm.com --- Documentation/powerpc/firmware-assisted-dump.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/powerpc/firmware-assisted-dump.rst b/Documentation/powerpc/firmware-assisted-dump.rst index ab578ac64c09..08a9b5efcdc8 100644 --- a/Documentation/powerpc/firmware-assisted-dump.rst +++ b/Documentation/powerpc/firmware-assisted-dump.rst @@ -101,6 +101,15 @@ or later firmware versions on PowerNV (OPAL) platform. Note that, OPAL firmware exports ibm,opal/dump node when FADump is supported on PowerNV platform. +On OPAL based machines, system first boots into an intermittent +kernel (referred to as petitboot kernel) before booting into the +capture kernel. This kernel would have minimal kernel and/or +userspace support to process crash data. Such kernel needs to +preserve previously crash'ed kernel's memory for the subsequent +capture kernel boot to process this crash data. Kernel config +option CONFIG_PRESERVE_FA_DUMP has to be enabled on such kernel +to ensure that crash data is preserved to process later. + Implementation details: ----------------------- -- cgit v1.2.3 From 6f713d18144ce86c9f01cdf64222d6339e26129e Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:26:33 +0530 Subject: powerpc/opalcore: export /sys/firmware/opal/core for analysing opal crashes Export /sys/firmware/opal/core file to analyze opal crashes. Since OPAL core can be generated independent of CONFIG_FA_DUMP support in kernel, add this support under a new kernel config option CONFIG_OPAL_CORE. Also, avoid code duplication by moving common code used while exporting /proc/vmcore and/or /sys/firmware/opal/core file(s). Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821378503.5656.3693769384945087756.stgit@hbathini.in.ibm.com --- arch/powerpc/Kconfig | 9 + arch/powerpc/platforms/powernv/Makefile | 1 + arch/powerpc/platforms/powernv/opal-core.c | 598 +++++++++++++++++++++++++++ arch/powerpc/platforms/powernv/opal-fadump.c | 74 +--- arch/powerpc/platforms/powernv/opal-fadump.h | 62 +++ 5 files changed, 687 insertions(+), 57 deletions(-) create mode 100644 arch/powerpc/platforms/powernv/opal-core.c diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 62c483133953..eea6d5095f71 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -592,6 +592,15 @@ config PRESERVE_FA_DUMP memory preserving kernel boot would process this crash data. Petitboot kernel is the typical usecase for this option. +config OPAL_CORE + bool "Export OPAL memory as /sys/firmware/opal/core" + depends on PPC64 && PPC_POWERNV + help + This option uses the MPIPL support in firmware to provide an + ELF core of OPAL memory after a crash. The ELF core is exported + as /sys/firmware/opal/core file which is helpful in debugging + OPAL crashes using GDB. + config IRQ_ALL_CPUS bool "Distribute interrupts on all CPUs by default" depends on SMP diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 2d13e6462339..a3ac9646119d 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -9,6 +9,7 @@ obj-y += ultravisor.o obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o obj-$(CONFIG_FA_DUMP) += opal-fadump.o obj-$(CONFIG_PRESERVE_FA_DUMP) += opal-fadump.o +obj-$(CONFIG_OPAL_CORE) += opal-core.o obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o obj-$(CONFIG_CXL_BASE) += pci-cxl.o obj-$(CONFIG_EEH) += eeh-powernv.o diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c new file mode 100644 index 000000000000..6fa6b8a6d510 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-core.c @@ -0,0 +1,598 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Interface for exporting the OPAL ELF core. + * Heavily inspired from fs/proc/vmcore.c + * + * Copyright 2019, Hari Bathini, IBM Corporation. + */ + +#define pr_fmt(fmt) "opal core: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "opal-fadump.h" + +#define MAX_PT_LOAD_CNT 8 + +/* NT_AUXV note related info */ +#define AUXV_CNT 1 +#define AUXV_DESC_SZ (((2 * AUXV_CNT) + 1) * sizeof(Elf64_Off)) + +struct opalcore_config { + u32 num_cpus; + /* PIR value of crashing CPU */ + u32 crashing_cpu; + + /* CPU state data info from F/W */ + u64 cpu_state_destination_vaddr; + u64 cpu_state_data_size; + u64 cpu_state_entry_size; + + /* OPAL memory to be exported as PT_LOAD segments */ + u64 ptload_addr[MAX_PT_LOAD_CNT]; + u64 ptload_size[MAX_PT_LOAD_CNT]; + u64 ptload_cnt; + + /* Pointer to the first PT_LOAD in the ELF core file */ + Elf64_Phdr *ptload_phdr; + + /* Total size of opalcore file. */ + size_t opalcore_size; + + /* Buffer for all the ELF core headers and the PT_NOTE */ + size_t opalcorebuf_sz; + char *opalcorebuf; + + /* NT_AUXV buffer */ + char auxv_buf[AUXV_DESC_SZ]; +}; + +struct opalcore { + struct list_head list; + u64 paddr; + size_t size; + loff_t offset; +}; + +static LIST_HEAD(opalcore_list); +static struct opalcore_config *oc_conf; +static const struct opal_mpipl_fadump *opalc_metadata; +static const struct opal_mpipl_fadump *opalc_cpu_metadata; + +/* + * Set crashing CPU's signal to SIGUSR1. if the kernel is triggered + * by kernel, SIGTERM otherwise. + */ +bool kernel_initiated; + +static struct opalcore * __init get_new_element(void) +{ + return kzalloc(sizeof(struct opalcore), GFP_KERNEL); +} + +static inline int is_opalcore_usable(void) +{ + return (oc_conf && oc_conf->opalcorebuf != NULL) ? 1 : 0; +} + +static Elf64_Word *append_elf64_note(Elf64_Word *buf, char *name, + u32 type, void *data, + size_t data_len) +{ + Elf64_Nhdr *note = (Elf64_Nhdr *)buf; + Elf64_Word namesz = strlen(name) + 1; + + note->n_namesz = cpu_to_be32(namesz); + note->n_descsz = cpu_to_be32(data_len); + note->n_type = cpu_to_be32(type); + buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf64_Word)); + memcpy(buf, name, namesz); + buf += DIV_ROUND_UP(namesz, sizeof(Elf64_Word)); + memcpy(buf, data, data_len); + buf += DIV_ROUND_UP(data_len, sizeof(Elf64_Word)); + + return buf; +} + +static void fill_prstatus(struct elf_prstatus *prstatus, int pir, + struct pt_regs *regs) +{ + memset(prstatus, 0, sizeof(struct elf_prstatus)); + elf_core_copy_kernel_regs(&(prstatus->pr_reg), regs); + + /* + * Overload PID with PIR value. + * As a PIR value could also be '0', add an offset of '100' + * to every PIR to avoid misinterpretations in GDB. + */ + prstatus->pr_pid = cpu_to_be32(100 + pir); + prstatus->pr_ppid = cpu_to_be32(1); + + /* + * Indicate SIGUSR1 for crash initiated from kernel. + * SIGTERM otherwise. + */ + if (pir == oc_conf->crashing_cpu) { + short sig; + + sig = kernel_initiated ? SIGUSR1 : SIGTERM; + prstatus->pr_cursig = cpu_to_be16(sig); + } +} + +static Elf64_Word *auxv_to_elf64_notes(Elf64_Word *buf, + u64 opal_boot_entry) +{ + Elf64_Off *bufp = (Elf64_Off *)oc_conf->auxv_buf; + int idx = 0; + + memset(bufp, 0, AUXV_DESC_SZ); + + /* Entry point of OPAL */ + bufp[idx++] = cpu_to_be64(AT_ENTRY); + bufp[idx++] = cpu_to_be64(opal_boot_entry); + + /* end of vector */ + bufp[idx++] = cpu_to_be64(AT_NULL); + + buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_AUXV, + oc_conf->auxv_buf, AUXV_DESC_SZ); + return buf; +} + +/* + * Read from the ELF header and then the crash dump. + * Returns number of bytes read on success, -errno on failure. + */ +static ssize_t read_opalcore(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, char *to, + loff_t pos, size_t count) +{ + struct opalcore *m; + ssize_t tsz, avail; + loff_t tpos = pos; + + if (pos >= oc_conf->opalcore_size) + return 0; + + /* Adjust count if it goes beyond opalcore size */ + avail = oc_conf->opalcore_size - pos; + if (count > avail) + count = avail; + + if (count == 0) + return 0; + + /* Read ELF core header and/or PT_NOTE segment */ + if (tpos < oc_conf->opalcorebuf_sz) { + tsz = min_t(size_t, oc_conf->opalcorebuf_sz - tpos, count); + memcpy(to, oc_conf->opalcorebuf + tpos, tsz); + to += tsz; + tpos += tsz; + count -= tsz; + } + + list_for_each_entry(m, &opalcore_list, list) { + /* nothing more to read here */ + if (count == 0) + break; + + if (tpos < m->offset + m->size) { + void *addr; + + tsz = min_t(size_t, m->offset + m->size - tpos, count); + addr = (void *)(m->paddr + tpos - m->offset); + memcpy(to, __va(addr), tsz); + to += tsz; + tpos += tsz; + count -= tsz; + } + } + + return (tpos - pos); +} + +static struct bin_attribute opal_core_attr = { + .attr = {.name = "core", .mode = 0400}, + .read = read_opalcore +}; + +/* + * Read CPU state dump data and convert it into ELF notes. + * + * Each register entry is of 16 bytes, A numerical identifier along with + * a GPR/SPR flag in the first 8 bytes and the register value in the next + * 8 bytes. For more details refer to F/W documentation. + */ +static Elf64_Word * __init opalcore_append_cpu_notes(Elf64_Word *buf) +{ + u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize; + struct hdat_fadump_thread_hdr *thdr; + struct elf_prstatus prstatus; + Elf64_Word *first_cpu_note; + struct pt_regs regs; + char *bufp; + int i; + + size_per_thread = oc_conf->cpu_state_entry_size; + bufp = __va(oc_conf->cpu_state_destination_vaddr); + + /* + * Offset for register entries, entry size and registers count is + * duplicated in every thread header in keeping with HDAT format. + * Use these values from the first thread header. + */ + thdr = (struct hdat_fadump_thread_hdr *)bufp; + regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) + + be32_to_cpu(thdr->offset)); + reg_esize = be32_to_cpu(thdr->esize); + regs_cnt = be32_to_cpu(thdr->ecnt); + + pr_debug("--------CPU State Data------------\n"); + pr_debug("NumCpus : %u\n", oc_conf->num_cpus); + pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n", + regs_offset, reg_esize, regs_cnt); + + /* + * Skip past the first CPU note. Fill this note with the + * crashing CPU's prstatus. + */ + first_cpu_note = buf; + buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); + + for (i = 0; i < oc_conf->num_cpus; i++, bufp += size_per_thread) { + thdr = (struct hdat_fadump_thread_hdr *)bufp; + thread_pir = be32_to_cpu(thdr->pir); + + pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n", + i, thread_pir, thdr->core_state); + + /* + * Register state data of MAX cores is provided by firmware, + * but some of this cores may not be active. So, while + * processing register state data, check core state and + * skip threads that belong to inactive cores. + */ + if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE) + continue; + + opal_fadump_read_regs((bufp + regs_offset), regs_cnt, + reg_esize, false, ®s); + + pr_debug("PIR 0x%x - R1 : 0x%llx, NIP : 0x%llx\n", thread_pir, + be64_to_cpu(regs.gpr[1]), be64_to_cpu(regs.nip)); + fill_prstatus(&prstatus, thread_pir, ®s); + + if (thread_pir != oc_conf->crashing_cpu) { + buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, + NT_PRSTATUS, &prstatus, + sizeof(prstatus)); + } else { + /* + * Add crashing CPU as the first NT_PRSTATUS note for + * GDB to process the core file appropriately. + */ + append_elf64_note(first_cpu_note, CRASH_CORE_NOTE_NAME, + NT_PRSTATUS, &prstatus, + sizeof(prstatus)); + } + } + + return buf; +} + +static int __init create_opalcore(void) +{ + u64 opal_boot_entry, opal_base_addr, paddr; + u32 hdr_size, cpu_notes_size, count; + struct device_node *dn; + struct opalcore *new; + loff_t opalcore_off; + struct page *page; + Elf64_Phdr *phdr; + Elf64_Ehdr *elf; + int i, ret; + char *bufp; + + /* Get size of header & CPU notes for OPAL core */ + hdr_size = (sizeof(Elf64_Ehdr) + + ((oc_conf->ptload_cnt + 1) * sizeof(Elf64_Phdr))); + cpu_notes_size = ((oc_conf->num_cpus * (CRASH_CORE_NOTE_HEAD_BYTES + + CRASH_CORE_NOTE_NAME_BYTES + + CRASH_CORE_NOTE_DESC_BYTES)) + + (CRASH_CORE_NOTE_HEAD_BYTES + + CRASH_CORE_NOTE_NAME_BYTES + AUXV_DESC_SZ)); + + /* Allocate buffer to setup OPAL core */ + oc_conf->opalcorebuf_sz = PAGE_ALIGN(hdr_size + cpu_notes_size); + oc_conf->opalcorebuf = alloc_pages_exact(oc_conf->opalcorebuf_sz, + GFP_KERNEL | __GFP_ZERO); + if (!oc_conf->opalcorebuf) { + pr_err("Not enough memory to setup OPAL core (size: %lu)\n", + oc_conf->opalcorebuf_sz); + oc_conf->opalcorebuf_sz = 0; + return -ENOMEM; + } + count = oc_conf->opalcorebuf_sz / PAGE_SIZE; + page = virt_to_page(oc_conf->opalcorebuf); + for (i = 0; i < count; i++) + mark_page_reserved(page + i); + + pr_debug("opalcorebuf = 0x%llx\n", (u64)oc_conf->opalcorebuf); + + /* Read OPAL related device-tree entries */ + dn = of_find_node_by_name(NULL, "ibm,opal"); + if (dn) { + ret = of_property_read_u64(dn, "opal-base-address", + &opal_base_addr); + pr_debug("opal-base-address: %llx\n", opal_base_addr); + ret |= of_property_read_u64(dn, "opal-boot-address", + &opal_boot_entry); + pr_debug("opal-boot-address: %llx\n", opal_boot_entry); + } + if (!dn || ret) + pr_warn("WARNING: Failed to read OPAL base & entry values\n"); + + /* Use count to keep track of the program headers */ + count = 0; + + bufp = oc_conf->opalcorebuf; + elf = (Elf64_Ehdr *)bufp; + bufp += sizeof(Elf64_Ehdr); + memcpy(elf->e_ident, ELFMAG, SELFMAG); + elf->e_ident[EI_CLASS] = ELF_CLASS; + elf->e_ident[EI_DATA] = ELFDATA2MSB; + elf->e_ident[EI_VERSION] = EV_CURRENT; + elf->e_ident[EI_OSABI] = ELF_OSABI; + memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); + elf->e_type = cpu_to_be16(ET_CORE); + elf->e_machine = cpu_to_be16(ELF_ARCH); + elf->e_version = cpu_to_be32(EV_CURRENT); + elf->e_entry = 0; + elf->e_phoff = cpu_to_be64(sizeof(Elf64_Ehdr)); + elf->e_shoff = 0; + elf->e_flags = 0; + + elf->e_ehsize = cpu_to_be16(sizeof(Elf64_Ehdr)); + elf->e_phentsize = cpu_to_be16(sizeof(Elf64_Phdr)); + elf->e_phnum = 0; + elf->e_shentsize = 0; + elf->e_shnum = 0; + elf->e_shstrndx = 0; + + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = cpu_to_be32(PT_NOTE); + phdr->p_flags = 0; + phdr->p_align = 0; + phdr->p_paddr = phdr->p_vaddr = 0; + phdr->p_offset = cpu_to_be64(hdr_size); + phdr->p_filesz = phdr->p_memsz = cpu_to_be64(cpu_notes_size); + count++; + + opalcore_off = oc_conf->opalcorebuf_sz; + oc_conf->ptload_phdr = (Elf64_Phdr *)bufp; + paddr = 0; + for (i = 0; i < oc_conf->ptload_cnt; i++) { + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = cpu_to_be32(PT_LOAD); + phdr->p_flags = cpu_to_be32(PF_R|PF_W|PF_X); + phdr->p_align = 0; + + new = get_new_element(); + if (!new) + return -ENOMEM; + new->paddr = oc_conf->ptload_addr[i]; + new->size = oc_conf->ptload_size[i]; + new->offset = opalcore_off; + list_add_tail(&new->list, &opalcore_list); + + phdr->p_paddr = cpu_to_be64(paddr); + phdr->p_vaddr = cpu_to_be64(opal_base_addr + paddr); + phdr->p_filesz = phdr->p_memsz = + cpu_to_be64(oc_conf->ptload_size[i]); + phdr->p_offset = cpu_to_be64(opalcore_off); + + count++; + opalcore_off += oc_conf->ptload_size[i]; + paddr += oc_conf->ptload_size[i]; + } + + elf->e_phnum = cpu_to_be16(count); + + bufp = (char *)opalcore_append_cpu_notes((Elf64_Word *)bufp); + bufp = (char *)auxv_to_elf64_notes((Elf64_Word *)bufp, opal_boot_entry); + + oc_conf->opalcore_size = opalcore_off; + return 0; +} + +static void opalcore_cleanup(void) +{ + if (oc_conf == NULL) + return; + + /* Remove OPAL core sysfs file */ + sysfs_remove_bin_file(opal_kobj, &opal_core_attr); + oc_conf->ptload_phdr = NULL; + oc_conf->ptload_cnt = 0; + + /* free the buffer used for setting up OPAL core */ + if (oc_conf->opalcorebuf) { + void *end = (void *)((u64)oc_conf->opalcorebuf + + oc_conf->opalcorebuf_sz); + + free_reserved_area(oc_conf->opalcorebuf, end, -1, NULL); + oc_conf->opalcorebuf = NULL; + oc_conf->opalcorebuf_sz = 0; + } + + kfree(oc_conf); + oc_conf = NULL; +} +__exitcall(opalcore_cleanup); + +static void __init opalcore_config_init(void) +{ + u32 idx, cpu_data_version; + struct device_node *np; + const __be32 *prop; + u64 addr = 0; + int i, ret; + + np = of_find_node_by_path("/ibm,opal/dump"); + if (np == NULL) + return; + + if (!of_device_is_compatible(np, "ibm,opal-dump")) { + pr_warn("Support missing for this f/w version!\n"); + return; + } + + /* Check if dump has been initiated on last reboot */ + prop = of_get_property(np, "mpipl-boot", NULL); + if (!prop) { + of_node_put(np); + return; + } + + /* Get OPAL metadata */ + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_OPAL, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_err("Failed to get OPAL metadata (%d)\n", ret); + goto error_out; + } + + addr = be64_to_cpu(addr); + pr_debug("OPAL metadata addr: %llx\n", addr); + opalc_metadata = __va(addr); + + /* Get OPAL CPU metadata */ + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_err("Failed to get OPAL CPU metadata (%d)\n", ret); + goto error_out; + } + + addr = be64_to_cpu(addr); + pr_debug("CPU metadata addr: %llx\n", addr); + opalc_cpu_metadata = __va(addr); + + /* Allocate memory for config buffer */ + oc_conf = kzalloc(sizeof(struct opalcore_config), GFP_KERNEL); + if (oc_conf == NULL) + goto error_out; + + /* Parse OPAL metadata */ + if (opalc_metadata->version != OPAL_MPIPL_VERSION) { + pr_warn("Supported OPAL metadata version: %u, found: %u!\n", + OPAL_MPIPL_VERSION, opalc_metadata->version); + pr_warn("WARNING: F/W using newer OPAL metadata format!!\n"); + } + + oc_conf->ptload_cnt = 0; + idx = be32_to_cpu(opalc_metadata->region_cnt); + if (idx > MAX_PT_LOAD_CNT) { + pr_warn("WARNING: OPAL regions count (%d) adjusted to limit (%d)", + MAX_PT_LOAD_CNT, idx); + idx = MAX_PT_LOAD_CNT; + } + for (i = 0; i < idx; i++) { + oc_conf->ptload_addr[oc_conf->ptload_cnt] = + be64_to_cpu(opalc_metadata->region[i].dest); + oc_conf->ptload_size[oc_conf->ptload_cnt++] = + be64_to_cpu(opalc_metadata->region[i].size); + } + oc_conf->ptload_cnt = i; + oc_conf->crashing_cpu = be32_to_cpu(opalc_metadata->crashing_pir); + + if (!oc_conf->ptload_cnt) { + pr_err("OPAL memory regions not found\n"); + goto error_out; + } + + /* Parse OPAL CPU metadata */ + cpu_data_version = be32_to_cpu(opalc_cpu_metadata->cpu_data_version); + if (cpu_data_version != HDAT_FADUMP_CPU_DATA_VER) { + pr_warn("Supported CPU data version: %u, found: %u!\n", + HDAT_FADUMP_CPU_DATA_VER, cpu_data_version); + pr_warn("WARNING: F/W using newer CPU state data format!!\n"); + } + + addr = be64_to_cpu(opalc_cpu_metadata->region[0].dest); + if (!addr) { + pr_err("CPU state data not found!\n"); + goto error_out; + } + oc_conf->cpu_state_destination_vaddr = (u64)__va(addr); + + oc_conf->cpu_state_data_size = + be64_to_cpu(opalc_cpu_metadata->region[0].size); + oc_conf->cpu_state_entry_size = + be32_to_cpu(opalc_cpu_metadata->cpu_data_size); + + if ((oc_conf->cpu_state_entry_size == 0) || + (oc_conf->cpu_state_entry_size > oc_conf->cpu_state_data_size)) { + pr_err("CPU state data is invalid.\n"); + goto error_out; + } + oc_conf->num_cpus = (oc_conf->cpu_state_data_size / + oc_conf->cpu_state_entry_size); + + of_node_put(np); + return; + +error_out: + pr_err("Could not export /sys/firmware/opal/core\n"); + opalcore_cleanup(); + of_node_put(np); +} + +static int __init opalcore_init(void) +{ + int rc = -1; + + opalcore_config_init(); + + if (oc_conf == NULL) + return rc; + + create_opalcore(); + + /* + * If oc_conf->opalcorebuf= is set in the 2nd kernel, + * then capture the dump. + */ + if (!(is_opalcore_usable())) { + pr_err("Failed to export /sys/firmware/opal/core\n"); + opalcore_cleanup(); + return rc; + } + + /* Set OPAL core file size */ + opal_core_attr.size = oc_conf->opalcore_size; + + /* Export OPAL core sysfs file */ + rc = sysfs_create_bin_file(opal_kobj, &opal_core_attr); + if (rc != 0) { + pr_err("Failed to export /sys/firmware/opal/core\n"); + opalcore_cleanup(); + return rc; + } + + return 0; +} +fs_initcall(opalcore_init); diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c index c66fa96915ae..a89d3a0cccae 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.c +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -85,6 +85,10 @@ static const struct opal_fadump_mem_struct *opal_fdm_active; static const struct opal_mpipl_fadump *opal_cpu_metadata; static struct opal_fadump_mem_struct *opal_fdm; +#ifdef CONFIG_OPAL_CORE +extern bool kernel_initiated; +#endif + static int opal_fadump_unregister(struct fw_dump *fadump_conf); static void opal_fadump_update_config(struct fw_dump *fadump_conf, @@ -349,62 +353,6 @@ static void opal_fadump_cleanup(struct fw_dump *fadump_conf) pr_warn("Could not reset (%llu) kernel metadata tag!\n", ret); } -static inline void opal_fadump_set_regval_regnum(struct pt_regs *regs, - u32 reg_type, u32 reg_num, - u64 reg_val) -{ - if (reg_type == HDAT_FADUMP_REG_TYPE_GPR) { - if (reg_num < 32) - regs->gpr[reg_num] = reg_val; - return; - } - - switch (reg_num) { - case SPRN_CTR: - regs->ctr = reg_val; - break; - case SPRN_LR: - regs->link = reg_val; - break; - case SPRN_XER: - regs->xer = reg_val; - break; - case SPRN_DAR: - regs->dar = reg_val; - break; - case SPRN_DSISR: - regs->dsisr = reg_val; - break; - case HDAT_FADUMP_REG_ID_NIP: - regs->nip = reg_val; - break; - case HDAT_FADUMP_REG_ID_MSR: - regs->msr = reg_val; - break; - case HDAT_FADUMP_REG_ID_CCR: - regs->ccr = reg_val; - break; - } -} - -static inline void opal_fadump_read_regs(char *bufp, unsigned int regs_cnt, - unsigned int reg_entry_size, - struct pt_regs *regs) -{ - struct hdat_fadump_reg_entry *reg_entry; - int i; - - memset(regs, 0, sizeof(struct pt_regs)); - - for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) { - reg_entry = (struct hdat_fadump_reg_entry *)bufp; - opal_fadump_set_regval_regnum(regs, - be32_to_cpu(reg_entry->reg_type), - be32_to_cpu(reg_entry->reg_num), - be64_to_cpu(reg_entry->reg_val)); - } -} - /* * Verify if CPU state data is available. If available, do a bit of sanity * checking before processing this data. @@ -529,7 +477,7 @@ opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf, continue; opal_fadump_read_regs((bufp + regs_offset), regs_cnt, - reg_esize, ®s); + reg_esize, true, ®s); note_buf = fadump_regs_to_elf_notes(note_buf, ®s); pr_debug("CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n", thread_pir, regs.gpr[1], regs.nip); @@ -573,6 +521,18 @@ static int __init opal_fadump_process(struct fw_dump *fadump_conf) return rc; } +#ifdef CONFIG_OPAL_CORE + /* + * If this is a kernel initiated crash, crashing_cpu would be set + * appropriately and register data of the crashing CPU saved by + * crashing kernel. Add this saved register data of crashing CPU + * to elf notes and populate the pt_regs for the remaining CPUs + * from register state data provided by firmware. + */ + if (fdh->crashing_cpu != FADUMP_CPU_UNKNOWN) + kernel_initiated = true; +#endif + rc = opal_fadump_build_cpu_notes(fadump_conf, fdh); if (rc) return rc; diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h index c0b5d6e42ba6..d0e17d6a2b5c 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.h +++ b/arch/powerpc/platforms/powernv/opal-fadump.h @@ -8,6 +8,8 @@ #ifndef _POWERNV_OPAL_FADUMP_H #define _POWERNV_OPAL_FADUMP_H +#include + /* * OPAL FADump metadata structure format version * @@ -77,4 +79,64 @@ struct hdat_fadump_reg_entry { __be64 reg_val; } __packed; +static inline void opal_fadump_set_regval_regnum(struct pt_regs *regs, + u32 reg_type, u32 reg_num, + u64 reg_val) +{ + if (reg_type == HDAT_FADUMP_REG_TYPE_GPR) { + if (reg_num < 32) + regs->gpr[reg_num] = reg_val; + return; + } + + switch (reg_num) { + case SPRN_CTR: + regs->ctr = reg_val; + break; + case SPRN_LR: + regs->link = reg_val; + break; + case SPRN_XER: + regs->xer = reg_val; + break; + case SPRN_DAR: + regs->dar = reg_val; + break; + case SPRN_DSISR: + regs->dsisr = reg_val; + break; + case HDAT_FADUMP_REG_ID_NIP: + regs->nip = reg_val; + break; + case HDAT_FADUMP_REG_ID_MSR: + regs->msr = reg_val; + break; + case HDAT_FADUMP_REG_ID_CCR: + regs->ccr = reg_val; + break; + } +} + +static inline void opal_fadump_read_regs(char *bufp, unsigned int regs_cnt, + unsigned int reg_entry_size, + bool cpu_endian, + struct pt_regs *regs) +{ + struct hdat_fadump_reg_entry *reg_entry; + u64 val; + int i; + + memset(regs, 0, sizeof(struct pt_regs)); + + for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) { + reg_entry = (struct hdat_fadump_reg_entry *)bufp; + val = (cpu_endian ? be64_to_cpu(reg_entry->reg_val) : + reg_entry->reg_val); + opal_fadump_set_regval_regnum(regs, + be32_to_cpu(reg_entry->reg_type), + be32_to_cpu(reg_entry->reg_num), + val); + } +} + #endif /* _POWERNV_OPAL_FADUMP_H */ -- cgit v1.2.3 From 845426f3f3443c3f60737751b7af0142e3020f08 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:26:45 +0530 Subject: powerpc/opalcore: provide an option to invalidate /sys/firmware/opal/core file Writing '1' to /sys/kernel/fadump_release_opalcore would release the memory held by kernel in exporting /sys/firmware/opal/core file. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821380161.5656.17827032108471421830.stgit@hbathini.in.ibm.com --- arch/powerpc/platforms/powernv/opal-core.c | 38 ++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c index 6fa6b8a6d510..ed895d82c048 100644 --- a/arch/powerpc/platforms/powernv/opal-core.c +++ b/arch/powerpc/platforms/powernv/opal-core.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include #include @@ -561,6 +563,36 @@ error_out: of_node_put(np); } +static ssize_t fadump_release_opalcore_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int input = -1; + + if (kstrtoint(buf, 0, &input)) + return -EINVAL; + + if (input == 1) { + if (oc_conf == NULL) { + pr_err("'/sys/firmware/opal/core' file not accessible!\n"); + return -EPERM; + } + + /* + * Take away '/sys/firmware/opal/core' and release all memory + * used for exporting this file. + */ + opalcore_cleanup(); + } else + return -EINVAL; + + return count; +} + +static struct kobj_attribute opalcore_rel_attr = __ATTR(fadump_release_opalcore, + 0200, NULL, + fadump_release_opalcore_store); + static int __init opalcore_init(void) { int rc = -1; @@ -593,6 +625,12 @@ static int __init opalcore_init(void) return rc; } + rc = sysfs_create_file(kernel_kobj, &opalcore_rel_attr.attr); + if (rc) { + pr_warn("unable to create sysfs file fadump_release_opalcore (%d)\n", + rc); + } + return 0; } fs_initcall(opalcore_init); -- cgit v1.2.3 From 7b1b3b48250acbfd7f15ba950d4654b7f02a8300 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:26:59 +0530 Subject: powerpc/fadump: consider f/w load area OPAL loads kernel & initrd at 512MB offset (256MB size), also exported as ibm,opal/dump/fw-load-area. So, if boot memory size of FADump is less than 768MB, kernel memory to be exported as '/proc/vmcore' would be overwritten by f/w while loading kernel & initrd. To avoid such a scenario, enforce a minimum boot memory size of 768MB on OPAL platform and skip using FADump if a newer F/W version loads kernel & initrd above 768MB. Also, irrespective of RMA size, set the minimum boot memory size expected on pseries platform at 320MB. This is to avoid inflating the minimum memory requirements on systems with 512M/1024M RMA size. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821381414.5656.1592867278535469652.stgit@hbathini.in.ibm.com --- arch/powerpc/include/asm/fadump-internal.h | 11 +--------- arch/powerpc/kernel/fadump.c | 16 +++++++++++---- arch/powerpc/platforms/powernv/opal-fadump.c | 30 +++++++++++++++++++++++++++- arch/powerpc/platforms/powernv/opal-fadump.h | 7 +++++++ arch/powerpc/platforms/pseries/rtas-fadump.c | 6 ++++++ arch/powerpc/platforms/pseries/rtas-fadump.h | 9 +++++++++ 6 files changed, 64 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index 6c2868d38bd3..a669aaa7c610 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -19,16 +19,6 @@ #define RMA_START 0x0 #define RMA_END (ppc64_rma_size) -/* - * On some Power systems where RMO is 128MB, it still requires minimum of - * 256MB for kernel to boot successfully. When kdump infrastructure is - * configured to save vmcore over network, we run into OOM issue while - * loading modules related to network setup. Hence we need additional 64M - * of memory to avoid OOM issue. - */ -#define MIN_BOOT_MEM (((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \ - + (0x1UL << 26)) - /* The upper limit percentage for user specified boot memory size (25%) */ #define MAX_BOOT_MEM_RATIO 4 @@ -128,6 +118,7 @@ struct fadump_ops { u64 (*fadump_init_mem_struct)(struct fw_dump *fadump_conf); u64 (*fadump_get_metadata_size)(void); int (*fadump_setup_metadata)(struct fw_dump *fadump_conf); + u64 (*fadump_get_bootmem_min)(void); int (*fadump_register)(struct fw_dump *fadump_conf); int (*fadump_unregister)(struct fw_dump *fadump_conf); int (*fadump_invalidate)(struct fw_dump *fadump_conf); diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 645d9d4d9332..bd49b1f200bf 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -240,10 +240,10 @@ static void fadump_show_config(void) * that is required for a kernel to boot successfully. * */ -static inline unsigned long fadump_calculate_reserve_size(void) +static inline u64 fadump_calculate_reserve_size(void) { + u64 base, size, bootmem_min; int ret; - unsigned long long base, size; if (fw_dump.reserve_bootvar) pr_warn("'fadump_reserve_mem=' parameter is deprecated in favor of 'crashkernel=' parameter.\n"); @@ -293,7 +293,8 @@ static inline unsigned long fadump_calculate_reserve_size(void) if (memory_limit && size > memory_limit) size = memory_limit; - return (size > MIN_BOOT_MEM ? size : MIN_BOOT_MEM); + bootmem_min = fw_dump.ops->fadump_get_bootmem_min(); + return (size > bootmem_min ? size : bootmem_min); } /* @@ -323,8 +324,8 @@ static unsigned long get_fadump_area_size(void) int __init fadump_reserve_mem(void) { + u64 base, size, mem_boundary, bootmem_min, align = PAGE_SIZE; bool is_memblock_bottom_up = memblock_bottom_up(); - u64 base, size, mem_boundary, align = PAGE_SIZE; int ret = 1; if (!fw_dump.fadump_enabled) @@ -350,6 +351,13 @@ int __init fadump_reserve_mem(void) ALIGN(fw_dump.boot_memory_size, align); } #endif + + bootmem_min = fw_dump.ops->fadump_get_bootmem_min(); + if (fw_dump.boot_memory_size < bootmem_min) { + pr_err("Can't enable fadump with boot memory size (0x%lx) less than 0x%llx\n", + fw_dump.boot_memory_size, bootmem_min); + goto error_out; + } } /* diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c index a89d3a0cccae..006648e4d5e6 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.c +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -262,6 +263,11 @@ static int opal_fadump_setup_metadata(struct fw_dump *fadump_conf) return err; } +static u64 opal_fadump_get_bootmem_min(void) +{ + return OPAL_FADUMP_MIN_BOOT_MEM; +} + static int opal_fadump_register(struct fw_dump *fadump_conf) { s64 rc = OPAL_PARAMETER; @@ -606,6 +612,7 @@ static struct fadump_ops opal_fadump_ops = { .fadump_init_mem_struct = opal_fadump_init_mem_struct, .fadump_get_metadata_size = opal_fadump_get_metadata_size, .fadump_setup_metadata = opal_fadump_setup_metadata, + .fadump_get_bootmem_min = opal_fadump_get_bootmem_min, .fadump_register = opal_fadump_register, .fadump_unregister = opal_fadump_unregister, .fadump_invalidate = opal_fadump_invalidate, @@ -620,9 +627,9 @@ void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) const __be32 *prop; unsigned long dn; u64 addr = 0; + int i, len; s64 ret; - /* * Check if Firmware-Assisted Dump is supported. if yes, check * if dump has been initiated on last reboot. @@ -638,6 +645,27 @@ void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) return; } + prop = of_get_flat_dt_prop(dn, "fw-load-area", &len); + if (prop) { + /* + * Each f/w load area is an (address,size) pair, + * 2 cells each, totalling 4 cells per range. + */ + for (i = 0; i < len / (sizeof(*prop) * 4); i++) { + u64 base, end; + + base = of_read_number(prop + (i * 4) + 0, 2); + end = base; + end += of_read_number(prop + (i * 4) + 2, 2); + if (end > OPAL_FADUMP_MIN_BOOT_MEM) { + pr_err("F/W load area: 0x%llx-0x%llx\n", + base, end); + pr_err("F/W version not supported!\n"); + return; + } + } + } + fadump_conf->ops = &opal_fadump_ops; fadump_conf->fadump_supported = 1; diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h index d0e17d6a2b5c..e630cb0f108f 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.h +++ b/arch/powerpc/platforms/powernv/opal-fadump.h @@ -10,6 +10,13 @@ #include +/* + * With kernel & initrd loaded at 512MB (with 256MB size), enforce a minimum + * boot memory size of 768MB to ensure f/w loading kernel and initrd doesn't + * mess with crash'ed kernel's memory during MPIPL. + */ +#define OPAL_FADUMP_MIN_BOOT_MEM (0x30000000UL) + /* * OPAL FADump metadata structure format version * diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c index bf1e8fc549b5..1d8f2871fa34 100644 --- a/arch/powerpc/platforms/pseries/rtas-fadump.c +++ b/arch/powerpc/platforms/pseries/rtas-fadump.c @@ -116,6 +116,11 @@ static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf) return addr; } +static u64 rtas_fadump_get_bootmem_min(void) +{ + return RTAS_FADUMP_MIN_BOOT_MEM; +} + static int rtas_fadump_register(struct fw_dump *fadump_conf) { unsigned int wait_time; @@ -467,6 +472,7 @@ static void rtas_fadump_trigger(struct fadump_crash_info_header *fdh, static struct fadump_ops rtas_fadump_ops = { .fadump_init_mem_struct = rtas_fadump_init_mem_struct, + .fadump_get_bootmem_min = rtas_fadump_get_bootmem_min, .fadump_register = rtas_fadump_register, .fadump_unregister = rtas_fadump_unregister, .fadump_invalidate = rtas_fadump_invalidate, diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.h b/arch/powerpc/platforms/pseries/rtas-fadump.h index 531f3f3e42b3..6602ff69e10d 100644 --- a/arch/powerpc/platforms/pseries/rtas-fadump.h +++ b/arch/powerpc/platforms/pseries/rtas-fadump.h @@ -9,6 +9,15 @@ #ifndef _PSERIES_RTAS_FADUMP_H #define _PSERIES_RTAS_FADUMP_H +/* + * On some Power systems where RMO is 128MB, it still requires minimum of + * 256MB for kernel to boot successfully. When kdump infrastructure is + * configured to save vmcore over network, we run into OOM issue while + * loading modules related to network setup. Hence we need additional 64M + * of memory to avoid OOM issue. + */ +#define RTAS_FADUMP_MIN_BOOT_MEM ((0x1UL << 28) + (0x1UL << 26)) + /* Firmware provided dump sections */ #define RTAS_FADUMP_CPU_STATE_DATA 0x0001 #define RTAS_FADUMP_HPTE_REGION 0x0002 -- cgit v1.2.3 From b3bba79d5a7ba27bf7e31e124afbcb386128f2c6 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:27:12 +0530 Subject: powerpc/fadump: update documentation about option to release opalcore With /sys/firmware/opal/core support available on OPAL based machines and an option to the release memory used by kernel in exporting this core file, update FADump documentation with these details. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821382786.5656.13173494907671241231.stgit@hbathini.in.ibm.com --- Documentation/powerpc/firmware-assisted-dump.rst | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/Documentation/powerpc/firmware-assisted-dump.rst b/Documentation/powerpc/firmware-assisted-dump.rst index 08a9b5efcdc8..0455a78486d5 100644 --- a/Documentation/powerpc/firmware-assisted-dump.rst +++ b/Documentation/powerpc/firmware-assisted-dump.rst @@ -110,6 +110,16 @@ capture kernel boot to process this crash data. Kernel config option CONFIG_PRESERVE_FA_DUMP has to be enabled on such kernel to ensure that crash data is preserved to process later. +-- On OPAL based machines (PowerNV), if the kernel is build with + CONFIG_OPAL_CORE=y, OPAL memory at the time of crash is also + exported as /sys/firmware/opal/core file. This procfs file is + helpful in debugging OPAL crashes with GDB. The kernel memory + used for exporting this procfs file can be released by echo'ing + '1' to /sys/kernel/fadump_release_opalcore node. + + e.g. + # echo 1 > /sys/kernel/fadump_release_opalcore + Implementation details: ----------------------- @@ -273,6 +283,15 @@ Here is the list of files under kernel sysfs: enhanced to use this interface to release the memory reserved for dump and continue without 2nd reboot. + /sys/kernel/fadump_release_opalcore + + This file is available only on OPAL based machines when FADump is + active during capture kernel. This is used to release the memory + used by the kernel to export /sys/firmware/opal/core file. To + release this memory, echo '1' to it: + + echo 1 > /sys/kernel/fadump_release_opalcore + Here is the list of files under powerpc debugfs: (Assuming debugfs is mounted on /sys/kernel/debug directory.) -- cgit v1.2.3 From becd91d9c5467160984a0380df72fdf71fee82f6 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:27:26 +0530 Subject: powerpc/fadump: remove RMA_START and RMA_END macros RMA_START is defined as '0' and there is even a BUILD_BUG_ON() to make sure it is never anything else. Remove this macro and use '0' instead as code change is needed anyway when it has to be something else. Also, remove unused RMA_END macro. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821384096.5656.15026984053970204652.stgit@hbathini.in.ibm.com --- arch/powerpc/include/asm/fadump-internal.h | 8 ------- arch/powerpc/kernel/fadump.c | 32 +++++++++++++++------------- arch/powerpc/platforms/pseries/rtas-fadump.c | 2 +- 3 files changed, 18 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index a669aaa7c610..aec1515ac8f8 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -10,14 +10,6 @@ #define _ASM_POWERPC_FADUMP_INTERNAL_H #ifndef CONFIG_PRESERVE_FA_DUMP -/* - * The RMA region will be saved for later dumping when kernel crashes. - * RMA is Real Mode Area, the first block of logical memory address owned - * by logical partition, containing the storage that may be accessed with - * translate off. - */ -#define RMA_START 0x0 -#define RMA_END (ppc64_rma_size) /* The upper limit percentage for user specified boot memory size (25%) */ #define MAX_BOOT_MEM_RATIO 4 diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index bd49b1f200bf..2e139259474d 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -128,18 +128,22 @@ int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname, * If fadump is registered, check if the memory provided * falls within boot memory area and reserved memory area. */ -int is_fadump_memory_area(u64 addr, ulong size) +int is_fadump_memory_area(u64 addr, unsigned long size) { - u64 d_start = fw_dump.reserve_dump_area_start; - u64 d_end = d_start + fw_dump.reserve_dump_area_size; + u64 d_start, d_end; if (!fw_dump.dump_registered) return 0; + if (!size) + return 0; + + d_start = fw_dump.reserve_dump_area_start; + d_end = d_start + fw_dump.reserve_dump_area_size; if (((addr + size) > d_start) && (addr <= d_end)) return 1; - return (addr + size) > RMA_START && addr <= fw_dump.boot_memory_size; + return (addr <= fw_dump.boot_memory_size); } int should_fadump_crash(void) @@ -771,14 +775,14 @@ static int fadump_setup_crash_memory_ranges(void) crash_mrange_info.mem_range_cnt = 0; /* - * add the first memory chunk (RMA_START through boot_memory_size) as + * add the first memory chunk (0 through boot_memory_size) as * a separate memory chunk. The reason is, at the time crash firmware * will move the content of this memory chunk to different location * specified during fadump registration. We need to create a separate * program header for this chunk with the correct offset. */ ret = fadump_add_mem_range(&crash_mrange_info, - RMA_START, fw_dump.boot_memory_size); + 0, fw_dump.boot_memory_size); if (ret) return ret; @@ -787,11 +791,9 @@ static int fadump_setup_crash_memory_ranges(void) end = start + (u64)reg->size; /* - * skip the first memory chunk that is already added (RMA_START - * through boot_memory_size). This logic needs a relook if and - * when RMA_START changes to a non-zero value. + * skip the first memory chunk that is already added + * (0 through boot_memory_size). */ - BUILD_BUG_ON(RMA_START != 0); if (start < fw_dump.boot_memory_size) { if (end > fw_dump.boot_memory_size) start = fw_dump.boot_memory_size; @@ -815,7 +817,7 @@ static int fadump_setup_crash_memory_ranges(void) */ static inline unsigned long fadump_relocate(unsigned long paddr) { - if (paddr > RMA_START && paddr < fw_dump.boot_memory_size) + if ((paddr > 0) && (paddr < fw_dump.boot_memory_size)) return fw_dump.boot_mem_dest_addr + paddr; else return paddr; @@ -883,11 +885,11 @@ static int fadump_create_elfcore_headers(char *bufp) phdr->p_flags = PF_R|PF_W|PF_X; phdr->p_offset = mbase; - if (mbase == RMA_START) { + if (mbase == 0) { /* - * The entire RMA region will be moved by firmware - * to the specified destination_address. Hence set - * the correct offset. + * The entire real memory region will be moved by + * firmware to the specified destination_address. + * Hence set the correct offset. */ phdr->p_offset = fw_dump.boot_mem_dest_addr; } diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c index 1d8f2871fa34..a525180c1f0f 100644 --- a/arch/powerpc/platforms/pseries/rtas-fadump.c +++ b/arch/powerpc/platforms/pseries/rtas-fadump.c @@ -106,7 +106,7 @@ static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf) fdm.rmr_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); fdm.rmr_region.source_data_type = cpu_to_be16(RTAS_FADUMP_REAL_MODE_REGION); - fdm.rmr_region.source_address = cpu_to_be64(RMA_START); + fdm.rmr_region.source_address = cpu_to_be64(0); fdm.rmr_region.source_len = cpu_to_be64(fadump_conf->boot_memory_size); fdm.rmr_region.destination_address = cpu_to_be64(addr); addr += fadump_conf->boot_memory_size; -- cgit v1.2.3 From 7dee93a9a8808b3d8595e1cc79ccb8b1a7bc7a77 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 11 Sep 2019 20:27:39 +0530 Subject: powerpc/fadump: support holes in kernel boot memory area With support to copy multiple kernel boot memory regions owing to copy size limitation, also handle holes in the memory area to be preserved. Support as many as 128 kernel boot memory regions. This allows having an adequate FADump capture kernel size for different scenarios. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/156821385448.5656.6124791213910877759.stgit@hbathini.in.ibm.com --- arch/powerpc/include/asm/fadump-internal.h | 12 ++ arch/powerpc/kernel/fadump.c | 191 +++++++++++++++++++++++---- arch/powerpc/platforms/powernv/opal-fadump.c | 54 ++++---- arch/powerpc/platforms/powernv/opal-fadump.h | 5 +- arch/powerpc/platforms/pseries/rtas-fadump.c | 11 +- arch/powerpc/platforms/pseries/rtas-fadump.h | 5 + 6 files changed, 219 insertions(+), 59 deletions(-) diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index aec1515ac8f8..c814a2b55389 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -9,6 +9,9 @@ #ifndef _ASM_POWERPC_FADUMP_INTERNAL_H #define _ASM_POWERPC_FADUMP_INTERNAL_H +/* Maximum number of memory regions kernel supports */ +#define FADUMP_MAX_MEM_REGS 128 + #ifndef CONFIG_PRESERVE_FA_DUMP /* The upper limit percentage for user specified boot memory size (25%) */ @@ -88,11 +91,20 @@ struct fw_dump { unsigned long boot_memory_size; u64 boot_mem_dest_addr; + u64 boot_mem_addr[FADUMP_MAX_MEM_REGS]; + u64 boot_mem_sz[FADUMP_MAX_MEM_REGS]; + u64 boot_mem_top; + u64 boot_mem_regs_cnt; unsigned long fadumphdr_addr; unsigned long cpu_notes_buf_vaddr; unsigned long cpu_notes_buf_size; + /* + * Maximum size supported by firmware to copy from source to + * destination address per entry. + */ + u64 max_copy_size; u64 kernel_metadata; int ibm_configure_kernel_dump; diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 2e139259474d..ed59855430b9 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -143,7 +143,7 @@ int is_fadump_memory_area(u64 addr, unsigned long size) if (((addr + size) > d_start) && (addr <= d_end)) return 1; - return (addr <= fw_dump.boot_memory_size); + return (addr <= fw_dump.boot_mem_top); } int should_fadump_crash(void) @@ -194,7 +194,20 @@ static bool is_fadump_mem_area_contiguous(u64 d_start, u64 d_end) */ bool is_fadump_boot_mem_contiguous(void) { - return is_fadump_mem_area_contiguous(0, fw_dump.boot_memory_size); + unsigned long d_start, d_end; + bool ret = false; + int i; + + for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) { + d_start = fw_dump.boot_mem_addr[i]; + d_end = d_start + fw_dump.boot_mem_sz[i]; + + ret = is_fadump_mem_area_contiguous(d_start, d_end); + if (!ret) + break; + } + + return ret; } /* @@ -213,6 +226,8 @@ bool is_fadump_reserved_mem_contiguous(void) /* Print firmware assisted dump configurations for debugging purpose. */ static void fadump_show_config(void) { + int i; + pr_debug("Support for firmware-assisted dump (fadump): %s\n", (fw_dump.fadump_supported ? "present" : "no support")); @@ -226,7 +241,13 @@ static void fadump_show_config(void) pr_debug("Dump section sizes:\n"); pr_debug(" CPU state data size: %lx\n", fw_dump.cpu_state_data_size); pr_debug(" HPTE region size : %lx\n", fw_dump.hpte_region_size); - pr_debug("Boot memory size : %lx\n", fw_dump.boot_memory_size); + pr_debug(" Boot memory size : %lx\n", fw_dump.boot_memory_size); + pr_debug(" Boot memory top : %llx\n", fw_dump.boot_mem_top); + pr_debug("Boot memory regions cnt: %llx\n", fw_dump.boot_mem_regs_cnt); + for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) { + pr_debug("[%03d] base = %llx, size = %llx\n", i, + fw_dump.boot_mem_addr[i], fw_dump.boot_mem_sz[i]); + } } /** @@ -326,6 +347,88 @@ static unsigned long get_fadump_area_size(void) return size; } +static int __init add_boot_mem_region(unsigned long rstart, + unsigned long rsize) +{ + int i = fw_dump.boot_mem_regs_cnt++; + + if (fw_dump.boot_mem_regs_cnt > FADUMP_MAX_MEM_REGS) { + fw_dump.boot_mem_regs_cnt = FADUMP_MAX_MEM_REGS; + return 0; + } + + pr_debug("Added boot memory range[%d] [%#016lx-%#016lx)\n", + i, rstart, (rstart + rsize)); + fw_dump.boot_mem_addr[i] = rstart; + fw_dump.boot_mem_sz[i] = rsize; + return 1; +} + +/* + * Firmware usually has a hard limit on the data it can copy per region. + * Honour that by splitting a memory range into multiple regions. + */ +static int __init add_boot_mem_regions(unsigned long mstart, + unsigned long msize) +{ + unsigned long rstart, rsize, max_size; + int ret = 1; + + rstart = mstart; + max_size = fw_dump.max_copy_size ? fw_dump.max_copy_size : msize; + while (msize) { + if (msize > max_size) + rsize = max_size; + else + rsize = msize; + + ret = add_boot_mem_region(rstart, rsize); + if (!ret) + break; + + msize -= rsize; + rstart += rsize; + } + + return ret; +} + +static int __init fadump_get_boot_mem_regions(void) +{ + unsigned long base, size, cur_size, hole_size, last_end; + unsigned long mem_size = fw_dump.boot_memory_size; + struct memblock_region *reg; + int ret = 1; + + fw_dump.boot_mem_regs_cnt = 0; + + last_end = 0; + hole_size = 0; + cur_size = 0; + for_each_memblock(memory, reg) { + base = reg->base; + size = reg->size; + hole_size += (base - last_end); + + if ((cur_size + size) >= mem_size) { + size = (mem_size - cur_size); + ret = add_boot_mem_regions(base, size); + break; + } + + mem_size -= size; + cur_size += size; + ret = add_boot_mem_regions(base, size); + if (!ret) + break; + + last_end = base + size; + } + fw_dump.boot_mem_top = PAGE_ALIGN(fw_dump.boot_memory_size + hole_size); + + return ret; +} + int __init fadump_reserve_mem(void) { u64 base, size, mem_boundary, bootmem_min, align = PAGE_SIZE; @@ -362,6 +465,11 @@ int __init fadump_reserve_mem(void) fw_dump.boot_memory_size, bootmem_min); goto error_out; } + + if (!fadump_get_boot_mem_regions()) { + pr_err("Too many holes in boot memory area to enable fadump\n"); + goto error_out; + } } /* @@ -385,7 +493,7 @@ int __init fadump_reserve_mem(void) else mem_boundary = memblock_end_of_DRAM(); - base = fw_dump.boot_memory_size; + base = fw_dump.boot_mem_top; size = get_fadump_area_size(); fw_dump.reserve_dump_area_size = size; if (fw_dump.dump_active) { @@ -769,34 +877,35 @@ static int fadump_setup_crash_memory_ranges(void) { struct memblock_region *reg; u64 start, end; - int ret; + int i, ret; pr_debug("Setup crash memory ranges.\n"); crash_mrange_info.mem_range_cnt = 0; /* - * add the first memory chunk (0 through boot_memory_size) as - * a separate memory chunk. The reason is, at the time crash firmware - * will move the content of this memory chunk to different location - * specified during fadump registration. We need to create a separate - * program header for this chunk with the correct offset. + * Boot memory region(s) registered with firmware are moved to + * different location at the time of crash. Create separate program + * header(s) for this memory chunk(s) with the correct offset. */ - ret = fadump_add_mem_range(&crash_mrange_info, - 0, fw_dump.boot_memory_size); - if (ret) - return ret; + for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) { + start = fw_dump.boot_mem_addr[i]; + end = start + fw_dump.boot_mem_sz[i]; + ret = fadump_add_mem_range(&crash_mrange_info, start, end); + if (ret) + return ret; + } for_each_memblock(memory, reg) { start = (u64)reg->base; end = start + (u64)reg->size; /* - * skip the first memory chunk that is already added - * (0 through boot_memory_size). + * skip the memory chunk that is already added + * (0 through boot_memory_top). */ - if (start < fw_dump.boot_memory_size) { - if (end > fw_dump.boot_memory_size) - start = fw_dump.boot_memory_size; + if (start < fw_dump.boot_mem_top) { + if (end > fw_dump.boot_mem_top) + start = fw_dump.boot_mem_top; else continue; } @@ -817,17 +926,35 @@ static int fadump_setup_crash_memory_ranges(void) */ static inline unsigned long fadump_relocate(unsigned long paddr) { - if ((paddr > 0) && (paddr < fw_dump.boot_memory_size)) - return fw_dump.boot_mem_dest_addr + paddr; - else - return paddr; + unsigned long raddr, rstart, rend, rlast, hole_size; + int i; + + hole_size = 0; + rlast = 0; + raddr = paddr; + for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) { + rstart = fw_dump.boot_mem_addr[i]; + rend = rstart + fw_dump.boot_mem_sz[i]; + hole_size += (rstart - rlast); + + if (paddr >= rstart && paddr < rend) { + raddr += fw_dump.boot_mem_dest_addr - hole_size; + break; + } + + rlast = rend; + } + + pr_debug("vmcoreinfo: paddr = 0x%lx, raddr = 0x%lx\n", paddr, raddr); + return raddr; } static int fadump_create_elfcore_headers(char *bufp) { - struct elfhdr *elf; + unsigned long long raddr, offset; struct elf_phdr *phdr; - int i; + struct elfhdr *elf; + int i, j; fadump_init_elfcore_header(bufp); elf = (struct elfhdr *)bufp; @@ -870,7 +997,9 @@ static int fadump_create_elfcore_headers(char *bufp) (elf->e_phnum)++; /* setup PT_LOAD sections. */ - + j = 0; + offset = 0; + raddr = fw_dump.boot_mem_addr[0]; for (i = 0; i < crash_mrange_info.mem_range_cnt; i++) { u64 mbase, msize; @@ -885,13 +1014,17 @@ static int fadump_create_elfcore_headers(char *bufp) phdr->p_flags = PF_R|PF_W|PF_X; phdr->p_offset = mbase; - if (mbase == 0) { + if (mbase == raddr) { /* * The entire real memory region will be moved by * firmware to the specified destination_address. * Hence set the correct offset. */ - phdr->p_offset = fw_dump.boot_mem_dest_addr; + phdr->p_offset = fw_dump.boot_mem_dest_addr + offset; + if (j < (fw_dump.boot_mem_regs_cnt - 1)) { + offset += fw_dump.boot_mem_sz[j]; + raddr = fw_dump.boot_mem_addr[++j]; + } } phdr->p_paddr = mbase; @@ -1177,7 +1310,7 @@ static void fadump_invalidate_release_mem(void) fadump_cleanup(); mutex_unlock(&fadump_mutex); - fadump_release_memory(fw_dump.boot_memory_size, memblock_end_of_DRAM()); + fadump_release_memory(fw_dump.boot_mem_top, memblock_end_of_DRAM()); fadump_free_cpu_notes_buf(); /* diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c index 006648e4d5e6..d361d37d975f 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.c +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -115,19 +115,28 @@ static void opal_fadump_update_config(struct fw_dump *fadump_conf, static void opal_fadump_get_config(struct fw_dump *fadump_conf, const struct opal_fadump_mem_struct *fdm) { + unsigned long base, size, last_end, hole_size; int i; if (!fadump_conf->dump_active) return; + last_end = 0; + hole_size = 0; fadump_conf->boot_memory_size = 0; pr_debug("Boot memory regions:\n"); for (i = 0; i < fdm->region_cnt; i++) { - pr_debug("\t%d. base: 0x%llx, size: 0x%llx\n", - (i + 1), fdm->rgn[i].src, fdm->rgn[i].size); + base = fdm->rgn[i].src; + size = fdm->rgn[i].size; + pr_debug("\t[%03d] base: 0x%lx, size: 0x%lx\n", i, base, size); - fadump_conf->boot_memory_size += fdm->rgn[i].size; + fadump_conf->boot_mem_addr[i] = base; + fadump_conf->boot_mem_sz[i] = size; + fadump_conf->boot_memory_size += size; + hole_size += (base - last_end); + + last_end = base + size; } /* @@ -160,6 +169,8 @@ static void opal_fadump_get_config(struct fw_dump *fadump_conf, pr_warn("WARNING: If the unsaved regions contain kernel pages, the vmcore will be corrupted.\n"); } + fadump_conf->boot_mem_top = (fadump_conf->boot_memory_size + hole_size); + fadump_conf->boot_mem_regs_cnt = fdm->region_cnt; opal_fadump_update_config(fadump_conf, fdm); } @@ -174,33 +185,20 @@ static void opal_fadump_init_metadata(struct opal_fadump_mem_struct *fdm) static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf) { - int max_copy_size, cur_size, size; - u64 src_addr, dest_addr; + u64 addr = fadump_conf->reserve_dump_area_start; + int i; opal_fdm = __va(fadump_conf->kernel_metadata); opal_fadump_init_metadata(opal_fdm); - /* - * Firmware supports 32-bit field for size. Align it to PAGE_SIZE - * and request firmware to copy multiple kernel boot memory regions. - */ - max_copy_size = _ALIGN_DOWN(U32_MAX, PAGE_SIZE); - /* Boot memory regions */ - src_addr = 0; - dest_addr = fadump_conf->reserve_dump_area_start; - size = fadump_conf->boot_memory_size; - while (size) { - cur_size = size > max_copy_size ? max_copy_size : size; - - opal_fdm->rgn[opal_fdm->region_cnt].src = src_addr; - opal_fdm->rgn[opal_fdm->region_cnt].dest = dest_addr; - opal_fdm->rgn[opal_fdm->region_cnt].size = cur_size; + for (i = 0; i < fadump_conf->boot_mem_regs_cnt; i++) { + opal_fdm->rgn[i].src = fadump_conf->boot_mem_addr[i]; + opal_fdm->rgn[i].dest = addr; + opal_fdm->rgn[i].size = fadump_conf->boot_mem_sz[i]; opal_fdm->region_cnt++; - dest_addr += cur_size; - src_addr += cur_size; - size -= cur_size; + addr += fadump_conf->boot_mem_sz[i]; } /* @@ -212,7 +210,7 @@ static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf) opal_fadump_update_config(fadump_conf, opal_fdm); - return dest_addr; + return addr; } static u64 opal_fadump_get_metadata_size(void) @@ -254,7 +252,7 @@ static int opal_fadump_setup_metadata(struct fw_dump *fadump_conf) * by a kernel that intends to preserve crash'ed kernel's memory. */ ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_BOOT_MEM, - fadump_conf->boot_memory_size); + fadump_conf->boot_mem_top); if (ret != OPAL_SUCCESS) { pr_err("Failed to set boot memory tag!\n"); err = -EPERM; @@ -669,6 +667,12 @@ void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) fadump_conf->ops = &opal_fadump_ops; fadump_conf->fadump_supported = 1; + /* + * Firmware supports 32-bit field for size. Align it to PAGE_SIZE + * and request firmware to copy multiple kernel boot memory regions. + */ + fadump_conf->max_copy_size = _ALIGN_DOWN(U32_MAX, PAGE_SIZE); + /* * Check if dump has been initiated on last reboot. */ diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h index e630cb0f108f..f1e9ecf548c5 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.h +++ b/arch/powerpc/platforms/powernv/opal-fadump.h @@ -27,9 +27,6 @@ */ #define OPAL_FADUMP_VERSION 0x1 -/* Maximum number of memory regions kernel supports */ -#define OPAL_FADUMP_MAX_MEM_REGS 128 - /* * OPAL FADump kernel metadata * @@ -42,7 +39,7 @@ struct opal_fadump_mem_struct { u16 region_cnt; /* number of regions */ u16 registered_regions; /* Regions registered for MPIPL */ u64 fadumphdr_addr; - struct opal_mpipl_region rgn[OPAL_FADUMP_MAX_MEM_REGS]; + struct opal_mpipl_region rgn[FADUMP_MAX_MEM_REGS]; } __packed; /* diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c index a525180c1f0f..70c3013fdd07 100644 --- a/arch/powerpc/platforms/pseries/rtas-fadump.c +++ b/arch/powerpc/platforms/pseries/rtas-fadump.c @@ -42,7 +42,13 @@ static void rtas_fadump_update_config(struct fw_dump *fadump_conf, static void rtas_fadump_get_config(struct fw_dump *fadump_conf, const struct rtas_fadump_mem_struct *fdm) { - fadump_conf->boot_memory_size = be64_to_cpu(fdm->rmr_region.source_len); + fadump_conf->boot_mem_addr[0] = + be64_to_cpu(fdm->rmr_region.source_address); + fadump_conf->boot_mem_sz[0] = be64_to_cpu(fdm->rmr_region.source_len); + fadump_conf->boot_memory_size = fadump_conf->boot_mem_sz[0]; + + fadump_conf->boot_mem_top = fadump_conf->boot_memory_size; + fadump_conf->boot_mem_regs_cnt = 1; /* * Start address of reserve dump area (permanent reservation) for @@ -499,6 +505,9 @@ void __init rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) fadump_conf->ops = &rtas_fadump_ops; fadump_conf->fadump_supported = 1; + /* Firmware supports 64-bit value for size, align it to pagesize. */ + fadump_conf->max_copy_size = _ALIGN_DOWN(U64_MAX, PAGE_SIZE); + /* * The 'ibm,kernel-dump' rtas node is present only if there is * dump data waiting for us. diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.h b/arch/powerpc/platforms/pseries/rtas-fadump.h index 6602ff69e10d..fd59bd7ca9c3 100644 --- a/arch/powerpc/platforms/pseries/rtas-fadump.h +++ b/arch/powerpc/platforms/pseries/rtas-fadump.h @@ -69,6 +69,11 @@ struct rtas_fadump_mem_struct { /* Kernel dump sections */ struct rtas_fadump_section cpu_state_data; struct rtas_fadump_section hpte_region; + + /* + * TODO: Extend multiple boot memory regions support in the kernel + * for this platform. + */ struct rtas_fadump_section rmr_region; }; -- cgit v1.2.3 From ec5b705c48365549c483fab17d68d15d83bef265 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Fri, 23 Aug 2019 10:22:00 -0400 Subject: powerpc/mm/radix: remove useless kernel messages Booting a POWER9 PowerNV system generates a few messages below with "____ptrval____" due to the pointers printed without a specifier extension (i.e unadorned %p) are hashed to prevent leaking information about the kernel memory layout. radix-mmu: Initializing Radix MMU radix-mmu: Partition table (____ptrval____) radix-mmu: Mapped 0x0000000000000000-0x0000000040000000 with 1.00 GiB pages (exec) radix-mmu: Mapped 0x0000000040000000-0x0000002000000000 with 1.00 GiB pages radix-mmu: Mapped 0x0000200000000000-0x0000202000000000 with 1.00 GiB pages radix-mmu: Process table (____ptrval____) and radix root for kernel: (____ptrval____) Signed-off-by: Qian Cai Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1566570120-16529-1-git-send-email-cai@lca.pw --- arch/powerpc/mm/book3s64/radix_pgtable.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 0d1107fb34c1..3a1fbf9cb8f8 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -369,8 +369,6 @@ static void __init radix_init_pgtable(void) rts_field = radix__get_tree_size(); process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE); - pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd); - /* * The init_mm context is given the first available (non-zero) PID, * which is the "guard PID" and contains no page table. PIDR should @@ -399,7 +397,6 @@ static void __init radix_init_partition_table(void) mmu_partition_table_set_entry(0, dw0, dw1, false); pr_info("Initializing Radix MMU\n"); - pr_info("Partition table %p\n", partition_tb); } static int __init get_idx_from_shift(unsigned int shift) -- cgit v1.2.3 From 5896163f7f91c0560cc41908c808661eee4c4121 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Tue, 10 Sep 2019 10:18:49 +0200 Subject: powerpc/xmon: Improve output of XIVE interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When looping on the list of interrupts, add the current value of the PQ bits with a load on the ESB page. This has the side effect of faulting the ESB page of all interrupts. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190910081850.26038-2-clg@kaod.org --- arch/powerpc/include/asm/xive.h | 3 +-- arch/powerpc/sysdev/xive/common.c | 29 ++++++++++++++++++++++++++--- arch/powerpc/xmon/xmon.c | 15 ++++----------- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index 967d6ab3c977..71f52f22c36b 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -99,8 +99,7 @@ extern void xive_flush_interrupt(void); /* xmon hook */ extern void xmon_xive_do_dump(int cpu); -extern int xmon_xive_get_irq_config(u32 irq, u32 *target, u8 *prio, - u32 *sw_irq); +extern int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d); /* APIs used by KVM */ extern u32 xive_native_default_eq_shift(void); diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index f75a660365e5..b10919f08250 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -258,10 +258,33 @@ notrace void xmon_xive_do_dump(int cpu) #endif } -int xmon_xive_get_irq_config(u32 irq, u32 *target, u8 *prio, - u32 *sw_irq) +int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d) { - return xive_ops->get_irq_config(irq, target, prio, sw_irq); + int rc; + u32 target; + u8 prio; + u32 lirq; + + rc = xive_ops->get_irq_config(hw_irq, &target, &prio, &lirq); + if (rc) { + xmon_printf("IRQ 0x%08x : no config rc=%d\n", hw_irq, rc); + return rc; + } + + xmon_printf("IRQ 0x%08x : target=0x%x prio=%02x lirq=0x%x ", + hw_irq, target, prio, lirq); + + if (d) { + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + u64 val = xive_esb_read(xd, XIVE_ESB_GET); + + xmon_printf("PQ=%c%c", + val & XIVE_ESB_VAL_P ? 'P' : '-', + val & XIVE_ESB_VAL_Q ? 'Q' : '-'); + } + + xmon_printf("\n"); + return 0; } #endif /* CONFIG_XMON */ diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index dc9832e06256..d83364ebc5c5 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2572,16 +2572,9 @@ static void dump_all_xives(void) dump_one_xive(cpu); } -static void dump_one_xive_irq(u32 num) +static void dump_one_xive_irq(u32 num, struct irq_data *d) { - int rc; - u32 target; - u8 prio; - u32 lirq; - - rc = xmon_xive_get_irq_config(num, &target, &prio, &lirq); - xmon_printf("IRQ 0x%08x : target=0x%x prio=%d lirq=0x%x (rc=%d)\n", - num, target, prio, lirq, rc); + xmon_xive_get_irq_config(num, d); } static void dump_all_xive_irq(void) @@ -2599,7 +2592,7 @@ static void dump_all_xive_irq(void) hwirq = (unsigned int)irqd_to_hwirq(d); /* IPIs are special (HW number 0) */ if (hwirq) - dump_one_xive_irq(hwirq); + dump_one_xive_irq(hwirq, d); } } @@ -2619,7 +2612,7 @@ static void dump_xives(void) return; } else if (c == 'i') { if (scanhex(&num)) - dump_one_xive_irq(num); + dump_one_xive_irq(num, NULL); else dump_all_xive_irq(); return; -- cgit v1.2.3 From 855d9140a394229f21fd4fd216f377ed45bd93a3 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Tue, 10 Sep 2019 10:18:50 +0200 Subject: powerpc/xmon: Fix output of XIVE IPI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When dumping the XIVE state of an CPU IPI, xmon does not check if the CPU is started or not which can cause an error. Add a check for that and change the output to be on one line just as the XIVE interrupts of the machine. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190910081850.26038-3-clg@kaod.org --- arch/powerpc/sysdev/xive/common.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index b10919f08250..df832b09e3e9 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -237,25 +237,30 @@ static notrace void xive_dump_eq(const char *name, struct xive_q *q) i0 = be32_to_cpup(q->qpage + idx); idx = (idx + 1) & q->msk; i1 = be32_to_cpup(q->qpage + idx); - xmon_printf(" %s Q T=%d %08x %08x ...\n", name, - q->toggle, i0, i1); + xmon_printf("%s idx=%d T=%d %08x %08x ...", name, + q->idx, q->toggle, i0, i1); } notrace void xmon_xive_do_dump(int cpu) { struct xive_cpu *xc = per_cpu(xive_cpu, cpu); - xmon_printf("XIVE state for CPU %d:\n", cpu); - xmon_printf(" pp=%02x cppr=%02x\n", xc->pending_prio, xc->cppr); - xive_dump_eq("IRQ", &xc->queue[xive_irq_priority]); + xmon_printf("CPU %d:", cpu); + if (xc) { + xmon_printf("pp=%02x CPPR=%02x ", xc->pending_prio, xc->cppr); + #ifdef CONFIG_SMP - { - u64 val = xive_esb_read(&xc->ipi_data, XIVE_ESB_GET); - xmon_printf(" IPI state: %x:%c%c\n", xc->hw_ipi, - val & XIVE_ESB_VAL_P ? 'P' : 'p', - val & XIVE_ESB_VAL_Q ? 'Q' : 'q'); - } + { + u64 val = xive_esb_read(&xc->ipi_data, XIVE_ESB_GET); + + xmon_printf("IPI=0x%08x PQ=%c%c ", xc->hw_ipi, + val & XIVE_ESB_VAL_P ? 'P' : '-', + val & XIVE_ESB_VAL_Q ? 'Q' : '-'); + } #endif + xive_dump_eq("EQ", &xc->queue[xive_irq_priority]); + } + xmon_printf("\n"); } int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d) -- cgit v1.2.3 From ad9b48c52296ddb7620e3f2b66a7479b78632dff Mon Sep 17 00:00:00 2001 From: Adam Zerella Date: Sun, 15 Sep 2019 15:29:05 +1000 Subject: docs: powerpc: Add missing documentation reference The documentation pages for 'elfnote' and 'ultravisor' are not included in the powerpc documentation index, this generates Sphinx warnings: WARNING: document isn't included in any toctree Additionally, when one includes these missing doc pages, more Sphinx warnings appear. Unused footnote references, syntax highlighting and table of content ordering has been adjusted. Signed-off-by: Adam Zerella Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190915052905.13431-1-adam.zerella@gmail.com --- Documentation/powerpc/elfnote.rst | 29 ++++++++++++++--------------- Documentation/powerpc/index.rst | 2 ++ Documentation/powerpc/ultravisor.rst | 13 +++++-------- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/Documentation/powerpc/elfnote.rst b/Documentation/powerpc/elfnote.rst index 2a5c4beeb809..06602248621c 100644 --- a/Documentation/powerpc/elfnote.rst +++ b/Documentation/powerpc/elfnote.rst @@ -8,8 +8,7 @@ capabilities and information which can be used by a bootloader or userland. Types and Descriptors --------------------- -The types to be used with the "PowerPC" namesapce are defined in the -include/uapi/asm/elfnote.h +The types to be used with the "PowerPC" namesapce are defined in [#f1]_. 1) PPC_ELFNOTE_CAPABILITIES @@ -18,25 +17,25 @@ bitmap as "descriptor" field. Each bit is described below: - Ultravisor-capable bit (PowerNV only). +.. code-block:: c + #define PPCCAP_ULTRAVISOR_BIT (1 << 0) - Indicate that the powerpc kernel binary knows how to run in an - ultravisor-enabled system. +Indicate that the powerpc kernel binary knows how to run in an +ultravisor-enabled system. - In an ultravisor-enabled system, some machine resources are now controlled - by the ultravisor. If the kernel is not ultravisor-capable, but it ends up - being run on a machine with ultravisor, the kernel will probably crash - trying to access ultravisor resources. For instance, it may crash in early - boot trying to set the partition table entry 0. +In an ultravisor-enabled system, some machine resources are now controlled +by the ultravisor. If the kernel is not ultravisor-capable, but it ends up +being run on a machine with ultravisor, the kernel will probably crash +trying to access ultravisor resources. For instance, it may crash in early +boot trying to set the partition table entry 0. - In an ultravisor-enabled system, a bootloader could warn the user or prevent - the kernel from being run if the PowerPC ultravisor capability doesn't exist - or the Ultravisor-capable bit is not set. +In an ultravisor-enabled system, a bootloader could warn the user or prevent +the kernel from being run if the PowerPC ultravisor capability doesn't exist +or the Ultravisor-capable bit is not set. References ---------- -arch/powerpc/include/asm/elfnote.h -arch/powerpc/kernel/note.S - +.. [#f1] arch/powerpc/include/asm/elfnote.h diff --git a/Documentation/powerpc/index.rst b/Documentation/powerpc/index.rst index 549b1cdd77ae..db7b6a880f52 100644 --- a/Documentation/powerpc/index.rst +++ b/Documentation/powerpc/index.rst @@ -15,6 +15,7 @@ powerpc dawr-power9 dscr eeh-pci-error-recovery + elfnote firmware-assisted-dump hvcs isa-versions @@ -25,6 +26,7 @@ powerpc qe_firmware syscall64-abi transactional_memory + ultravisor .. only:: subproject and html diff --git a/Documentation/powerpc/ultravisor.rst b/Documentation/powerpc/ultravisor.rst index 94a149f34ec3..730854f73830 100644 --- a/Documentation/powerpc/ultravisor.rst +++ b/Documentation/powerpc/ultravisor.rst @@ -1,5 +1,5 @@ .. SPDX-License-Identifier: GPL-2.0 -.. _ultravisor: +.. _ultravisor: ============================ Protected Execution Facility @@ -8,9 +8,6 @@ Protected Execution Facility .. contents:: :depth: 3 -.. sectnum:: - :depth: 3 - Protected Execution Facility ############################ @@ -255,10 +252,10 @@ Ultravisor calls API be made available in the public/OpenPower version of the PAPR specification. - **Note** + .. note:: - If PEF is not enabled, the ultracalls will be redirected to the - Hypervisor which must handle/fail the calls. + If PEF is not enabled, the ultracalls will be redirected to the + Hypervisor which must handle/fail the calls. Ultracalls used by Hypervisor ============================= @@ -1054,4 +1051,4 @@ Use cases References ########## -.. [1] `Supporting Protected Computing on IBM Power Architecture `_ +- `Supporting Protected Computing on IBM Power Architecture `_ -- cgit v1.2.3 From e7ca44ed3ba77fc26cf32650bb71584896662474 Mon Sep 17 00:00:00 2001 From: Ganesh Goudar Date: Wed, 4 Sep 2019 13:29:49 +0530 Subject: powerpc: dump kernel log before carrying out fadump or kdump Since commit 4388c9b3a6ee ("powerpc: Do not send system reset request through the oops path"), pstore dmesg file is not updated when dump is triggered from HMC. This commit modified system reset (sreset) handler to invoke fadump or kdump (if configured), without pushing dmesg to pstore. This leaves pstore to have old dmesg data which won't be much of a help if kdump fails to capture the dump. This patch fixes that by calling kmsg_dump() before heading to fadump ot kdump. Fixes: 4388c9b3a6ee ("powerpc: Do not send system reset request through the oops path") Reviewed-by: Mahesh Salgaonkar Reviewed-by: Nicholas Piggin Signed-off-by: Ganesh Goudar Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190904075949.15607-1-ganeshgr@linux.ibm.com --- arch/powerpc/kernel/traps.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 11caa0291254..82f43535e686 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -472,6 +472,7 @@ void system_reset_exception(struct pt_regs *regs) if (debugger(regs)) goto out; + kmsg_dump(KMSG_DUMP_OOPS); /* * A system reset is a request to dump, so we always send * it through the crashdump code (if fadump or kdump are -- cgit v1.2.3 From a3db31ff6ce31f5a544a66b61613a098029031cc Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 5 Sep 2019 23:50:28 +0530 Subject: ftrace: Look up the address of return_to_handler() using helpers This ensures that we use the right address on architectures that use function descriptors. Signed-off-by: Naveen N. Rao Acked-by: Steven Rostedt (VMware) Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/8f6f14d192a994008ac370ce14036bbe67224c7d.1567707399.git.naveen.n.rao@linux.vnet.ibm.com --- kernel/trace/fgraph.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 8dfd5021b933..7950a0356042 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -276,7 +276,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, int index = task->curr_ret_stack; int i; - if (ret != (unsigned long)return_to_handler) + if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler)) return ret; if (index < 0) @@ -294,7 +294,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, { int task_idx; - if (ret != (unsigned long)return_to_handler) + if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler)) return ret; task_idx = task->curr_ret_stack; -- cgit v1.2.3 From 370011a27028d6f05e598ed6211a0ca2dc0213f7 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 5 Sep 2019 23:50:29 +0530 Subject: powerpc/ftrace: Enable HAVE_FUNCTION_GRAPH_RET_ADDR_PTR This associates entries in the ftrace_ret_stack with corresponding stack frames, enabling more robust stack unwinding. Also update the only user of ftrace_graph_ret_addr() to pass the stack pointer. Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/0224f2d0971b069c678e2ff678cfc2cd1e114cfe.1567707399.git.naveen.n.rao@linux.vnet.ibm.com --- arch/powerpc/include/asm/asm-prototypes.h | 3 ++- arch/powerpc/include/asm/ftrace.h | 2 ++ arch/powerpc/kernel/stacktrace.c | 2 +- arch/powerpc/kernel/trace/ftrace.c | 5 +++-- arch/powerpc/kernel/trace/ftrace_32.S | 1 + arch/powerpc/kernel/trace/ftrace_64_mprofile.S | 1 + arch/powerpc/kernel/trace/ftrace_64_pg.S | 1 + 7 files changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 49196d35e3bb..8561498e653c 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -134,7 +134,8 @@ extern int __ucmpdi2(u64, u64); /* tracing */ void _mcount(void); -unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip); +unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip, + unsigned long sp); void pnv_power9_force_smt4_catch(void); void pnv_power9_force_smt4_release(void); diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 3dfb80b86561..f54a08a2cd70 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -8,6 +8,8 @@ #define MCOUNT_ADDR ((unsigned long)(_mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ +#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR + #ifdef __ASSEMBLY__ /* Based off of objdump optput from glibc */ diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index 1e2276963f6d..e2a46cfed5fd 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -182,7 +182,7 @@ static int __save_stack_trace_tsk_reliable(struct task_struct *tsk, * FIXME: IMHO these tests do not belong in * arch-dependent code, they are generic. */ - ip = ftrace_graph_ret_addr(tsk, &graph_idx, ip, NULL); + ip = ftrace_graph_ret_addr(tsk, &graph_idx, ip, stack); #ifdef CONFIG_KPROBES /* * Mark stacktraces with kretprobed functions on them diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index be1ca98fce5c..7ea0ca044b65 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -944,7 +944,8 @@ int ftrace_disable_ftrace_graph_caller(void) * Hook the return address and push it in the stack of return addrs * in current thread info. Return the address we want to divert to. */ -unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip) +unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip, + unsigned long sp) { unsigned long return_hooker; @@ -956,7 +957,7 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip) return_hooker = ppc_function_entry(return_to_handler); - if (!function_graph_enter(parent, ip, 0, NULL)) + if (!function_graph_enter(parent, ip, 0, (unsigned long *)sp)) parent = return_hooker; out: return parent; diff --git a/arch/powerpc/kernel/trace/ftrace_32.S b/arch/powerpc/kernel/trace/ftrace_32.S index 183f608efb81..e023ae59c429 100644 --- a/arch/powerpc/kernel/trace/ftrace_32.S +++ b/arch/powerpc/kernel/trace/ftrace_32.S @@ -50,6 +50,7 @@ _GLOBAL(ftrace_stub) #ifdef CONFIG_FUNCTION_GRAPH_TRACER _GLOBAL(ftrace_graph_caller) + addi r5, r1, 48 /* load r4 with local address */ lwz r4, 44(r1) subi r4, r4, MCOUNT_INSN_SIZE diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S index 74acbf16a666..f9fd5f743eba 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -294,6 +294,7 @@ _GLOBAL(ftrace_graph_caller) std r2, 24(r1) ld r2, PACATOC(r13) /* get kernel TOC in r2 */ + addi r5, r1, 112 mfctr r4 /* ftrace_caller has moved local addr here */ std r4, 40(r1) mflr r3 /* ftrace_caller has restored LR from stack */ diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.S b/arch/powerpc/kernel/trace/ftrace_64_pg.S index e41a7d13c99c..6708e24db0ab 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_pg.S +++ b/arch/powerpc/kernel/trace/ftrace_64_pg.S @@ -41,6 +41,7 @@ _GLOBAL(ftrace_stub) #ifdef CONFIG_FUNCTION_GRAPH_TRACER _GLOBAL(ftrace_graph_caller) + addi r5, r1, 112 /* load r4 with local address */ ld r4, 128(r1) subi r4, r4, MCOUNT_INSN_SIZE -- cgit v1.2.3 From 7c1bb6bbf75d8ca5ec878627d3170effcaf54f27 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 5 Sep 2019 23:50:30 +0530 Subject: powerpc: Use ftrace_graph_ret_addr() when unwinding With support for HAVE_FUNCTION_GRAPH_RET_ADDR_PTR, ftrace_graph_ret_addr() provides more robust unwinding when function graph is in use. Update show_stack() to use the same. With dump_stack() added to sysrq_sysctl_handler(), before this patch: root@(none):/sys/kernel/debug/tracing# cat /proc/sys/kernel/sysrq CPU: 0 PID: 218 Comm: cat Not tainted 5.3.0-rc7-00868-g8453ad4a078c-dirty #20 Call Trace: [c0000000d1e13c30] [c00000000006ab98] return_to_handler+0x0/0x40 (dump_stack+0xe8/0x164) (unreliable) [c0000000d1e13c80] [c000000000145680] sysrq_sysctl_handler+0x48/0xb8 [c0000000d1e13cd0] [c00000000006ab98] return_to_handler+0x0/0x40 (proc_sys_call_handler+0x274/0x2a0) [c0000000d1e13d60] [c00000000006ab98] return_to_handler+0x0/0x40 (return_to_handler+0x0/0x40) [c0000000d1e13d80] [c00000000006ab98] return_to_handler+0x0/0x40 (__vfs_read+0x3c/0x70) [c0000000d1e13dd0] [c00000000006ab98] return_to_handler+0x0/0x40 (vfs_read+0xb8/0x1b0) [c0000000d1e13e20] [c00000000006ab98] return_to_handler+0x0/0x40 (ksys_read+0x7c/0x140) After this patch: Call Trace: [c0000000d1e33c30] [c00000000006ab58] return_to_handler+0x0/0x40 (dump_stack+0xe8/0x164) (unreliable) [c0000000d1e33c80] [c000000000145680] sysrq_sysctl_handler+0x48/0xb8 [c0000000d1e33cd0] [c00000000006ab58] return_to_handler+0x0/0x40 (proc_sys_call_handler+0x274/0x2a0) [c0000000d1e33d60] [c00000000006ab58] return_to_handler+0x0/0x40 (__vfs_read+0x3c/0x70) [c0000000d1e33d80] [c00000000006ab58] return_to_handler+0x0/0x40 (vfs_read+0xb8/0x1b0) [c0000000d1e33dd0] [c00000000006ab58] return_to_handler+0x0/0x40 (ksys_read+0x7c/0x140) [c0000000d1e33e20] [c00000000006ab58] return_to_handler+0x0/0x40 (system_call+0x5c/0x68) Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/dc89c9a887121342d9c7819482c3dabdece2a323.1567707399.git.naveen.n.rao@linux.vnet.ibm.com --- arch/powerpc/kernel/process.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 24621e7e5033..f289bdd2b562 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -2047,10 +2047,8 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) int count = 0; int firstframe = 1; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - struct ftrace_ret_stack *ret_stack; - extern void return_to_handler(void); - unsigned long rth = (unsigned long)return_to_handler; - int curr_frame = 0; + unsigned long ret_addr; + int ftrace_idx = 0; #endif if (tsk == NULL) @@ -2079,15 +2077,10 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) if (!firstframe || ip != lr) { printk("["REG"] ["REG"] %pS", sp, ip, (void *)ip); #ifdef CONFIG_FUNCTION_GRAPH_TRACER - if ((ip == rth) && curr_frame >= 0) { - ret_stack = ftrace_graph_get_ret_stack(current, - curr_frame++); - if (ret_stack) - pr_cont(" (%pS)", - (void *)ret_stack->ret); - else - curr_frame = -1; - } + ret_addr = ftrace_graph_ret_addr(current, + &ftrace_idx, ip, stack); + if (ret_addr != ip) + pr_cont(" (%pS)", (void *)ret_addr); #endif if (firstframe) pr_cont(" (unreliable)"); -- cgit v1.2.3 From d9101bfa6adc831bda8836c4d774820553c14942 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 18 Sep 2019 20:23:28 +0530 Subject: powerpc/mm/mce: Keep irqs disabled during lockless page table walk __find_linux_mm_pte() returns a page table entry pointer after walking the page table without holding locks. To make it safe against a THP split and/or collapse, we disable interrupts around the lockless page table walk. However we need to keep interrupts disabled as long as we use the page table entry pointer that is returned. Fix addr_to_pfn() to do that. Fixes: ba41e1e1ccb9 ("powerpc/mce: Hookup derror (load/store) UE errors") Signed-off-by: Aneesh Kumar K.V [mpe: Rearrange code slightly and tweak change log wording] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190918145328.28602-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/kernel/mce_power.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 356e7b99f661..1cbf7f1a4e3d 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -29,7 +29,7 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) { pte_t *ptep; unsigned int shift; - unsigned long flags; + unsigned long pfn, flags; struct mm_struct *mm; if (user_mode(regs)) @@ -39,18 +39,22 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) local_irq_save(flags); ptep = __find_linux_pte(mm->pgd, addr, NULL, &shift); - local_irq_restore(flags); - if (!ptep || pte_special(*ptep)) - return ULONG_MAX; + if (!ptep || pte_special(*ptep)) { + pfn = ULONG_MAX; + goto out; + } - if (shift > PAGE_SHIFT) { + if (shift <= PAGE_SHIFT) + pfn = pte_pfn(*ptep); + else { unsigned long rpnmask = (1ul << shift) - PAGE_SIZE; - - return pte_pfn(__pte(pte_val(*ptep) | (addr & rpnmask))); + pfn = pte_pfn(__pte(pte_val(*ptep) | (addr & rpnmask))); } - return pte_pfn(*ptep); +out: + local_irq_restore(flags); + return pfn; } /* flush SLBs and reload */ -- cgit v1.2.3