diff options
Diffstat (limited to 'arch/powerpc/sysdev')
-rw-r--r-- | arch/powerpc/sysdev/Kconfig | 1 | ||||
-rw-r--r-- | arch/powerpc/sysdev/Makefile | 1 | ||||
-rw-r--r-- | arch/powerpc/sysdev/axonram.c | 45 | ||||
-rw-r--r-- | arch/powerpc/sysdev/scom.c | 3 | ||||
-rw-r--r-- | arch/powerpc/sysdev/xics/icp-hv.c | 2 | ||||
-rw-r--r-- | arch/powerpc/sysdev/xics/icp-native.c | 20 | ||||
-rw-r--r-- | arch/powerpc/sysdev/xics/icp-opal.c | 2 | ||||
-rw-r--r-- | arch/powerpc/sysdev/xics/xics-common.c | 6 | ||||
-rw-r--r-- | arch/powerpc/sysdev/xive/Kconfig | 11 | ||||
-rw-r--r-- | arch/powerpc/sysdev/xive/Makefile | 4 | ||||
-rw-r--r-- | arch/powerpc/sysdev/xive/common.c | 1302 | ||||
-rw-r--r-- | arch/powerpc/sysdev/xive/native.c | 640 | ||||
-rw-r--r-- | arch/powerpc/sysdev/xive/xive-internal.h | 62 |
13 files changed, 2065 insertions, 34 deletions
diff --git a/arch/powerpc/sysdev/Kconfig b/arch/powerpc/sysdev/Kconfig index 52dc165c0efb..caf882e749dc 100644 --- a/arch/powerpc/sysdev/Kconfig +++ b/arch/powerpc/sysdev/Kconfig @@ -28,6 +28,7 @@ config PPC_MSI_BITMAP default y if PPC_POWERNV source "arch/powerpc/sysdev/xics/Kconfig" +source "arch/powerpc/sysdev/xive/Kconfig" config PPC_SCOM bool diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile index a254824719f1..c0ae11d4f62f 100644 --- a/arch/powerpc/sysdev/Makefile +++ b/arch/powerpc/sysdev/Makefile @@ -71,5 +71,6 @@ obj-$(CONFIG_PPC_EARLY_DEBUG_MEMCONS) += udbg_memcons.o subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror obj-$(CONFIG_PPC_XICS) += xics/ +obj-$(CONFIG_PPC_XIVE) += xive/ obj-$(CONFIG_GE_FPGA) += ge/ diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index f523ac883150..a7fe5fee744f 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -25,6 +25,7 @@ #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/dax.h> #include <linux/device.h> #include <linux/errno.h> #include <linux/fs.h> @@ -62,6 +63,7 @@ static int azfs_major, azfs_minor; struct axon_ram_bank { struct platform_device *device; struct gendisk *disk; + struct dax_device *dax_dev; unsigned int irq_id; unsigned long ph_addr; unsigned long io_addr; @@ -137,25 +139,32 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio) return BLK_QC_T_NONE; } -/** - * axon_ram_direct_access - direct_access() method for block device - * @device, @sector, @data: see block_device_operations method - */ +static const struct block_device_operations axon_ram_devops = { + .owner = THIS_MODULE, +}; + static long -axon_ram_direct_access(struct block_device *device, sector_t sector, - void **kaddr, pfn_t *pfn, long size) +__axon_ram_direct_access(struct axon_ram_bank *bank, pgoff_t pgoff, long nr_pages, + void **kaddr, pfn_t *pfn) { - struct axon_ram_bank *bank = device->bd_disk->private_data; - loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT; + resource_size_t offset = pgoff * PAGE_SIZE; *kaddr = (void *) bank->io_addr + offset; *pfn = phys_to_pfn_t(bank->ph_addr + offset, PFN_DEV); - return bank->size - offset; + return (bank->size - offset) / PAGE_SIZE; } -static const struct block_device_operations axon_ram_devops = { - .owner = THIS_MODULE, - .direct_access = axon_ram_direct_access +static long +axon_ram_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, + void **kaddr, pfn_t *pfn) +{ + struct axon_ram_bank *bank = dax_get_private(dax_dev); + + return __axon_ram_direct_access(bank, pgoff, nr_pages, kaddr, pfn); +} + +static const struct dax_operations axon_ram_dax_ops = { + .direct_access = axon_ram_dax_direct_access, }; /** @@ -219,6 +228,7 @@ static int axon_ram_probe(struct platform_device *device) goto failed; } + bank->disk->major = azfs_major; bank->disk->first_minor = azfs_minor; bank->disk->fops = &axon_ram_devops; @@ -227,6 +237,13 @@ static int axon_ram_probe(struct platform_device *device) sprintf(bank->disk->disk_name, "%s%d", AXON_RAM_DEVICE_NAME, axon_ram_bank_id); + bank->dax_dev = alloc_dax(bank, bank->disk->disk_name, + &axon_ram_dax_ops); + if (!bank->dax_dev) { + rc = -ENOMEM; + goto failed; + } + bank->disk->queue = blk_alloc_queue(GFP_KERNEL); if (bank->disk->queue == NULL) { dev_err(&device->dev, "Cannot register disk queue\n"); @@ -278,6 +295,8 @@ failed: del_gendisk(bank->disk); put_disk(bank->disk); } + kill_dax(bank->dax_dev); + put_dax(bank->dax_dev); device->dev.platform_data = NULL; if (bank->io_addr != 0) iounmap((void __iomem *) bank->io_addr); @@ -300,6 +319,8 @@ axon_ram_remove(struct platform_device *device) device_remove_file(&device->dev, &dev_attr_ecc); free_irq(bank->irq_id, device); + kill_dax(bank->dax_dev); + put_dax(bank->dax_dev); del_gendisk(bank->disk); put_disk(bank->disk); iounmap((void __iomem *) bank->io_addr); diff --git a/arch/powerpc/sysdev/scom.c b/arch/powerpc/sysdev/scom.c index d0e9f178a324..76ea32c1b664 100644 --- a/arch/powerpc/sysdev/scom.c +++ b/arch/powerpc/sysdev/scom.c @@ -19,10 +19,9 @@ */ #include <linux/kernel.h> -#include <linux/debugfs.h> #include <linux/slab.h> #include <linux/export.h> -#include <asm/debug.h> +#include <asm/debugfs.h> #include <asm/prom.h> #include <asm/scom.h> #include <linux/uaccess.h> diff --git a/arch/powerpc/sysdev/xics/icp-hv.c b/arch/powerpc/sysdev/xics/icp-hv.c index e7fa26c4ff73..bbc839a98c41 100644 --- a/arch/powerpc/sysdev/xics/icp-hv.c +++ b/arch/powerpc/sysdev/xics/icp-hv.c @@ -138,7 +138,7 @@ static void icp_hv_set_cpu_priority(unsigned char cppr) #ifdef CONFIG_SMP -static void icp_hv_cause_ipi(int cpu, unsigned long data) +static void icp_hv_cause_ipi(int cpu) { icp_hv_set_qirr(cpu, IPI_PRIORITY); } diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c index 8a6a043e239b..2bfb9968d562 100644 --- a/arch/powerpc/sysdev/xics/icp-native.c +++ b/arch/powerpc/sysdev/xics/icp-native.c @@ -143,19 +143,9 @@ static unsigned int icp_native_get_irq(void) #ifdef CONFIG_SMP -static void icp_native_cause_ipi(int cpu, unsigned long data) +static void icp_native_cause_ipi(int cpu) { kvmppc_set_host_ipi(cpu, 1); -#ifdef CONFIG_PPC_DOORBELL - if (cpu_has_feature(CPU_FTR_DBELL)) { - if (cpumask_test_cpu(cpu, cpu_sibling_mask(get_cpu()))) { - doorbell_cause_ipi(cpu, data); - put_cpu(); - return; - } - put_cpu(); - } -#endif icp_native_set_qirr(cpu, IPI_PRIORITY); } @@ -168,15 +158,15 @@ void icp_native_cause_ipi_rm(int cpu) * Need the physical address of the XICS to be * previously saved in kvm_hstate in the paca. */ - unsigned long xics_phys; + void __iomem *xics_phys; /* * Just like the cause_ipi functions, it is required to - * include a full barrier (out8 includes a sync) before - * causing the IPI. + * include a full barrier before causing the IPI. */ xics_phys = paca[cpu].kvm_hstate.xics_phys; - out_rm8((u8 *)(xics_phys + XICS_MFRR), IPI_PRIORITY); + mb(); + __raw_rm_writeb(IPI_PRIORITY, xics_phys + XICS_MFRR); } #endif diff --git a/arch/powerpc/sysdev/xics/icp-opal.c b/arch/powerpc/sysdev/xics/icp-opal.c index b53f80f0b4d8..c71d2ea42627 100644 --- a/arch/powerpc/sysdev/xics/icp-opal.c +++ b/arch/powerpc/sysdev/xics/icp-opal.c @@ -126,7 +126,7 @@ static void icp_opal_eoi(struct irq_data *d) #ifdef CONFIG_SMP -static void icp_opal_cause_ipi(int cpu, unsigned long data) +static void icp_opal_cause_ipi(int cpu) { int hw_cpu = get_hard_smp_processor_id(cpu); diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index 23efe4e42172..ffe138b8b9dc 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -143,11 +143,11 @@ static void xics_request_ipi(void) void __init xics_smp_probe(void) { - /* Setup cause_ipi callback based on which ICP is used */ - smp_ops->cause_ipi = icp_ops->cause_ipi; - /* Register all the IPIs */ xics_request_ipi(); + + /* Setup cause_ipi callback based on which ICP is used */ + smp_ops->cause_ipi = icp_ops->cause_ipi; } #endif /* CONFIG_SMP */ diff --git a/arch/powerpc/sysdev/xive/Kconfig b/arch/powerpc/sysdev/xive/Kconfig new file mode 100644 index 000000000000..12ccd7373d2f --- /dev/null +++ b/arch/powerpc/sysdev/xive/Kconfig @@ -0,0 +1,11 @@ +config PPC_XIVE + bool + default n + select PPC_SMP_MUXED_IPI + select HARDIRQS_SW_RESEND + +config PPC_XIVE_NATIVE + bool + default n + select PPC_XIVE + depends on PPC_POWERNV diff --git a/arch/powerpc/sysdev/xive/Makefile b/arch/powerpc/sysdev/xive/Makefile new file mode 100644 index 000000000000..3fab303fc169 --- /dev/null +++ b/arch/powerpc/sysdev/xive/Makefile @@ -0,0 +1,4 @@ +subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror + +obj-y += common.o +obj-$(CONFIG_PPC_XIVE_NATIVE) += native.o diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c new file mode 100644 index 000000000000..6a98efb14264 --- /dev/null +++ b/arch/powerpc/sysdev/xive/common.c @@ -0,0 +1,1302 @@ +/* + * Copyright 2016,2017 IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "xive: " fmt + +#include <linux/types.h> +#include <linux/threads.h> +#include <linux/kernel.h> +#include <linux/irq.h> +#include <linux/debugfs.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/seq_file.h> +#include <linux/init.h> +#include <linux/cpu.h> +#include <linux/of.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/msi.h> + +#include <asm/prom.h> +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/machdep.h> +#include <asm/irq.h> +#include <asm/errno.h> +#include <asm/xive.h> +#include <asm/xive-regs.h> +#include <asm/xmon.h> + +#include "xive-internal.h" + +#undef DEBUG_FLUSH +#undef DEBUG_ALL + +#ifdef DEBUG_ALL +#define DBG_VERBOSE(fmt...) pr_devel(fmt) +#else +#define DBG_VERBOSE(fmt...) do { } while(0) +#endif + +bool __xive_enabled; +bool xive_cmdline_disabled; + +/* We use only one priority for now */ +static u8 xive_irq_priority; + +/* TIMA */ +void __iomem *xive_tima; +u32 xive_tima_offset; + +/* Backend ops */ +static const struct xive_ops *xive_ops; + +/* Our global interrupt domain */ +static struct irq_domain *xive_irq_domain; + +#ifdef CONFIG_SMP +/* The IPIs all use the same logical irq number */ +static u32 xive_ipi_irq; +#endif + +/* Xive state for each CPU */ +static DEFINE_PER_CPU(struct xive_cpu *, xive_cpu); + +/* + * A "disabled" interrupt should never fire, to catch problems + * we set its logical number to this + */ +#define XIVE_BAD_IRQ 0x7fffffff +#define XIVE_MAX_IRQ (XIVE_BAD_IRQ - 1) + +/* An invalid CPU target */ +#define XIVE_INVALID_TARGET (-1) + +/* + * Read the next entry in a queue, return its content if it's valid + * or 0 if there is no new entry. + * + * The queue pointer is moved forward unless "just_peek" is set + */ +static u32 xive_read_eq(struct xive_q *q, bool just_peek) +{ + u32 cur; + + if (!q->qpage) + return 0; + cur = be32_to_cpup(q->qpage + q->idx); + + /* Check valid bit (31) vs current toggle polarity */ + if ((cur >> 31) == q->toggle) + return 0; + + /* If consuming from the queue ... */ + if (!just_peek) { + /* Next entry */ + q->idx = (q->idx + 1) & q->msk; + + /* Wrap around: flip valid toggle */ + if (q->idx == 0) + q->toggle ^= 1; + } + /* Mask out the valid bit (31) */ + return cur & 0x7fffffff; +} + +/* + * Scans all the queue that may have interrupts in them + * (based on "pending_prio") in priority order until an + * interrupt is found or all the queues are empty. + * + * Then updates the CPPR (Current Processor Priority + * Register) based on the most favored interrupt found + * (0xff if none) and return what was found (0 if none). + * + * If just_peek is set, return the most favored pending + * interrupt if any but don't update the queue pointers. + * + * Note: This function can operate generically on any number + * of queues (up to 8). The current implementation of the XIVE + * driver only uses a single queue however. + * + * Note2: This will also "flush" "the pending_count" of a queue + * into the "count" when that queue is observed to be empty. + * This is used to keep track of the amount of interrupts + * targetting a queue. When an interrupt is moved away from + * a queue, we only decrement that queue count once the queue + * has been observed empty to avoid races. + */ +static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek) +{ + u32 irq = 0; + u8 prio; + + /* Find highest pending priority */ + while (xc->pending_prio != 0) { + struct xive_q *q; + + prio = ffs(xc->pending_prio) - 1; + DBG_VERBOSE("scan_irq: trying prio %d\n", prio); + + /* Try to fetch */ + irq = xive_read_eq(&xc->queue[prio], just_peek); + + /* Found something ? That's it */ + if (irq) + break; + + /* Clear pending bits */ + xc->pending_prio &= ~(1 << prio); + + /* + * Check if the queue count needs adjusting due to + * interrupts being moved away. See description of + * xive_dec_target_count() + */ + q = &xc->queue[prio]; + if (atomic_read(&q->pending_count)) { + int p = atomic_xchg(&q->pending_count, 0); + if (p) { + WARN_ON(p > atomic_read(&q->count)); + atomic_sub(p, &q->count); + } + } + } + + /* If nothing was found, set CPPR to 0xff */ + if (irq == 0) + prio = 0xff; + + /* Update HW CPPR to match if necessary */ + if (prio != xc->cppr) { + DBG_VERBOSE("scan_irq: adjusting CPPR to %d\n", prio); + xc->cppr = prio; + out_8(xive_tima + xive_tima_offset + TM_CPPR, prio); + } + + return irq; +} + +/* + * This is used to perform the magic loads from an ESB + * described in xive.h + */ +static u8 xive_poke_esb(struct xive_irq_data *xd, u32 offset) +{ + u64 val; + + /* Handle HW errata */ + if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG) + offset |= offset << 4; + + val = in_be64(xd->eoi_mmio + offset); + + return (u8)val; +} + +#ifdef CONFIG_XMON +static void xive_dump_eq(const char *name, struct xive_q *q) +{ + u32 i0, i1, idx; + + if (!q->qpage) + return; + idx = q->idx; + i0 = be32_to_cpup(q->qpage + idx); + idx = (idx + 1) & q->msk; + i1 = be32_to_cpup(q->qpage + idx); + xmon_printf(" %s Q T=%d %08x %08x ...\n", name, + q->toggle, i0, i1); +} + +void xmon_xive_do_dump(int cpu) +{ + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); + + xmon_printf("XIVE state for CPU %d:\n", cpu); + xmon_printf(" pp=%02x cppr=%02x\n", xc->pending_prio, xc->cppr); + xive_dump_eq("IRQ", &xc->queue[xive_irq_priority]); +#ifdef CONFIG_SMP + { + u64 val = xive_poke_esb(&xc->ipi_data, XIVE_ESB_GET); + xmon_printf(" IPI state: %x:%c%c\n", xc->hw_ipi, + val & XIVE_ESB_VAL_P ? 'P' : 'p', + val & XIVE_ESB_VAL_P ? 'Q' : 'q'); + } +#endif +} +#endif /* CONFIG_XMON */ + +static unsigned int xive_get_irq(void) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + u32 irq; + + /* + * This can be called either as a result of a HW interrupt or + * as a "replay" because EOI decided there was still something + * in one of the queues. + * + * First we perform an ACK cycle in order to update our mask + * of pending priorities. This will also have the effect of + * updating the CPPR to the most favored pending interrupts. + * + * In the future, if we have a way to differenciate a first + * entry (on HW interrupt) from a replay triggered by EOI, + * we could skip this on replays unless we soft-mask tells us + * that a new HW interrupt occurred. + */ + xive_ops->update_pending(xc); + + DBG_VERBOSE("get_irq: pending=%02x\n", xc->pending_prio); + + /* Scan our queue(s) for interrupts */ + irq = xive_scan_interrupts(xc, false); + + DBG_VERBOSE("get_irq: got irq 0x%x, new pending=0x%02x\n", + irq, xc->pending_prio); + + /* Return pending interrupt if any */ + if (irq == XIVE_BAD_IRQ) + return 0; + return irq; +} + +/* + * After EOI'ing an interrupt, we need to re-check the queue + * to see if another interrupt is pending since multiple + * interrupts can coalesce into a single notification to the + * CPU. + * + * If we find that there is indeed more in there, we call + * force_external_irq_replay() to make Linux synthetize an + * external interrupt on the next call to local_irq_restore(). + */ +static void xive_do_queue_eoi(struct xive_cpu *xc) +{ + if (xive_scan_interrupts(xc, true) != 0) { + DBG_VERBOSE("eoi: pending=0x%02x\n", xc->pending_prio); + force_external_irq_replay(); + } +} + +/* + * EOI an interrupt at the source. There are several methods + * to do this depending on the HW version and source type + */ +void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd) +{ + /* If the XIVE supports the new "store EOI facility, use it */ + if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) + out_be64(xd->eoi_mmio, 0); + else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) { + /* + * The FW told us to call it. This happens for some + * interrupt sources that need additional HW whacking + * beyond the ESB manipulation. For example LPC interrupts + * on P9 DD1.0 need a latch to be clared in the LPC bridge + * itself. The Firmware will take care of it. + */ + if (WARN_ON_ONCE(!xive_ops->eoi)) + return; + xive_ops->eoi(hw_irq); + } else { + u8 eoi_val; + + /* + * Otherwise for EOI, we use the special MMIO that does + * a clear of both P and Q and returns the old Q, + * except for LSIs where we use the "EOI cycle" special + * load. + * + * This allows us to then do a re-trigger if Q was set + * rather than synthesizing an interrupt in software + * + * For LSIs, using the HW EOI cycle works around a problem + * on P9 DD1 PHBs where the other ESB accesses don't work + * properly. + */ + if (xd->flags & XIVE_IRQ_FLAG_LSI) + in_be64(xd->eoi_mmio); + else { + eoi_val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_00); + DBG_VERBOSE("eoi_val=%x\n", offset, eoi_val); + + /* Re-trigger if needed */ + if ((eoi_val & XIVE_ESB_VAL_Q) && xd->trig_mmio) + out_be64(xd->trig_mmio, 0); + } + } +} + +/* irq_chip eoi callback */ +static void xive_irq_eoi(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + + DBG_VERBOSE("eoi_irq: irq=%d [0x%lx] pending=%02x\n", + d->irq, irqd_to_hwirq(d), xc->pending_prio); + + /* EOI the source if it hasn't been disabled */ + if (!irqd_irq_disabled(d)) + xive_do_source_eoi(irqd_to_hwirq(d), xd); + + /* + * Clear saved_p to indicate that it's no longer occupying + * a queue slot on the target queue + */ + xd->saved_p = false; + + /* Check for more work in the queue */ + xive_do_queue_eoi(xc); +} + +/* + * Helper used to mask and unmask an interrupt source. This + * is only called for normal interrupts that do not require + * masking/unmasking via firmware. + */ +static void xive_do_source_set_mask(struct xive_irq_data *xd, + bool mask) +{ + u64 val; + + /* + * If the interrupt had P set, it may be in a queue. + * + * We need to make sure we don't re-enable it until it + * has been fetched from that queue and EOId. We keep + * a copy of that P state and use it to restore the + * ESB accordingly on unmask. + */ + if (mask) { + val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_01); + xd->saved_p = !!(val & XIVE_ESB_VAL_P); + } else if (xd->saved_p) + xive_poke_esb(xd, XIVE_ESB_SET_PQ_10); + else + xive_poke_esb(xd, XIVE_ESB_SET_PQ_00); +} + +/* + * Try to chose "cpu" as a new interrupt target. Increments + * the queue accounting for that target if it's not already + * full. + */ +static bool xive_try_pick_target(int cpu) +{ + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); + struct xive_q *q = &xc->queue[xive_irq_priority]; + int max; + + /* + * Calculate max number of interrupts in that queue. + * + * We leave a gap of 1 just in case... + */ + max = (q->msk + 1) - 1; + return !!atomic_add_unless(&q->count, 1, max); +} + +/* + * Un-account an interrupt for a target CPU. We don't directly + * decrement q->count since the interrupt might still be present + * in the queue. + * + * Instead increment a separate counter "pending_count" which + * will be substracted from "count" later when that CPU observes + * the queue to be empty. + */ +static void xive_dec_target_count(int cpu) +{ + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); + struct xive_q *q = &xc->queue[xive_irq_priority]; + + if (unlikely(WARN_ON(cpu < 0 || !xc))) { + pr_err("%s: cpu=%d xc=%p\n", __func__, cpu, xc); + return; + } + + /* + * We increment the "pending count" which will be used + * to decrement the target queue count whenever it's next + * processed and found empty. This ensure that we don't + * decrement while we still have the interrupt there + * occupying a slot. + */ + atomic_inc(&q->pending_count); +} + +/* Find a tentative CPU target in a CPU mask */ +static int xive_find_target_in_mask(const struct cpumask *mask, + unsigned int fuzz) +{ + int cpu, first, num, i; + + /* Pick up a starting point CPU in the mask based on fuzz */ + num = cpumask_weight(mask); + first = fuzz % num; + + /* Locate it */ + cpu = cpumask_first(mask); + for (i = 0; i < first && cpu < nr_cpu_ids; i++) + cpu = cpumask_next(cpu, mask); + + /* Sanity check */ + if (WARN_ON(cpu >= nr_cpu_ids)) + cpu = cpumask_first(cpu_online_mask); + + /* Remember first one to handle wrap-around */ + first = cpu; + + /* + * Now go through the entire mask until we find a valid + * target. + */ + for (;;) { + /* + * We re-check online as the fallback case passes us + * an untested affinity mask + */ + if (cpu_online(cpu) && xive_try_pick_target(cpu)) + return cpu; + cpu = cpumask_next(cpu, mask); + if (cpu == first) + break; + /* Wrap around */ + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(mask); + } + return -1; +} + +/* + * Pick a target CPU for an interrupt. This is done at + * startup or if the affinity is changed in a way that + * invalidates the current target. + */ +static int xive_pick_irq_target(struct irq_data *d, + const struct cpumask *affinity) +{ + static unsigned int fuzz; + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + cpumask_var_t mask; + int cpu = -1; + + /* + * If we have chip IDs, first we try to build a mask of + * CPUs matching the CPU and find a target in there + */ + if (xd->src_chip != XIVE_INVALID_CHIP_ID && + zalloc_cpumask_var(&mask, GFP_ATOMIC)) { + /* Build a mask of matching chip IDs */ + for_each_cpu_and(cpu, affinity, cpu_online_mask) { + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); + if (xc->chip_id == xd->src_chip) + cpumask_set_cpu(cpu, mask); + } + /* Try to find a target */ + if (cpumask_empty(mask)) + cpu = -1; + else + cpu = xive_find_target_in_mask(mask, fuzz++); + free_cpumask_var(mask); + if (cpu >= 0) + return cpu; + fuzz--; + } + + /* No chip IDs, fallback to using the affinity mask */ + return xive_find_target_in_mask(affinity, fuzz++); +} + +static unsigned int xive_irq_startup(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + int target, rc; + + pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n", + d->irq, hw_irq, d); + +#ifdef CONFIG_PCI_MSI + /* + * The generic MSI code returns with the interrupt disabled on the + * card, using the MSI mask bits. Firmware doesn't appear to unmask + * at that level, so we do it here by hand. + */ + if (irq_data_get_msi_desc(d)) + pci_msi_unmask_irq(d); +#endif + + /* Pick a target */ + target = xive_pick_irq_target(d, irq_data_get_affinity_mask(d)); + if (target == XIVE_INVALID_TARGET) { + /* Try again breaking affinity */ + target = xive_pick_irq_target(d, cpu_online_mask); + if (target == XIVE_INVALID_TARGET) + return -ENXIO; + pr_warn("irq %d started with broken affinity\n", d->irq); + } + + /* Sanity check */ + if (WARN_ON(target == XIVE_INVALID_TARGET || + target >= nr_cpu_ids)) + target = smp_processor_id(); + + xd->target = target; + + /* + * Configure the logical number to be the Linux IRQ number + * and set the target queue + */ + rc = xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(target), + xive_irq_priority, d->irq); + if (rc) + return rc; + + /* Unmask the ESB */ + xive_do_source_set_mask(xd, false); + + return 0; +} + +static void xive_irq_shutdown(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + + pr_devel("xive_irq_shutdown: irq %d [0x%x] data @%p\n", + d->irq, hw_irq, d); + + if (WARN_ON(xd->target == XIVE_INVALID_TARGET)) + return; + + /* Mask the interrupt at the source */ + xive_do_source_set_mask(xd, true); + + /* + * The above may have set saved_p. We clear it otherwise it + * will prevent re-enabling later on. It is ok to forget the + * fact that the interrupt might be in a queue because we are + * accounting that already in xive_dec_target_count() and will + * be re-routing it to a new queue with proper accounting when + * it's started up again + */ + xd->saved_p = false; + + /* + * Mask the interrupt in HW in the IVT/EAS and set the number + * to be the "bad" IRQ number + */ + xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(xd->target), + 0xff, XIVE_BAD_IRQ); + + xive_dec_target_count(xd->target); + xd->target = XIVE_INVALID_TARGET; +} + +static void xive_irq_unmask(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + pr_devel("xive_irq_unmask: irq %d data @%p\n", d->irq, xd); + + /* + * This is a workaround for PCI LSI problems on P9, for + * these, we call FW to set the mask. The problems might + * be fixed by P9 DD2.0, if that is the case, firmware + * will no longer set that flag. + */ + if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) { + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(xd->target), + xive_irq_priority, d->irq); + return; + } + + xive_do_source_set_mask(xd, false); +} + +static void xive_irq_mask(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + pr_devel("xive_irq_mask: irq %d data @%p\n", d->irq, xd); + + /* + * This is a workaround for PCI LSI problems on P9, for + * these, we call OPAL to set the mask. The problems might + * be fixed by P9 DD2.0, if that is the case, firmware + * will no longer set that flag. + */ + if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) { + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(xd->target), + 0xff, d->irq); + return; + } + + xive_do_source_set_mask(xd, true); +} + +static int xive_irq_set_affinity(struct irq_data *d, + const struct cpumask *cpumask, + bool force) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + u32 target, old_target; + int rc = 0; + + pr_devel("xive_irq_set_affinity: irq %d\n", d->irq); + + /* Is this valid ? */ + if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids) + return -EINVAL; + + /* + * If existing target is already in the new mask, and is + * online then do nothing. + */ + if (xd->target != XIVE_INVALID_TARGET && + cpu_online(xd->target) && + cpumask_test_cpu(xd->target, cpumask)) + return IRQ_SET_MASK_OK; + + /* Pick a new target */ + target = xive_pick_irq_target(d, cpumask); + + /* No target found */ + if (target == XIVE_INVALID_TARGET) + return -ENXIO; + + /* Sanity check */ + if (WARN_ON(target >= nr_cpu_ids)) + target = smp_processor_id(); + + old_target = xd->target; + + rc = xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(target), + xive_irq_priority, d->irq); + if (rc < 0) { + pr_err("Error %d reconfiguring irq %d\n", rc, d->irq); + return rc; + } + + pr_devel(" target: 0x%x\n", target); + xd->target = target; + + /* Give up previous target */ + if (old_target != XIVE_INVALID_TARGET) + xive_dec_target_count(old_target); + + return IRQ_SET_MASK_OK; +} + +static int xive_irq_set_type(struct irq_data *d, unsigned int flow_type) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + /* + * We only support these. This has really no effect other than setting + * the corresponding descriptor bits mind you but those will in turn + * affect the resend function when re-enabling an edge interrupt. + * + * Set set the default to edge as explained in map(). + */ + if (flow_type == IRQ_TYPE_DEFAULT || flow_type == IRQ_TYPE_NONE) + flow_type = IRQ_TYPE_EDGE_RISING; + + if (flow_type != IRQ_TYPE_EDGE_RISING && + flow_type != IRQ_TYPE_LEVEL_LOW) + return -EINVAL; + + irqd_set_trigger_type(d, flow_type); + + /* + * Double check it matches what the FW thinks + * + * NOTE: We don't know yet if the PAPR interface will provide + * the LSI vs MSI information apart from the device-tree so + * this check might have to move into an optional backend call + * that is specific to the native backend + */ + if ((flow_type == IRQ_TYPE_LEVEL_LOW) != + !!(xd->flags & XIVE_IRQ_FLAG_LSI)) { + pr_warn("Interrupt %d (HW 0x%x) type mismatch, Linux says %s, FW says %s\n", + d->irq, (u32)irqd_to_hwirq(d), + (flow_type == IRQ_TYPE_LEVEL_LOW) ? "Level" : "Edge", + (xd->flags & XIVE_IRQ_FLAG_LSI) ? "Level" : "Edge"); + } + + return IRQ_SET_MASK_OK_NOCOPY; +} + +static int xive_irq_retrigger(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + /* This should be only for MSIs */ + if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI)) + return 0; + + /* + * To perform a retrigger, we first set the PQ bits to + * 11, then perform an EOI. + */ + xive_poke_esb(xd, XIVE_ESB_SET_PQ_11); + + /* + * Note: We pass "0" to the hw_irq argument in order to + * avoid calling into the backend EOI code which we don't + * want to do in the case of a re-trigger. Backends typically + * only do EOI for LSIs anyway. + */ + xive_do_source_eoi(0, xd); + + return 1; +} + +static struct irq_chip xive_irq_chip = { + .name = "XIVE-IRQ", + .irq_startup = xive_irq_startup, + .irq_shutdown = xive_irq_shutdown, + .irq_eoi = xive_irq_eoi, + .irq_mask = xive_irq_mask, + .irq_unmask = xive_irq_unmask, + .irq_set_affinity = xive_irq_set_affinity, + .irq_set_type = xive_irq_set_type, + .irq_retrigger = xive_irq_retrigger, +}; + +bool is_xive_irq(struct irq_chip *chip) +{ + return chip == &xive_irq_chip; +} + +void xive_cleanup_irq_data(struct xive_irq_data *xd) +{ + if (xd->eoi_mmio) { + iounmap(xd->eoi_mmio); + if (xd->eoi_mmio == xd->trig_mmio) + xd->trig_mmio = NULL; + xd->eoi_mmio = NULL; + } + if (xd->trig_mmio) { + iounmap(xd->trig_mmio); + xd->trig_mmio = NULL; + } +} + +static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw) +{ + struct xive_irq_data *xd; + int rc; + + xd = kzalloc(sizeof(struct xive_irq_data), GFP_KERNEL); + if (!xd) + return -ENOMEM; + rc = xive_ops->populate_irq_data(hw, xd); + if (rc) { + kfree(xd); + return rc; + } + xd->target = XIVE_INVALID_TARGET; + irq_set_handler_data(virq, xd); + + return 0; +} + +static void xive_irq_free_data(unsigned int virq) +{ + struct xive_irq_data *xd = irq_get_handler_data(virq); + + if (!xd) + return; + irq_set_handler_data(virq, NULL); + xive_cleanup_irq_data(xd); + kfree(xd); +} + +#ifdef CONFIG_SMP + +static void xive_cause_ipi(int cpu) +{ + struct xive_cpu *xc; + struct xive_irq_data *xd; + + xc = per_cpu(xive_cpu, cpu); + + DBG_VERBOSE("IPI CPU %d -> %d (HW IRQ 0x%x)\n", + smp_processor_id(), cpu, xc->hw_ipi); + + xd = &xc->ipi_data; + if (WARN_ON(!xd->trig_mmio)) + return; + out_be64(xd->trig_mmio, 0); +} + +static irqreturn_t xive_muxed_ipi_action(int irq, void *dev_id) +{ + return smp_ipi_demux(); +} + +static void xive_ipi_eoi(struct irq_data *d) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + + /* Handle possible race with unplug and drop stale IPIs */ + if (!xc) + return; + xive_do_source_eoi(xc->hw_ipi, &xc->ipi_data); + xive_do_queue_eoi(xc); +} + +static void xive_ipi_do_nothing(struct irq_data *d) +{ + /* + * Nothing to do, we never mask/unmask IPIs, but the callback + * has to exist for the struct irq_chip. + */ +} + +static struct irq_chip xive_ipi_chip = { + .name = "XIVE-IPI", + .irq_eoi = xive_ipi_eoi, + .irq_mask = xive_ipi_do_nothing, + .irq_unmask = xive_ipi_do_nothing, +}; + +static void __init xive_request_ipi(void) +{ + unsigned int virq; + + /* + * Initialization failed, move on, we might manage to + * reach the point where we display our errors before + * the system falls appart + */ + if (!xive_irq_domain) + return; + + /* Initialize it */ + virq = irq_create_mapping(xive_irq_domain, 0); + xive_ipi_irq = virq; + + WARN_ON(request_irq(virq, xive_muxed_ipi_action, + IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL)); +} + +static int xive_setup_cpu_ipi(unsigned int cpu) +{ + struct xive_cpu *xc; + int rc; + + pr_debug("Setting up IPI for CPU %d\n", cpu); + + xc = per_cpu(xive_cpu, cpu); + + /* Check if we are already setup */ + if (xc->hw_ipi != 0) + return 0; + + /* Grab an IPI from the backend, this will populate xc->hw_ipi */ + if (xive_ops->get_ipi(cpu, xc)) + return -EIO; + + /* + * Populate the IRQ data in the xive_cpu structure and + * configure the HW / enable the IPIs. + */ + rc = xive_ops->populate_irq_data(xc->hw_ipi, &xc->ipi_data); + if (rc) { + pr_err("Failed to populate IPI data on CPU %d\n", cpu); + return -EIO; + } + rc = xive_ops->configure_irq(xc->hw_ipi, + get_hard_smp_processor_id(cpu), + xive_irq_priority, xive_ipi_irq); + if (rc) { + pr_err("Failed to map IPI CPU %d\n", cpu); + return -EIO; + } + pr_devel("CPU %d HW IPI %x, virq %d, trig_mmio=%p\n", cpu, + xc->hw_ipi, xive_ipi_irq, xc->ipi_data.trig_mmio); + + /* Unmask it */ + xive_do_source_set_mask(&xc->ipi_data, false); + + return 0; +} + +static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc) +{ + /* Disable the IPI and free the IRQ data */ + + /* Already cleaned up ? */ + if (xc->hw_ipi == 0) + return; + + /* Mask the IPI */ + xive_do_source_set_mask(&xc->ipi_data, true); + + /* + * Note: We don't call xive_cleanup_irq_data() to free + * the mappings as this is called from an IPI on kexec + * which is not a safe environment to call iounmap() + */ + + /* Deconfigure/mask in the backend */ + xive_ops->configure_irq(xc->hw_ipi, hard_smp_processor_id(), + 0xff, xive_ipi_irq); + + /* Free the IPIs in the backend */ + xive_ops->put_ipi(cpu, xc); +} + +void __init xive_smp_probe(void) +{ + smp_ops->cause_ipi = xive_cause_ipi; + + /* Register the IPI */ + xive_request_ipi(); + + /* Allocate and setup IPI for the boot CPU */ + xive_setup_cpu_ipi(smp_processor_id()); +} + +#endif /* CONFIG_SMP */ + +static int xive_irq_domain_map(struct irq_domain *h, unsigned int virq, + irq_hw_number_t hw) +{ + int rc; + + /* + * Mark interrupts as edge sensitive by default so that resend + * actually works. Will fix that up below if needed. + */ + irq_clear_status_flags(virq, IRQ_LEVEL); + +#ifdef CONFIG_SMP + /* IPIs are special and come up with HW number 0 */ + if (hw == 0) { + /* + * IPIs are marked per-cpu. We use separate HW interrupts under + * the hood but associated with the same "linux" interrupt + */ + irq_set_chip_and_handler(virq, &xive_ipi_chip, + handle_percpu_irq); + return 0; + } +#endif + + rc = xive_irq_alloc_data(virq, hw); + if (rc) + return rc; + + irq_set_chip_and_handler(virq, &xive_irq_chip, handle_fasteoi_irq); + + return 0; +} + +static void xive_irq_domain_unmap(struct irq_domain *d, unsigned int virq) +{ + struct irq_data *data = irq_get_irq_data(virq); + unsigned int hw_irq; + + /* XXX Assign BAD number */ + if (!data) + return; + hw_irq = (unsigned int)irqd_to_hwirq(data); + if (hw_irq) + xive_irq_free_data(virq); +} + +static int xive_irq_domain_xlate(struct irq_domain *h, struct device_node *ct, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_flags) + +{ + *out_hwirq = intspec[0]; + + /* + * If intsize is at least 2, we look for the type in the second cell, + * we assume the LSB indicates a level interrupt. + */ + if (intsize > 1) { + if (intspec[1] & 1) + *out_flags = IRQ_TYPE_LEVEL_LOW; + else + *out_flags = IRQ_TYPE_EDGE_RISING; + } else + *out_flags = IRQ_TYPE_LEVEL_LOW; + + return 0; +} + +static int xive_irq_domain_match(struct irq_domain *h, struct device_node *node, + enum irq_domain_bus_token bus_token) +{ + return xive_ops->match(node); +} + +static const struct irq_domain_ops xive_irq_domain_ops = { + .match = xive_irq_domain_match, + .map = xive_irq_domain_map, + .unmap = xive_irq_domain_unmap, + .xlate = xive_irq_domain_xlate, +}; + +static void __init xive_init_host(void) +{ + xive_irq_domain = irq_domain_add_nomap(NULL, XIVE_MAX_IRQ, + &xive_irq_domain_ops, NULL); + if (WARN_ON(xive_irq_domain == NULL)) + return; + irq_set_default_host(xive_irq_domain); +} + +static void xive_cleanup_cpu_queues(unsigned int cpu, struct xive_cpu *xc) +{ + if (xc->queue[xive_irq_priority].qpage) + xive_ops->cleanup_queue(cpu, xc, xive_irq_priority); +} + +static int xive_setup_cpu_queues(unsigned int cpu, struct xive_cpu *xc) +{ + int rc = 0; + + /* We setup 1 queues for now with a 64k page */ + if (!xc->queue[xive_irq_priority].qpage) + rc = xive_ops->setup_queue(cpu, xc, xive_irq_priority); + + return rc; +} + +static int xive_prepare_cpu(unsigned int cpu) +{ + struct xive_cpu *xc; + + xc = per_cpu(xive_cpu, cpu); + if (!xc) { + struct device_node *np; + + xc = kzalloc_node(sizeof(struct xive_cpu), + GFP_KERNEL, cpu_to_node(cpu)); + if (!xc) + return -ENOMEM; + np = of_get_cpu_node(cpu, NULL); + if (np) + xc->chip_id = of_get_ibm_chip_id(np); + of_node_put(np); + + per_cpu(xive_cpu, cpu) = xc; + } + + /* Setup EQs if not already */ + return xive_setup_cpu_queues(cpu, xc); +} + +static void xive_setup_cpu(void) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + + /* Debug: Dump the TM state */ + pr_devel("CPU %d [HW 0x%02x] VT=%02x\n", + smp_processor_id(), hard_smp_processor_id(), + in_8(xive_tima + xive_tima_offset + TM_WORD2)); + + /* The backend might have additional things to do */ + if (xive_ops->setup_cpu) + xive_ops->setup_cpu(smp_processor_id(), xc); + + /* Set CPPR to 0xff to enable flow of interrupts */ + xc->cppr = 0xff; + out_8(xive_tima + xive_tima_offset + TM_CPPR, 0xff); +} + +#ifdef CONFIG_SMP +void xive_smp_setup_cpu(void) +{ + pr_devel("SMP setup CPU %d\n", smp_processor_id()); + + /* This will have already been done on the boot CPU */ + if (smp_processor_id() != boot_cpuid) + xive_setup_cpu(); + +} + +int xive_smp_prepare_cpu(unsigned int cpu) +{ + int rc; + + /* Allocate per-CPU data and queues */ + rc = xive_prepare_cpu(cpu); + if (rc) + return rc; + + /* Allocate and setup IPI for the new CPU */ + return xive_setup_cpu_ipi(cpu); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void xive_flush_cpu_queue(unsigned int cpu, struct xive_cpu *xc) +{ + u32 irq; + + /* We assume local irqs are disabled */ + WARN_ON(!irqs_disabled()); + + /* Check what's already in the CPU queue */ + while ((irq = xive_scan_interrupts(xc, false)) != 0) { + /* + * We need to re-route that interrupt to its new destination. + * First get and lock the descriptor + */ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_data *d = irq_desc_get_irq_data(desc); + struct xive_irq_data *xd; + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + + /* + * Ignore anything that isn't a XIVE irq and ignore + * IPIs, so can just be dropped. + */ + if (d->domain != xive_irq_domain || hw_irq == 0) + continue; + + /* + * The IRQ should have already been re-routed, it's just a + * stale in the old queue, so re-trigger it in order to make + * it reach is new destination. + */ +#ifdef DEBUG_FLUSH + pr_info("CPU %d: Got irq %d while offline, re-sending...\n", + cpu, irq); +#endif + raw_spin_lock(&desc->lock); + xd = irq_desc_get_handler_data(desc); + + /* + * For LSIs, we EOI, this will cause a resend if it's + * still asserted. Otherwise do an MSI retrigger. + */ + if (xd->flags & XIVE_IRQ_FLAG_LSI) + xive_do_source_eoi(irqd_to_hwirq(d), xd); + else + xive_irq_retrigger(d); + + raw_spin_unlock(&desc->lock); + } +} + +void xive_smp_disable_cpu(void) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + unsigned int cpu = smp_processor_id(); + + /* Migrate interrupts away from the CPU */ + irq_migrate_all_off_this_cpu(); + + /* Set CPPR to 0 to disable flow of interrupts */ + xc->cppr = 0; + out_8(xive_tima + xive_tima_offset + TM_CPPR, 0); + + /* Flush everything still in the queue */ + xive_flush_cpu_queue(cpu, xc); + + /* Re-enable CPPR */ + xc->cppr = 0xff; + out_8(xive_tima + xive_tima_offset + TM_CPPR, 0xff); +} + +void xive_flush_interrupt(void) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + unsigned int cpu = smp_processor_id(); + + /* Called if an interrupt occurs while the CPU is hot unplugged */ + xive_flush_cpu_queue(cpu, xc); +} + +#endif /* CONFIG_HOTPLUG_CPU */ + +#endif /* CONFIG_SMP */ + +void xive_kexec_teardown_cpu(int secondary) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + unsigned int cpu = smp_processor_id(); + + /* Set CPPR to 0 to disable flow of interrupts */ + xc->cppr = 0; + out_8(xive_tima + xive_tima_offset + TM_CPPR, 0); + + /* Backend cleanup if any */ + if (xive_ops->teardown_cpu) + xive_ops->teardown_cpu(cpu, xc); + +#ifdef CONFIG_SMP + /* Get rid of IPI */ + xive_cleanup_cpu_ipi(cpu, xc); +#endif + + /* Disable and free the queues */ + xive_cleanup_cpu_queues(cpu, xc); +} + +void xive_shutdown(void) +{ + xive_ops->shutdown(); +} + +bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset, + u8 max_prio) +{ + xive_tima = area; + xive_tima_offset = offset; + xive_ops = ops; + xive_irq_priority = max_prio; + + ppc_md.get_irq = xive_get_irq; + __xive_enabled = true; + + pr_devel("Initializing host..\n"); + xive_init_host(); + + pr_devel("Initializing boot CPU..\n"); + + /* Allocate per-CPU data and queues */ + xive_prepare_cpu(smp_processor_id()); + + /* Get ready for interrupts */ + xive_setup_cpu(); + + pr_info("Interrupt handling intialized with %s backend\n", + xive_ops->name); + pr_info("Using priority %d for all interrupts\n", max_prio); + + return true; +} + +static int __init xive_off(char *arg) +{ + xive_cmdline_disabled = true; + return 0; +} +__setup("xive=off", xive_off); diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c new file mode 100644 index 000000000000..1a726229a427 --- /dev/null +++ b/arch/powerpc/sysdev/xive/native.c @@ -0,0 +1,640 @@ +/* + * Copyright 2016,2017 IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "xive: " fmt + +#include <linux/types.h> +#include <linux/irq.h> +#include <linux/debugfs.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/seq_file.h> +#include <linux/init.h> +#include <linux/of.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/delay.h> +#include <linux/cpumask.h> +#include <linux/mm.h> + +#include <asm/prom.h> +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/irq.h> +#include <asm/errno.h> +#include <asm/xive.h> +#include <asm/xive-regs.h> +#include <asm/opal.h> + +#include "xive-internal.h" + + +static u32 xive_provision_size; +static u32 *xive_provision_chips; +static u32 xive_provision_chip_count; +static u32 xive_queue_shift; +static u32 xive_pool_vps = XIVE_INVALID_VP; +static struct kmem_cache *xive_provision_cache; + +int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data) +{ + __be64 flags, eoi_page, trig_page; + __be32 esb_shift, src_chip; + u64 opal_flags; + s64 rc; + + memset(data, 0, sizeof(*data)); + + rc = opal_xive_get_irq_info(hw_irq, &flags, &eoi_page, &trig_page, + &esb_shift, &src_chip); + if (rc) { + pr_err("opal_xive_get_irq_info(0x%x) returned %lld\n", + hw_irq, rc); + return -EINVAL; + } + + opal_flags = be64_to_cpu(flags); + if (opal_flags & OPAL_XIVE_IRQ_STORE_EOI) + data->flags |= XIVE_IRQ_FLAG_STORE_EOI; + if (opal_flags & OPAL_XIVE_IRQ_LSI) + data->flags |= XIVE_IRQ_FLAG_LSI; + if (opal_flags & OPAL_XIVE_IRQ_SHIFT_BUG) + data->flags |= XIVE_IRQ_FLAG_SHIFT_BUG; + if (opal_flags & OPAL_XIVE_IRQ_MASK_VIA_FW) + data->flags |= XIVE_IRQ_FLAG_MASK_FW; + if (opal_flags & OPAL_XIVE_IRQ_EOI_VIA_FW) + data->flags |= XIVE_IRQ_FLAG_EOI_FW; + data->eoi_page = be64_to_cpu(eoi_page); + data->trig_page = be64_to_cpu(trig_page); + data->esb_shift = be32_to_cpu(esb_shift); + data->src_chip = be32_to_cpu(src_chip); + + data->eoi_mmio = ioremap(data->eoi_page, 1u << data->esb_shift); + if (!data->eoi_mmio) { + pr_err("Failed to map EOI page for irq 0x%x\n", hw_irq); + return -ENOMEM; + } + + if (!data->trig_page) + return 0; + if (data->trig_page == data->eoi_page) { + data->trig_mmio = data->eoi_mmio; + return 0; + } + + data->trig_mmio = ioremap(data->trig_page, 1u << data->esb_shift); + if (!data->trig_mmio) { + pr_err("Failed to map trigger page for irq 0x%x\n", hw_irq); + return -ENOMEM; + } + return 0; +} + +int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq) +{ + s64 rc; + + for (;;) { + rc = opal_xive_set_irq_config(hw_irq, target, prio, sw_irq); + if (rc != OPAL_BUSY) + break; + msleep(1); + } + return rc == 0 ? 0 : -ENXIO; +} + +/* This can be called multiple time to change a queue configuration */ +int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, + __be32 *qpage, u32 order, bool can_escalate) +{ + s64 rc = 0; + __be64 qeoi_page_be; + __be32 esc_irq_be; + u64 flags, qpage_phys; + + /* If there's an actual queue page, clean it */ + if (order) { + if (WARN_ON(!qpage)) + return -EINVAL; + qpage_phys = __pa(qpage); + } else + qpage_phys = 0; + + /* Initialize the rest of the fields */ + q->msk = order ? ((1u << (order - 2)) - 1) : 0; + q->idx = 0; + q->toggle = 0; + + rc = opal_xive_get_queue_info(vp_id, prio, NULL, NULL, + &qeoi_page_be, + &esc_irq_be, + NULL); + if (rc) { + pr_err("Error %lld getting queue info prio %d\n", rc, prio); + rc = -EIO; + goto fail; + } + q->eoi_phys = be64_to_cpu(qeoi_page_be); + + /* Default flags */ + flags = OPAL_XIVE_EQ_ALWAYS_NOTIFY | OPAL_XIVE_EQ_ENABLED; + + /* Escalation needed ? */ + if (can_escalate) { + q->esc_irq = be32_to_cpu(esc_irq_be); + flags |= OPAL_XIVE_EQ_ESCALATE; + } + + /* Configure and enable the queue in HW */ + for (;;) { + rc = opal_xive_set_queue_info(vp_id, prio, qpage_phys, order, flags); + if (rc != OPAL_BUSY) + break; + msleep(1); + } + if (rc) { + pr_err("Error %lld setting queue for prio %d\n", rc, prio); + rc = -EIO; + } else { + /* + * KVM code requires all of the above to be visible before + * q->qpage is set due to how it manages IPI EOIs + */ + wmb(); + q->qpage = qpage; + } +fail: + return rc; +} + +static void __xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio) +{ + s64 rc; + + /* Disable the queue in HW */ + for (;;) { + rc = opal_xive_set_queue_info(vp_id, prio, 0, 0, 0); + if (rc != OPAL_BUSY) + break; + msleep(1); + } + if (rc) + pr_err("Error %lld disabling queue for prio %d\n", rc, prio); +} + +void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio) +{ + __xive_native_disable_queue(vp_id, q, prio); +} + +static int xive_native_setup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio) +{ + struct xive_q *q = &xc->queue[prio]; + unsigned int alloc_order; + struct page *pages; + __be32 *qpage; + + alloc_order = (xive_queue_shift > PAGE_SHIFT) ? + (xive_queue_shift - PAGE_SHIFT) : 0; + pages = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, alloc_order); + if (!pages) + return -ENOMEM; + qpage = (__be32 *)page_address(pages); + memset(qpage, 0, 1 << xive_queue_shift); + return xive_native_configure_queue(get_hard_smp_processor_id(cpu), + q, prio, qpage, xive_queue_shift, false); +} + +static void xive_native_cleanup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio) +{ + struct xive_q *q = &xc->queue[prio]; + unsigned int alloc_order; + + /* + * We use the variant with no iounmap as this is called on exec + * from an IPI and iounmap isn't safe + */ + __xive_native_disable_queue(get_hard_smp_processor_id(cpu), q, prio); + alloc_order = (xive_queue_shift > PAGE_SHIFT) ? + (xive_queue_shift - PAGE_SHIFT) : 0; + free_pages((unsigned long)q->qpage, alloc_order); + q->qpage = NULL; +} + +static bool xive_native_match(struct device_node *node) +{ + return of_device_is_compatible(node, "ibm,opal-xive-vc"); +} + +#ifdef CONFIG_SMP +static int xive_native_get_ipi(unsigned int cpu, struct xive_cpu *xc) +{ + struct device_node *np; + unsigned int chip_id; + s64 irq; + + /* Find the chip ID */ + np = of_get_cpu_node(cpu, NULL); + if (np) { + if (of_property_read_u32(np, "ibm,chip-id", &chip_id) < 0) + chip_id = 0; + } + + /* Allocate an IPI and populate info about it */ + for (;;) { + irq = opal_xive_allocate_irq(chip_id); + if (irq == OPAL_BUSY) { + msleep(1); + continue; + } + if (irq < 0) { + pr_err("Failed to allocate IPI on CPU %d\n", cpu); + return -ENXIO; + } + xc->hw_ipi = irq; + break; + } + return 0; +} + +u32 xive_native_alloc_irq(void) +{ + s64 rc; + + for (;;) { + rc = opal_xive_allocate_irq(OPAL_XIVE_ANY_CHIP); + if (rc != OPAL_BUSY) + break; + msleep(1); + } + if (rc < 0) + return 0; + return rc; +} + +void xive_native_free_irq(u32 irq) +{ + for (;;) { + s64 rc = opal_xive_free_irq(irq); + if (rc != OPAL_BUSY) + break; + msleep(1); + } +} + +static void xive_native_put_ipi(unsigned int cpu, struct xive_cpu *xc) +{ + s64 rc; + + /* Free the IPI */ + if (!xc->hw_ipi) + return; + for (;;) { + rc = opal_xive_free_irq(xc->hw_ipi); + if (rc == OPAL_BUSY) { + msleep(1); + continue; + } + xc->hw_ipi = 0; + break; + } +} +#endif /* CONFIG_SMP */ + +static void xive_native_shutdown(void) +{ + /* Switch the XIVE to emulation mode */ + opal_xive_reset(OPAL_XIVE_MODE_EMU); +} + +/* + * Perform an "ack" cycle on the current thread, thus + * grabbing the pending active priorities and updating + * the CPPR to the most favored one. + */ +static void xive_native_update_pending(struct xive_cpu *xc) +{ + u8 he, cppr; + u16 ack; + + /* Perform the acknowledge hypervisor to register cycle */ + ack = be16_to_cpu(__raw_readw(xive_tima + TM_SPC_ACK_HV_REG)); + + /* Synchronize subsequent queue accesses */ + mb(); + + /* + * Grab the CPPR and the "HE" field which indicates the source + * of the hypervisor interrupt (if any) + */ + cppr = ack & 0xff; + he = GETFIELD(TM_QW3_NSR_HE, (ack >> 8)); + switch(he) { + case TM_QW3_NSR_HE_NONE: /* Nothing to see here */ + break; + case TM_QW3_NSR_HE_PHYS: /* Physical thread interrupt */ + if (cppr == 0xff) + return; + /* Mark the priority pending */ + xc->pending_prio |= 1 << cppr; + + /* + * A new interrupt should never have a CPPR less favored + * than our current one. + */ + if (cppr >= xc->cppr) + pr_err("CPU %d odd ack CPPR, got %d at %d\n", + smp_processor_id(), cppr, xc->cppr); + + /* Update our idea of what the CPPR is */ + xc->cppr = cppr; + break; + case TM_QW3_NSR_HE_POOL: /* HV Pool interrupt (unused) */ + case TM_QW3_NSR_HE_LSI: /* Legacy FW LSI (unused) */ + pr_err("CPU %d got unexpected interrupt type HE=%d\n", + smp_processor_id(), he); + return; + } +} + +static void xive_native_eoi(u32 hw_irq) +{ + /* + * Not normally used except if specific interrupts need + * a workaround on EOI. + */ + opal_int_eoi(hw_irq); +} + +static void xive_native_setup_cpu(unsigned int cpu, struct xive_cpu *xc) +{ + s64 rc; + u32 vp; + __be64 vp_cam_be; + u64 vp_cam; + + if (xive_pool_vps == XIVE_INVALID_VP) + return; + + /* Enable the pool VP */ + vp = xive_pool_vps + get_hard_smp_processor_id(cpu); + pr_debug("CPU %d setting up pool VP 0x%x\n", cpu, vp); + for (;;) { + rc = opal_xive_set_vp_info(vp, OPAL_XIVE_VP_ENABLED, 0); + if (rc != OPAL_BUSY) + break; + msleep(1); + } + if (rc) { + pr_err("Failed to enable pool VP on CPU %d\n", cpu); + return; + } + + /* Grab it's CAM value */ + rc = opal_xive_get_vp_info(vp, NULL, &vp_cam_be, NULL, NULL); + if (rc) { + pr_err("Failed to get pool VP info CPU %d\n", cpu); + return; + } + vp_cam = be64_to_cpu(vp_cam_be); + + pr_debug("VP CAM = %llx\n", vp_cam); + + /* Push it on the CPU (set LSMFB to 0xff to skip backlog scan) */ + pr_debug("(Old HW value: %08x)\n", + in_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2)); + out_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD0, 0xff); + out_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2, + TM_QW2W2_VP | vp_cam); + pr_debug("(New HW value: %08x)\n", + in_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2)); +} + +static void xive_native_teardown_cpu(unsigned int cpu, struct xive_cpu *xc) +{ + s64 rc; + u32 vp; + + if (xive_pool_vps == XIVE_INVALID_VP) + return; + + /* Pull the pool VP from the CPU */ + in_be64(xive_tima + TM_SPC_PULL_POOL_CTX); + + /* Disable it */ + vp = xive_pool_vps + get_hard_smp_processor_id(cpu); + for (;;) { + rc = opal_xive_set_vp_info(vp, 0, 0); + if (rc != OPAL_BUSY) + break; + msleep(1); + } +} + +static void xive_native_sync_source(u32 hw_irq) +{ + opal_xive_sync(XIVE_SYNC_EAS, hw_irq); +} + +static const struct xive_ops xive_native_ops = { + .populate_irq_data = xive_native_populate_irq_data, + .configure_irq = xive_native_configure_irq, + .setup_queue = xive_native_setup_queue, + .cleanup_queue = xive_native_cleanup_queue, + .match = xive_native_match, + .shutdown = xive_native_shutdown, + .update_pending = xive_native_update_pending, + .eoi = xive_native_eoi, + .setup_cpu = xive_native_setup_cpu, + .teardown_cpu = xive_native_teardown_cpu, + .sync_source = xive_native_sync_source, +#ifdef CONFIG_SMP + .get_ipi = xive_native_get_ipi, + .put_ipi = xive_native_put_ipi, +#endif /* CONFIG_SMP */ + .name = "native", +}; + +static bool xive_parse_provisioning(struct device_node *np) +{ + int rc; + + if (of_property_read_u32(np, "ibm,xive-provision-page-size", + &xive_provision_size) < 0) + return true; + rc = of_property_count_elems_of_size(np, "ibm,xive-provision-chips", 4); + if (rc < 0) { + pr_err("Error %d getting provision chips array\n", rc); + return false; + } + xive_provision_chip_count = rc; + if (rc == 0) + return true; + + xive_provision_chips = kzalloc(4 * xive_provision_chip_count, + GFP_KERNEL); + if (WARN_ON(!xive_provision_chips)) + return false; + + rc = of_property_read_u32_array(np, "ibm,xive-provision-chips", + xive_provision_chips, + xive_provision_chip_count); + if (rc < 0) { + pr_err("Error %d reading provision chips array\n", rc); + return false; + } + + xive_provision_cache = kmem_cache_create("xive-provision", + xive_provision_size, + xive_provision_size, + 0, NULL); + if (!xive_provision_cache) { + pr_err("Failed to allocate provision cache\n"); + return false; + } + return true; +} + +u32 xive_native_default_eq_shift(void) +{ + return xive_queue_shift; +} + +bool xive_native_init(void) +{ + struct device_node *np; + struct resource r; + void __iomem *tima; + struct property *prop; + u8 max_prio = 7; + const __be32 *p; + u32 val; + s64 rc; + + if (xive_cmdline_disabled) + return false; + + pr_devel("xive_native_init()\n"); + np = of_find_compatible_node(NULL, NULL, "ibm,opal-xive-pe"); + if (!np) { + pr_devel("not found !\n"); + return false; + } + pr_devel("Found %s\n", np->full_name); + + /* Resource 1 is HV window */ + if (of_address_to_resource(np, 1, &r)) { + pr_err("Failed to get thread mgmnt area resource\n"); + return false; + } + tima = ioremap(r.start, resource_size(&r)); + if (!tima) { + pr_err("Failed to map thread mgmnt area\n"); + return false; + } + + /* Read number of priorities */ + if (of_property_read_u32(np, "ibm,xive-#priorities", &val) == 0) + max_prio = val - 1; + + /* Iterate the EQ sizes and pick one */ + of_property_for_each_u32(np, "ibm,xive-eq-sizes", prop, p, val) { + xive_queue_shift = val; + if (val == PAGE_SHIFT) + break; + } + + /* Grab size of provisioning pages */ + xive_parse_provisioning(np); + + /* Switch the XIVE to exploitation mode */ + rc = opal_xive_reset(OPAL_XIVE_MODE_EXPL); + if (rc) { + pr_err("Switch to exploitation mode failed with error %lld\n", rc); + return false; + } + + /* Initialize XIVE core with our backend */ + if (!xive_core_init(&xive_native_ops, tima, TM_QW3_HV_PHYS, + max_prio)) { + opal_xive_reset(OPAL_XIVE_MODE_EMU); + return false; + } + pr_info("Using %dkB queues\n", 1 << (xive_queue_shift - 10)); + return true; +} + +static bool xive_native_provision_pages(void) +{ + u32 i; + void *p; + + for (i = 0; i < xive_provision_chip_count; i++) { + u32 chip = xive_provision_chips[i]; + + /* + * XXX TODO: Try to make the allocation local to the node where + * the chip resides. + */ + p = kmem_cache_alloc(xive_provision_cache, GFP_KERNEL); + if (!p) { + pr_err("Failed to allocate provisioning page\n"); + return false; + } + opal_xive_donate_page(chip, __pa(p)); + } + return true; +} + +u32 xive_native_alloc_vp_block(u32 max_vcpus) +{ + s64 rc; + u32 order; + + order = fls(max_vcpus) - 1; + if (max_vcpus > (1 << order)) + order++; + + pr_info("VP block alloc, for max VCPUs %d use order %d\n", + max_vcpus, order); + + for (;;) { + rc = opal_xive_alloc_vp_block(order); + switch (rc) { + case OPAL_BUSY: + msleep(1); + break; + case OPAL_XIVE_PROVISIONING: + if (!xive_native_provision_pages()) + return XIVE_INVALID_VP; + break; + default: + if (rc < 0) { + pr_err("OPAL failed to allocate VCPUs order %d, err %lld\n", + order, rc); + return XIVE_INVALID_VP; + } + return rc; + } + } +} +EXPORT_SYMBOL_GPL(xive_native_alloc_vp_block); + +void xive_native_free_vp_block(u32 vp_base) +{ + s64 rc; + + if (vp_base == XIVE_INVALID_VP) + return; + + rc = opal_xive_free_vp_block(vp_base); + if (rc < 0) + pr_warn("OPAL error %lld freeing VP block\n", rc); +} +EXPORT_SYMBOL_GPL(xive_native_free_vp_block); diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h new file mode 100644 index 000000000000..d07ef2d29caf --- /dev/null +++ b/arch/powerpc/sysdev/xive/xive-internal.h @@ -0,0 +1,62 @@ +/* + * Copyright 2016,2017 IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef __XIVE_INTERNAL_H +#define __XIVE_INTERNAL_H + +/* Each CPU carry one of these with various per-CPU state */ +struct xive_cpu { +#ifdef CONFIG_SMP + /* HW irq number and data of IPI */ + u32 hw_ipi; + struct xive_irq_data ipi_data; +#endif /* CONFIG_SMP */ + + int chip_id; + + /* Queue datas. Only one is populated */ +#define XIVE_MAX_QUEUES 8 + struct xive_q queue[XIVE_MAX_QUEUES]; + + /* + * Pending mask. Each bit corresponds to a priority that + * potentially has pending interrupts. + */ + u8 pending_prio; + + /* Cache of HW CPPR */ + u8 cppr; +}; + +/* Backend ops */ +struct xive_ops { + int (*populate_irq_data)(u32 hw_irq, struct xive_irq_data *data); + int (*configure_irq)(u32 hw_irq, u32 target, u8 prio, u32 sw_irq); + int (*setup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio); + void (*cleanup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio); + void (*setup_cpu)(unsigned int cpu, struct xive_cpu *xc); + void (*teardown_cpu)(unsigned int cpu, struct xive_cpu *xc); + bool (*match)(struct device_node *np); + void (*shutdown)(void); + + void (*update_pending)(struct xive_cpu *xc); + void (*eoi)(u32 hw_irq); + void (*sync_source)(u32 hw_irq); +#ifdef CONFIG_SMP + int (*get_ipi)(unsigned int cpu, struct xive_cpu *xc); + void (*put_ipi)(unsigned int cpu, struct xive_cpu *xc); +#endif + const char *name; +}; + +bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset, + u8 max_prio); + +extern bool xive_cmdline_disabled; + +#endif /* __XIVE_INTERNAL_H */ |