diff options
Diffstat (limited to 'arch/powerpc/platforms/powernv')
19 files changed, 937 insertions, 878 deletions
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 703a350a7f4e..b540ce8eec55 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -6,7 +6,7 @@ obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o -obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o +obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o obj-$(CONFIG_CXL_BASE) += pci-cxl.o obj-$(CONFIG_EEH) += eeh-powernv.o obj-$(CONFIG_PPC_SCOM) += opal-xscom.o diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index ddfc3544d285..3c1beae29f2d 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -223,6 +223,14 @@ int pnv_eeh_post_init(void) eeh_probe_devices(); eeh_addr_cache_build(); + if (eeh_has_flag(EEH_POSTPONED_PROBE)) { + eeh_clear_flag(EEH_POSTPONED_PROBE); + if (eeh_enabled()) + pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); + else + pr_info("EEH: No capable adapters found\n"); + } + /* Register OPAL event notifier */ eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR)); if (eeh_event_irq < 0) { @@ -384,8 +392,10 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) return NULL; /* Skip if we haven't probed yet */ - if (phb->ioda.pe_rmap[config_addr] == IODA_INVALID_PE) + if (phb->ioda.pe_rmap[config_addr] == IODA_INVALID_PE) { + eeh_add_flag(EEH_POSTPONED_PROBE); return NULL; + } /* Initialize eeh device */ edev->class_code = pdn->class_code; diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 1c5d0675b43c..35f699ebb662 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -36,6 +36,8 @@ #define P9_STOP_SPR_PSSCR 855 static u32 
supported_cpuidle_states; +struct pnv_idle_states_t *pnv_idle_states; +int nr_pnv_idle_states; /* * The default stop state that will be used by ppc_md.power_save @@ -177,11 +179,6 @@ static void pnv_alloc_idle_core_states(void) paca_ptrs[cpu]->core_idle_state_ptr = core_idle_state; paca_ptrs[cpu]->thread_idle_state = PNV_THREAD_RUNNING; paca_ptrs[cpu]->thread_mask = 1 << j; - if (!cpu_has_feature(CPU_FTR_POWER9_DD1)) - continue; - paca_ptrs[cpu]->thread_sibling_pacas = - kmalloc_node(paca_ptr_array_size, - GFP_KERNEL, node); } } @@ -622,48 +619,10 @@ int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags) * @dt_idle_states: Number of idle state entries * Returns 0 on success */ -static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags, - int dt_idle_states) +static int __init pnv_power9_idle_init(void) { - u64 *psscr_val = NULL; - u64 *psscr_mask = NULL; - u32 *residency_ns = NULL; u64 max_residency_ns = 0; - int rc = 0, i; - - psscr_val = kcalloc(dt_idle_states, sizeof(*psscr_val), GFP_KERNEL); - psscr_mask = kcalloc(dt_idle_states, sizeof(*psscr_mask), GFP_KERNEL); - residency_ns = kcalloc(dt_idle_states, sizeof(*residency_ns), - GFP_KERNEL); - - if (!psscr_val || !psscr_mask || !residency_ns) { - rc = -1; - goto out; - } - - if (of_property_read_u64_array(np, - "ibm,cpu-idle-state-psscr", - psscr_val, dt_idle_states)) { - pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr in DT\n"); - rc = -1; - goto out; - } - - if (of_property_read_u64_array(np, - "ibm,cpu-idle-state-psscr-mask", - psscr_mask, dt_idle_states)) { - pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr-mask in DT\n"); - rc = -1; - goto out; - } - - if (of_property_read_u32_array(np, - "ibm,cpu-idle-state-residency-ns", - residency_ns, dt_idle_states)) { - pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-residency-ns in DT\n"); - rc = -1; - goto out; - } + int i; /* * Set pnv_first_deep_stop_state, pnv_deepest_stop_psscr_{val,mask}, @@ -679,33 
+638,37 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags, * the shallowest (OPAL_PM_STOP_INST_FAST) loss-less stop state. */ pnv_first_deep_stop_state = MAX_STOP_STATE; - for (i = 0; i < dt_idle_states; i++) { + for (i = 0; i < nr_pnv_idle_states; i++) { int err; - u64 psscr_rl = psscr_val[i] & PSSCR_RL_MASK; + struct pnv_idle_states_t *state = &pnv_idle_states[i]; + u64 psscr_rl = state->psscr_val & PSSCR_RL_MASK; - if ((flags[i] & OPAL_PM_LOSE_FULL_CONTEXT) && - (pnv_first_deep_stop_state > psscr_rl)) + if ((state->flags & OPAL_PM_LOSE_FULL_CONTEXT) && + pnv_first_deep_stop_state > psscr_rl) pnv_first_deep_stop_state = psscr_rl; - err = validate_psscr_val_mask(&psscr_val[i], &psscr_mask[i], - flags[i]); + err = validate_psscr_val_mask(&state->psscr_val, + &state->psscr_mask, + state->flags); if (err) { - report_invalid_psscr_val(psscr_val[i], err); + report_invalid_psscr_val(state->psscr_val, err); continue; } - if (max_residency_ns < residency_ns[i]) { - max_residency_ns = residency_ns[i]; - pnv_deepest_stop_psscr_val = psscr_val[i]; - pnv_deepest_stop_psscr_mask = psscr_mask[i]; - pnv_deepest_stop_flag = flags[i]; + state->valid = true; + + if (max_residency_ns < state->residency_ns) { + max_residency_ns = state->residency_ns; + pnv_deepest_stop_psscr_val = state->psscr_val; + pnv_deepest_stop_psscr_mask = state->psscr_mask; + pnv_deepest_stop_flag = state->flags; deepest_stop_found = true; } if (!default_stop_found && - (flags[i] & OPAL_PM_STOP_INST_FAST)) { - pnv_default_stop_val = psscr_val[i]; - pnv_default_stop_mask = psscr_mask[i]; + (state->flags & OPAL_PM_STOP_INST_FAST)) { + pnv_default_stop_val = state->psscr_val; + pnv_default_stop_mask = state->psscr_mask; default_stop_found = true; } } @@ -728,11 +691,8 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags, pr_info("cpuidle-powernv: Requested Level (RL) value of first deep stop = 0x%llx\n", pnv_first_deep_stop_state); -out: - kfree(psscr_val); - 
kfree(psscr_mask); - kfree(residency_ns); - return rc; + + return 0; } /* @@ -740,50 +700,146 @@ out: */ static void __init pnv_probe_idle_states(void) { - struct device_node *np; - int dt_idle_states; - u32 *flags = NULL; int i; + if (nr_pnv_idle_states < 0) { + pr_warn("cpuidle-powernv: no idle states found in the DT\n"); + return; + } + + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (pnv_power9_idle_init()) + return; + } + + for (i = 0; i < nr_pnv_idle_states; i++) + supported_cpuidle_states |= pnv_idle_states[i].flags; +} + +/* + * This function parses device-tree and populates all the information + * into pnv_idle_states structure. It also sets up nr_pnv_idle_states + * which is the number of cpuidle states discovered through device-tree. + */ + +static int pnv_parse_cpuidle_dt(void) +{ + struct device_node *np; + int nr_idle_states, i; + int rc = 0; + u32 *temp_u32; + u64 *temp_u64; + const char **temp_string; + np = of_find_node_by_path("/ibm,opal/power-mgt"); if (!np) { pr_warn("opal: PowerMgmt Node not found\n"); - goto out; + return -ENODEV; } - dt_idle_states = of_property_count_u32_elems(np, - "ibm,cpu-idle-state-flags"); - if (dt_idle_states < 0) { - pr_warn("cpuidle-powernv: no idle states found in the DT\n"); + nr_idle_states = of_property_count_u32_elems(np, + "ibm,cpu-idle-state-flags"); + + pnv_idle_states = kcalloc(nr_idle_states, sizeof(*pnv_idle_states), + GFP_KERNEL); + temp_u32 = kcalloc(nr_idle_states, sizeof(u32), GFP_KERNEL); + temp_u64 = kcalloc(nr_idle_states, sizeof(u64), GFP_KERNEL); + temp_string = kcalloc(nr_idle_states, sizeof(char *), GFP_KERNEL); + + if (!(pnv_idle_states && temp_u32 && temp_u64 && temp_string)) { + pr_err("Could not allocate memory for dt parsing\n"); + rc = -ENOMEM; goto out; } - flags = kcalloc(dt_idle_states, sizeof(*flags), GFP_KERNEL); - - if (of_property_read_u32_array(np, - "ibm,cpu-idle-state-flags", flags, dt_idle_states)) { + /* Read flags */ + if (of_property_read_u32_array(np, 
"ibm,cpu-idle-state-flags", + temp_u32, nr_idle_states)) { pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-flags in DT\n"); + rc = -EINVAL; + goto out; + } + for (i = 0; i < nr_idle_states; i++) + pnv_idle_states[i].flags = temp_u32[i]; + + /* Read latencies */ + if (of_property_read_u32_array(np, "ibm,cpu-idle-state-latencies-ns", + temp_u32, nr_idle_states)) { + pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-latencies-ns in DT\n"); + rc = -EINVAL; + goto out; + } + for (i = 0; i < nr_idle_states; i++) + pnv_idle_states[i].latency_ns = temp_u32[i]; + + /* Read residencies */ + if (of_property_read_u32_array(np, "ibm,cpu-idle-state-residency-ns", + temp_u32, nr_idle_states)) { + pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-residency-ns in DT\n"); + rc = -EINVAL; goto out; } + for (i = 0; i < nr_idle_states; i++) + pnv_idle_states[i].residency_ns = temp_u32[i]; + /* For power9 */ if (cpu_has_feature(CPU_FTR_ARCH_300)) { - if (pnv_power9_idle_init(np, flags, dt_idle_states)) + /* Read psscr_val */ + if (of_property_read_u64_array(np, "ibm,cpu-idle-state-psscr", + temp_u64, nr_idle_states)) { + pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr in DT\n"); + rc = -EINVAL; + goto out; + } + for (i = 0; i < nr_idle_states; i++) + pnv_idle_states[i].psscr_val = temp_u64[i]; + + /* Read psscr_mask */ + if (of_property_read_u64_array(np, "ibm,cpu-idle-state-psscr-mask", + temp_u64, nr_idle_states)) { + pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr-mask in DT\n"); + rc = -EINVAL; goto out; + } + for (i = 0; i < nr_idle_states; i++) + pnv_idle_states[i].psscr_mask = temp_u64[i]; } - for (i = 0; i < dt_idle_states; i++) - supported_cpuidle_states |= flags[i]; + /* + * power8 specific properties ibm,cpu-idle-state-pmicr-mask and + * ibm,cpu-idle-state-pmicr-val were never used and there is no + * plan to use it in near future. 
Hence, not parsing these properties + */ + if (of_property_read_string_array(np, "ibm,cpu-idle-state-names", + temp_string, nr_idle_states) < 0) { + pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-names in DT\n"); + rc = -EINVAL; + goto out; + } + for (i = 0; i < nr_idle_states; i++) + strlcpy(pnv_idle_states[i].name, temp_string[i], + PNV_IDLE_NAME_LEN); + nr_pnv_idle_states = nr_idle_states; + rc = 0; out: - kfree(flags); + kfree(temp_u32); + kfree(temp_u64); + kfree(temp_string); + return rc; } + static int __init pnv_init_idle_states(void) { - + int rc = 0; supported_cpuidle_states = 0; + /* In case we error out nr_pnv_idle_states will be zero */ + nr_pnv_idle_states = 0; if (cpuidle_disable != IDLE_NO_OVERRIDE) goto out; - + rc = pnv_parse_cpuidle_dt(); + if (rc) + return rc; pnv_probe_idle_states(); if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { @@ -805,29 +861,6 @@ static int __init pnv_init_idle_states(void) pnv_alloc_idle_core_states(); - /* - * For each CPU, record its PACA address in each of it's - * sibling thread's PACA at the slot corresponding to this - * CPU's index in the core. 
- */ - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { - int cpu; - - pr_info("powernv: idle: Saving PACA pointers of all CPUs in their thread sibling PACA\n"); - for_each_present_cpu(cpu) { - int base_cpu = cpu_first_thread_sibling(cpu); - int idx = cpu_thread_in_core(cpu); - int i; - - for (i = 0; i < threads_per_core; i++) { - int j = base_cpu + i; - - paca_ptrs[j]->thread_sibling_pacas[idx] = - paca_ptrs[cpu]; - } - } - } - if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED) ppc_md.power_save = power7_idle; diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index b99283df8584..51dc398ae3f7 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -47,38 +47,9 @@ static ssize_t memtrace_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, count, ppos, ent->mem, ent->size); } -static bool valid_memtrace_range(struct memtrace_entry *dev, - unsigned long start, unsigned long size) -{ - if ((start >= dev->start) && - ((start + size) <= (dev->start + dev->size))) - return true; - - return false; -} - -static int memtrace_mmap(struct file *filp, struct vm_area_struct *vma) -{ - unsigned long size = vma->vm_end - vma->vm_start; - struct memtrace_entry *dev = filp->private_data; - - if (!valid_memtrace_range(dev, vma->vm_pgoff << PAGE_SHIFT, size)) - return -EINVAL; - - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - - if (remap_pfn_range(vma, vma->vm_start, - vma->vm_pgoff + (dev->start >> PAGE_SHIFT), - size, vma->vm_page_prot)) - return -EAGAIN; - - return 0; -} - static const struct file_operations memtrace_fops = { .llseek = default_llseek, .read = memtrace_read, - .mmap = memtrace_mmap, .open = simple_open, }; @@ -206,8 +177,11 @@ static int memtrace_init_debugfs(void) snprintf(ent->name, 16, "%08x", ent->nid); dir = debugfs_create_dir(ent->name, memtrace_debugfs_dir); - if (!dir) + if (!dir) { + pr_err("Failed to create debugfs 
directory for node %d\n", + ent->nid); return -1; + } ent->dir = dir; debugfs_create_file("trace", 0400, dir, ent, &memtrace_fops); @@ -218,18 +192,93 @@ static int memtrace_init_debugfs(void) return ret; } +static int online_mem_block(struct memory_block *mem, void *arg) +{ + return device_online(&mem->dev); +} + +/* + * Iterate through the chunks of memory we have removed from the kernel + * and attempt to add them back to the kernel. + */ +static int memtrace_online(void) +{ + int i, ret = 0; + struct memtrace_entry *ent; + + for (i = memtrace_array_nr - 1; i >= 0; i--) { + ent = &memtrace_array[i]; + + /* We have onlined this chunk previously */ + if (ent->nid == -1) + continue; + + /* Remove from io mappings */ + if (ent->mem) { + iounmap(ent->mem); + ent->mem = 0; + } + + if (add_memory(ent->nid, ent->start, ent->size)) { + pr_err("Failed to add trace memory to node %d\n", + ent->nid); + ret += 1; + continue; + } + + /* + * If kernel isn't compiled with the auto online option + * we need to online the memory ourselves. + */ + if (!memhp_auto_online) { + walk_memory_range(PFN_DOWN(ent->start), + PFN_UP(ent->start + ent->size - 1), + NULL, online_mem_block); + } + + /* + * Memory was added successfully so clean up references to it + * so on reentry we can tell that this chunk was added. + */ + debugfs_remove_recursive(ent->dir); + pr_info("Added trace memory back to node %d\n", ent->nid); + ent->size = ent->start = ent->nid = -1; + } + if (ret) + return ret; + + /* If all chunks of memory were added successfully, reset globals */ + kfree(memtrace_array); + memtrace_array = NULL; + memtrace_size = 0; + memtrace_array_nr = 0; + return 0; +} + static int memtrace_enable_set(void *data, u64 val) { - if (memtrace_size) + u64 bytes; + + /* + * Don't attempt to do anything if size isn't aligned to a memory + * block or equal to zero. 
+ */ + bytes = memory_block_size_bytes(); + if (val & (bytes - 1)) { + pr_err("Value must be aligned with 0x%llx\n", bytes); return -EINVAL; + } - if (!val) - return -EINVAL; + /* Re-add/online previously removed/offlined memory */ + if (memtrace_size) { + if (memtrace_online()) + return -EAGAIN; + } - /* Make sure size is aligned to a memory block */ - if (val & (memory_block_size_bytes() - 1)) - return -EINVAL; + if (!val) + return 0; + /* Offline and remove memory */ if (memtrace_init_regions_runtime(val)) return -EINVAL; diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 8cdf91f5d3a4..8006c54a91e3 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -17,7 +17,9 @@ #include <linux/pci.h> #include <linux/memblock.h> #include <linux/iommu.h> +#include <linux/debugfs.h> +#include <asm/debugfs.h> #include <asm/tlb.h> #include <asm/powernv.h> #include <asm/reg.h> @@ -44,7 +46,8 @@ static DEFINE_SPINLOCK(npu_context_lock); * entire TLB on the GPU for the given PID rather than each specific address in * the range. */ -#define ATSD_THRESHOLD (2*1024*1024) +static uint64_t atsd_threshold = 2 * 1024 * 1024; +static struct dentry *atsd_threshold_dentry; /* * Other types of TCE cache invalidation are not functional in the @@ -437,8 +440,9 @@ static int get_mmio_atsd_reg(struct npu *npu) int i; for (i = 0; i < npu->mmio_atsd_count; i++) { - if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage)) - return i; + if (!test_bit(i, &npu->mmio_atsd_usage)) + if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage)) + return i; } return -ENOSPC; @@ -683,7 +687,7 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, struct npu_context *npu_context = mn_to_npu_context(mn); unsigned long address; - if (end - start > ATSD_THRESHOLD) { + if (end - start > atsd_threshold) { /* * Just invalidate the entire PID if the address range is too * large. 
@@ -958,6 +962,11 @@ int pnv_npu2_init(struct pnv_phb *phb) static int npu_index; uint64_t rc = 0; + if (!atsd_threshold_dentry) { + atsd_threshold_dentry = debugfs_create_x64("atsd_threshold", + 0600, powerpc_debugfs_root, &atsd_threshold); + } + phb->npu.nmmu_flush = of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush"); for_each_child_of_node(phb->hose->dn, dn) { diff --git a/arch/powerpc/platforms/powernv/opal-dump.c b/arch/powerpc/platforms/powernv/opal-dump.c index 0dc8fa4e0af2..198143833f00 100644 --- a/arch/powerpc/platforms/powernv/opal-dump.c +++ b/arch/powerpc/platforms/powernv/opal-dump.c @@ -225,13 +225,16 @@ static int64_t dump_read_info(uint32_t *dump_id, uint32_t *dump_size, uint32_t * if (rc == OPAL_PARAMETER) rc = opal_dump_info(&id, &size); + if (rc) { + pr_warn("%s: Failed to get dump info (%d)\n", + __func__, rc); + return rc; + } + *dump_id = be32_to_cpu(id); *dump_size = be32_to_cpu(size); *dump_type = be32_to_cpu(type); - if (rc) - pr_warn("%s: Failed to get dump info (%d)\n", - __func__, rc); return rc; } @@ -368,13 +371,12 @@ static irqreturn_t process_dump(int irq, void *data) { int rc; uint32_t dump_id, dump_size, dump_type; - struct dump_obj *dump; char name[22]; struct kobject *kobj; rc = dump_read_info(&dump_id, &dump_size, &dump_type); if (rc != OPAL_SUCCESS) - return rc; + return IRQ_HANDLED; sprintf(name, "0x%x-0x%x", dump_type, dump_id); @@ -386,12 +388,10 @@ static irqreturn_t process_dump(int irq, void *data) if (kobj) { /* Drop reference added by kset_find_obj() */ kobject_put(kobj); - return 0; + return IRQ_HANDLED; } - dump = create_dump_obj(dump_id, dump_size, dump_type); - if (!dump) - return -1; + create_dump_obj(dump_id, dump_size, dump_type); return IRQ_HANDLED; } diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c index 605c7e5d52c2..bc97770a67db 100644 --- a/arch/powerpc/platforms/powernv/opal-irqchip.c +++ b/arch/powerpc/platforms/powernv/opal-irqchip.c @@ -22,6 
+22,7 @@ #include <linux/kthread.h> #include <linux/delay.h> #include <linux/slab.h> +#include <linux/of_irq.h> #include <asm/machdep.h> #include <asm/opal.h> @@ -38,8 +39,8 @@ struct opal_event_irqchip { }; static struct opal_event_irqchip opal_event_irqchip; static u64 last_outstanding_events; -static unsigned int opal_irq_count; -static unsigned int *opal_irqs; +static int opal_irq_count; +static struct resource *opal_irqs; void opal_handle_events(void) { @@ -165,24 +166,23 @@ void opal_event_shutdown(void) /* First free interrupts, which will also mask them */ for (i = 0; i < opal_irq_count; i++) { - if (!opal_irqs[i]) + if (!opal_irqs || !opal_irqs[i].start) continue; if (in_interrupt() || irqs_disabled()) - disable_irq_nosync(opal_irqs[i]); + disable_irq_nosync(opal_irqs[i].start); else - free_irq(opal_irqs[i], NULL); + free_irq(opal_irqs[i].start, NULL); - opal_irqs[i] = 0; + opal_irqs[i].start = 0; } } int __init opal_event_init(void) { struct device_node *dn, *opal_node; - const char **names; - u32 *irqs; - int i, rc; + bool old_style = false; + int i, rc = 0; opal_node = of_find_node_by_path("/ibm,opal"); if (!opal_node) { @@ -207,67 +207,91 @@ int __init opal_event_init(void) goto out; } - /* Get opal-interrupts property and names if present */ - rc = of_property_count_u32_elems(opal_node, "opal-interrupts"); - if (rc < 0) - goto out; + /* Look for new-style (standard) "interrupts" property */ + opal_irq_count = of_irq_count(opal_node); - opal_irq_count = rc; - pr_debug("Found %d interrupts reserved for OPAL\n", opal_irq_count); + /* Absent ? 
Look for the old one */ + if (opal_irq_count < 1) { + /* Get opal-interrupts property and names if present */ + rc = of_property_count_u32_elems(opal_node, "opal-interrupts"); + if (rc > 0) + opal_irq_count = rc; + old_style = true; + } - irqs = kcalloc(opal_irq_count, sizeof(*irqs), GFP_KERNEL); - names = kcalloc(opal_irq_count, sizeof(*names), GFP_KERNEL); - opal_irqs = kcalloc(opal_irq_count, sizeof(*opal_irqs), GFP_KERNEL); + /* No interrupts ? Bail out */ + if (!opal_irq_count) + goto out; - if (WARN_ON(!irqs || !names || !opal_irqs)) - goto out_free; + pr_debug("OPAL: Found %d interrupts reserved for OPAL using %s scheme\n", + opal_irq_count, old_style ? "old" : "new"); - rc = of_property_read_u32_array(opal_node, "opal-interrupts", - irqs, opal_irq_count); - if (rc < 0) { - pr_err("Error %d reading opal-interrupts array\n", rc); - goto out_free; + /* Allocate an IRQ resources array */ + opal_irqs = kcalloc(opal_irq_count, sizeof(struct resource), GFP_KERNEL); + if (WARN_ON(!opal_irqs)) { + rc = -ENOMEM; + goto out; } - /* It's not an error for the names to be missing */ - of_property_read_string_array(opal_node, "opal-interrupts-names", - names, opal_irq_count); + /* Build the resources array */ + if (old_style) { + /* Old style "opal-interrupts" property */ + for (i = 0; i < opal_irq_count; i++) { + struct resource *r = &opal_irqs[i]; + const char *name = NULL; + u32 hw_irq; + int virq; + + rc = of_property_read_u32_index(opal_node, "opal-interrupts", + i, &hw_irq); + if (WARN_ON(rc < 0)) { + opal_irq_count = i; + break; + } + of_property_read_string_index(opal_node, "opal-interrupts-names", + i, &name); + virq = irq_create_mapping(NULL, hw_irq); + if (!virq) { + pr_warn("Failed to map OPAL irq 0x%x\n", hw_irq); + continue; + } + r->start = r->end = virq; + r->flags = IORESOURCE_IRQ | IRQ_TYPE_LEVEL_LOW; + r->name = name; + } + } else { + /* new style standard "interrupts" property */ + rc = of_irq_to_resource_table(opal_node, opal_irqs, opal_irq_count); + 
if (WARN_ON(rc < 0)) { + opal_irq_count = 0; + kfree(opal_irqs); + goto out; + } + if (WARN_ON(rc < opal_irq_count)) + opal_irq_count = rc; + } /* Install interrupt handlers */ for (i = 0; i < opal_irq_count; i++) { - unsigned int virq; - char *name; - - /* Get hardware and virtual IRQ */ - virq = irq_create_mapping(NULL, irqs[i]); - if (!virq) { - pr_warn("Failed to map irq 0x%x\n", irqs[i]); - continue; - } + struct resource *r = &opal_irqs[i]; + const char *name; - if (names[i] && strlen(names[i])) - name = kasprintf(GFP_KERNEL, "opal-%s", names[i]); + /* Prefix name */ + if (r->name && strlen(r->name)) + name = kasprintf(GFP_KERNEL, "opal-%s", r->name); else name = kasprintf(GFP_KERNEL, "opal"); /* Install interrupt handler */ - rc = request_irq(virq, opal_interrupt, IRQF_TRIGGER_LOW, + rc = request_irq(r->start, opal_interrupt, r->flags & IRQD_TRIGGER_MASK, name, NULL); if (rc) { - irq_dispose_mapping(virq); - pr_warn("Error %d requesting irq %d (0x%x)\n", - rc, virq, irqs[i]); + pr_warn("Error %d requesting OPAL irq %d\n", rc, (int)r->start); continue; } - - /* Cache IRQ */ - opal_irqs[i] = virq; } - -out_free: - kfree(irqs); - kfree(names); -out: + rc = 0; + out: of_node_put(opal_node); return rc; } diff --git a/arch/powerpc/platforms/powernv/opal-kmsg.c b/arch/powerpc/platforms/powernv/opal-kmsg.c index 6f1214d4de92..55691950d981 100644 --- a/arch/powerpc/platforms/powernv/opal-kmsg.c +++ b/arch/powerpc/platforms/powernv/opal-kmsg.c @@ -23,12 +23,9 @@ * may not be completely printed. This function does not actually dump the * message, it just ensures that OPAL completely flushes the console buffer. */ -static void force_opal_console_flush(struct kmsg_dumper *dumper, +static void kmsg_dump_opal_console_flush(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason) { - int i; - int64_t ret; - /* * Outside of a panic context the pollers will continue to run, * so we don't need to do any special flushing. 
@@ -36,32 +33,11 @@ static void force_opal_console_flush(struct kmsg_dumper *dumper, if (reason != KMSG_DUMP_PANIC) return; - if (opal_check_token(OPAL_CONSOLE_FLUSH)) { - ret = opal_console_flush(0); - - if (ret == OPAL_UNSUPPORTED || ret == OPAL_PARAMETER) - return; - - /* Incrementally flush until there's nothing left */ - while (opal_console_flush(0) != OPAL_SUCCESS); - } else { - /* - * If OPAL_CONSOLE_FLUSH is not implemented in the firmware, - * the console can still be flushed by calling the polling - * function enough times to flush the buffer. We don't know - * how much output still needs to be flushed, but we can be - * generous since the kernel is in panic and doesn't need - * to do much else. - */ - printk(KERN_NOTICE "opal: OPAL_CONSOLE_FLUSH missing.\n"); - for (i = 0; i < 1024; i++) { - opal_poll_events(NULL); - } - } + opal_flush_console(0); } static struct kmsg_dumper opal_kmsg_dumper = { - .dump = force_opal_console_flush + .dump = kmsg_dump_opal_console_flush }; void __init opal_kmsg_init(void) diff --git a/arch/powerpc/platforms/powernv/opal-sensor-groups.c b/arch/powerpc/platforms/powernv/opal-sensor-groups.c index 541c9ea04a32..f7d04b6a2d7a 100644 --- a/arch/powerpc/platforms/powernv/opal-sensor-groups.c +++ b/arch/powerpc/platforms/powernv/opal-sensor-groups.c @@ -32,6 +32,34 @@ static struct sensor_group { struct sg_attr *sgattrs; } *sgs; +int sensor_group_enable(u32 handle, bool enable) +{ + struct opal_msg msg; + int token, ret; + + token = opal_async_get_token_interruptible(); + if (token < 0) + return token; + + ret = opal_sensor_group_enable(handle, token, enable); + if (ret == OPAL_ASYNC_COMPLETION) { + ret = opal_async_wait_response(token, &msg); + if (ret) { + pr_devel("Failed to wait for the async response\n"); + ret = -EIO; + goto out; + } + ret = opal_error_code(opal_get_async_rc(msg)); + } else { + ret = opal_error_code(ret); + } + +out: + opal_async_release_token(token); + return ret; +} +EXPORT_SYMBOL_GPL(sensor_group_enable); 
+ static ssize_t sg_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index a8d9b4089c31..251528231a9e 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -14,6 +14,8 @@ #include <asm/hvcall.h> #include <asm/asm-offsets.h> #include <asm/opal.h> +#include <asm/asm-compat.h> +#include <asm/feature-fixups.h> .section ".text" @@ -327,3 +329,5 @@ OPAL_CALL(opal_npu_tl_set, OPAL_NPU_TL_SET); OPAL_CALL(opal_pci_get_pbcq_tunnel_bar, OPAL_PCI_GET_PBCQ_TUNNEL_BAR); OPAL_CALL(opal_pci_set_pbcq_tunnel_bar, OPAL_PCI_SET_PBCQ_TUNNEL_BAR); OPAL_CALL(opal_sensor_read_u64, OPAL_SENSOR_READ_U64); +OPAL_CALL(opal_sensor_group_enable, OPAL_SENSOR_GROUP_ENABLE); +OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 0d539c661748..404c379db168 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -344,70 +344,125 @@ int opal_get_chars(uint32_t vtermno, char *buf, int count) return 0; } -int opal_put_chars(uint32_t vtermno, const char *data, int total_len) +static int __opal_put_chars(uint32_t vtermno, const char *data, int total_len, bool atomic) { - int written = 0; + unsigned long flags = 0 /* shut up gcc */; + int written; __be64 olen; - s64 len, rc; - unsigned long flags; - __be64 evt; + s64 rc; if (!opal.entry) return -ENODEV; - /* We want put_chars to be atomic to avoid mangling of hvsi - * packets. To do that, we first test for room and return - * -EAGAIN if there isn't enough. 
- * - * Unfortunately, opal_console_write_buffer_space() doesn't - * appear to work on opal v1, so we just assume there is - * enough room and be done with it - */ - spin_lock_irqsave(&opal_write_lock, flags); + if (atomic) + spin_lock_irqsave(&opal_write_lock, flags); rc = opal_console_write_buffer_space(vtermno, &olen); - len = be64_to_cpu(olen); - if (rc || len < total_len) { - spin_unlock_irqrestore(&opal_write_lock, flags); + if (rc || be64_to_cpu(olen) < total_len) { /* Closed -> drop characters */ if (rc) - return total_len; - opal_poll_events(NULL); - return -EAGAIN; + written = total_len; + else + written = -EAGAIN; + goto out; } - /* We still try to handle partial completions, though they - * should no longer happen. - */ - rc = OPAL_BUSY; - while(total_len > 0 && (rc == OPAL_BUSY || - rc == OPAL_BUSY_EVENT || rc == OPAL_SUCCESS)) { - olen = cpu_to_be64(total_len); - rc = opal_console_write(vtermno, &olen, data); - len = be64_to_cpu(olen); - - /* Closed or other error drop */ - if (rc != OPAL_SUCCESS && rc != OPAL_BUSY && - rc != OPAL_BUSY_EVENT) { - written = total_len; - break; + /* Should not get a partial write here because space is available. 
*/ + olen = cpu_to_be64(total_len); + rc = opal_console_write(vtermno, &olen, data); + if (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + if (rc == OPAL_BUSY_EVENT) { + mdelay(OPAL_BUSY_DELAY_MS); + opal_poll_events(NULL); + } else if (rc == OPAL_BUSY) { + mdelay(OPAL_BUSY_DELAY_MS); } - if (rc == OPAL_SUCCESS) { - total_len -= len; - data += len; - written += len; + written = -EAGAIN; + goto out; + } + + /* Closed or other error drop */ + if (rc != OPAL_SUCCESS) { + written = opal_error_code(rc); + goto out; + } + + written = be64_to_cpu(olen); + if (written < total_len) { + if (atomic) { + /* Should not happen */ + pr_warn("atomic console write returned partial " + "len=%d written=%d\n", total_len, written); } - /* This is a bit nasty but we need that for the console to - * flush when there aren't any interrupts. We will clean - * things a bit later to limit that to synchronous path - * such as the kernel console and xmon/udbg + if (!written) + written = -EAGAIN; + } + +out: + if (atomic) + spin_unlock_irqrestore(&opal_write_lock, flags); + + /* In the -EAGAIN case, callers loop, so we have to flush the console + * here in case they have interrupts off (and we don't want to wait + * for async flushing if we can make immediate progress here). If + * necessary the API could be made entirely non-flushing if the + * callers had a ->flush API to use. + */ + if (written == -EAGAIN) + opal_flush_console(vtermno); + + return written; +} + +int opal_put_chars(uint32_t vtermno, const char *data, int total_len) +{ + return __opal_put_chars(vtermno, data, total_len, false); +} + +/* + * opal_put_chars_atomic will not perform partial-writes. Data will be + * atomically written to the terminal or not at all. This is not strictly + * true at the moment because console space can race with OPAL's console + * writes. 
+ */ +int opal_put_chars_atomic(uint32_t vtermno, const char *data, int total_len) +{ + return __opal_put_chars(vtermno, data, total_len, true); +} + +int opal_flush_console(uint32_t vtermno) +{ + s64 rc; + + if (!opal_check_token(OPAL_CONSOLE_FLUSH)) { + __be64 evt; + + WARN_ONCE(1, "opal: OPAL_CONSOLE_FLUSH missing.\n"); + /* + * If OPAL_CONSOLE_FLUSH is not implemented in the firmware, + * the console can still be flushed by calling the polling + * function while it has OPAL_EVENT_CONSOLE_OUTPUT events. */ - do + do { opal_poll_events(&evt); - while(rc == OPAL_SUCCESS && - (be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_OUTPUT)); + } while (be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_OUTPUT); + + return OPAL_SUCCESS; } - spin_unlock_irqrestore(&opal_write_lock, flags); - return written; + + do { + rc = OPAL_BUSY; + while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { + rc = opal_console_flush(vtermno); + if (rc == OPAL_BUSY_EVENT) { + mdelay(OPAL_BUSY_DELAY_MS); + opal_poll_events(NULL); + } else if (rc == OPAL_BUSY) { + mdelay(OPAL_BUSY_DELAY_MS); + } + } + } while (rc == OPAL_PARTIAL); /* More to flush */ + + return opal_error_code(rc); } static int opal_recover_mce(struct pt_regs *regs, @@ -922,6 +977,7 @@ EXPORT_SYMBOL_GPL(opal_flash_read); EXPORT_SYMBOL_GPL(opal_flash_write); EXPORT_SYMBOL_GPL(opal_flash_erase); EXPORT_SYMBOL_GPL(opal_prd_msg); +EXPORT_SYMBOL_GPL(opal_check_token); /* Convert a region of vmalloc memory to an opal sg list */ struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr, @@ -1034,3 +1090,5 @@ EXPORT_SYMBOL_GPL(opal_write_oppanel_async); EXPORT_SYMBOL_GPL(opal_int_set_mfrr); EXPORT_SYMBOL_GPL(opal_int_eoi); EXPORT_SYMBOL_GPL(opal_error_code); +/* Export the below symbol for NX compression */ +EXPORT_SYMBOL(opal_nx_coproc_init); diff --git a/arch/powerpc/platforms/powernv/pci-cxl.c b/arch/powerpc/platforms/powernv/pci-cxl.c index cee003de63af..1b18111453d7 100644 --- a/arch/powerpc/platforms/powernv/pci-cxl.c +++ 
b/arch/powerpc/platforms/powernv/pci-cxl.c @@ -8,11 +8,8 @@ */ #include <linux/module.h> -#include <linux/msi.h> -#include <asm/pci-bridge.h> #include <asm/pnv-pci.h> #include <asm/opal.h> -#include <misc/cxl.h> #include "pci.h" @@ -179,199 +176,3 @@ static inline int get_cxl_module(void) #else static inline int get_cxl_module(void) { return 0; } #endif - -/* - * Sets flags and switches the controller ops to enable the cxl kernel api. - * Originally the cxl kernel API operated on a virtual PHB, but certain cards - * such as the Mellanox CX4 use a peer model instead and for these cards the - * cxl kernel api will operate on the real PHB. - */ -int pnv_cxl_enable_phb_kernel_api(struct pci_controller *hose, bool enable) -{ - struct pnv_phb *phb = hose->private_data; - int rc; - - if (!enable) { - /* - * Once cxl mode is enabled on the PHB, there is currently no - * known safe method to disable it again, and trying risks a - * checkstop. If we can find a way to safely disable cxl mode - * in the future we can revisit this, but for now the only sane - * thing to do is to refuse to disable cxl mode: - */ - return -EPERM; - } - - /* - * Hold a reference to the cxl module since several PHB operations now - * depend on it, and it would be insane to allow it to be removed so - * long as we are in this mode (and since we can't safely disable this - * mode once enabled...). 
- */ - rc = get_cxl_module(); - if (rc) - return rc; - - phb->flags |= PNV_PHB_FLAG_CXL; - hose->controller_ops = pnv_cxl_cx4_ioda_controller_ops; - - return 0; -} -EXPORT_SYMBOL_GPL(pnv_cxl_enable_phb_kernel_api); - -bool pnv_pci_on_cxl_phb(struct pci_dev *dev) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; - - return !!(phb->flags & PNV_PHB_FLAG_CXL); -} -EXPORT_SYMBOL_GPL(pnv_pci_on_cxl_phb); - -struct cxl_afu *pnv_cxl_phb_to_afu(struct pci_controller *hose) -{ - struct pnv_phb *phb = hose->private_data; - - return (struct cxl_afu *)phb->cxl_afu; -} -EXPORT_SYMBOL_GPL(pnv_cxl_phb_to_afu); - -void pnv_cxl_phb_set_peer_afu(struct pci_dev *dev, struct cxl_afu *afu) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; - - phb->cxl_afu = afu; -} -EXPORT_SYMBOL_GPL(pnv_cxl_phb_set_peer_afu); - -/* - * In the peer cxl model, the XSL/PSL is physical function 0, and will be used - * by other functions on the device for memory access and interrupts. When the - * other functions are enabled we explicitly take a reference on the cxl - * function since they will use it, and allocate a default context associated - * with that function just like the vPHB model of the cxl kernel API. 
- */ -bool pnv_cxl_enable_device_hook(struct pci_dev *dev) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; - struct cxl_afu *afu = phb->cxl_afu; - - if (!pnv_pci_enable_device_hook(dev)) - return false; - - - /* No special handling for the cxl function, which is always PF 0 */ - if (PCI_FUNC(dev->devfn) == 0) - return true; - - if (!afu) { - dev_WARN(&dev->dev, "Attempted to enable function > 0 on CXL PHB without a peer AFU\n"); - return false; - } - - dev_info(&dev->dev, "Enabling function on CXL enabled PHB with peer AFU\n"); - - /* Make sure the peer AFU can't go away while this device is active */ - cxl_afu_get(afu); - - return cxl_pci_associate_default_context(dev, afu); -} - -void pnv_cxl_disable_device(struct pci_dev *dev) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; - struct cxl_afu *afu = phb->cxl_afu; - - /* No special handling for cxl function: */ - if (PCI_FUNC(dev->devfn) == 0) - return; - - cxl_pci_disable_device(dev); - cxl_afu_put(afu); -} - -/* - * This is a special version of pnv_setup_msi_irqs for cards in cxl mode. This - * function handles setting up the IVTE entries for the XSL to use. - * - * We are currently not filling out the MSIX table, since the only currently - * supported adapter (CX4) uses a custom MSIX table format in cxl mode and it - * is up to their driver to fill that out. In the future we may fill out the - * MSIX table (and change the IVTE entries to be an index to the MSIX table) - * for adapters implementing the Full MSI-X mode described in the CAIA. 
- */ -int pnv_cxl_cx4_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) -{ - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; - struct msi_desc *entry; - struct cxl_context *ctx = NULL; - unsigned int virq; - int hwirq; - int afu_irq = 0; - int rc; - - if (WARN_ON(!phb) || !phb->msi_bmp.bitmap) - return -ENODEV; - - if (pdev->no_64bit_msi && !phb->msi32_support) - return -ENODEV; - - rc = cxl_cx4_setup_msi_irqs(pdev, nvec, type); - if (rc) - return rc; - - for_each_pci_msi_entry(entry, pdev) { - if (!entry->msi_attrib.is_64 && !phb->msi32_support) { - pr_warn("%s: Supports only 64-bit MSIs\n", - pci_name(pdev)); - return -ENXIO; - } - - hwirq = cxl_next_msi_hwirq(pdev, &ctx, &afu_irq); - if (WARN_ON(hwirq <= 0)) - return (hwirq ? hwirq : -ENOMEM); - - virq = irq_create_mapping(NULL, hwirq); - if (!virq) { - pr_warn("%s: Failed to map cxl mode MSI to linux irq\n", - pci_name(pdev)); - return -ENOMEM; - } - - rc = pnv_cxl_ioda_msi_setup(pdev, hwirq, virq); - if (rc) { - pr_warn("%s: Failed to setup cxl mode MSI\n", pci_name(pdev)); - irq_dispose_mapping(virq); - return rc; - } - - irq_set_msi_desc(virq, entry); - } - - return 0; -} - -void pnv_cxl_cx4_teardown_msi_irqs(struct pci_dev *pdev) -{ - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; - struct msi_desc *entry; - irq_hw_number_t hwirq; - - if (WARN_ON(!phb)) - return; - - for_each_pci_msi_entry(entry, pdev) { - if (!entry->irq) - continue; - hwirq = virq_to_hw(entry->irq); - irq_set_msi_desc(entry->irq, NULL); - irq_dispose_mapping(entry->irq); - } - - cxl_cx4_teardown_msi_irqs(pdev); -} diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c new file mode 100644 index 000000000000..6c5db1acbe8d --- /dev/null +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -0,0 +1,399 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * TCE helpers for IODA 
PCI/PCIe on PowerNV platforms + * + * Copyright 2018 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/iommu.h> + +#include <asm/iommu.h> +#include <asm/tce.h> +#include "pci.h" + +void pnv_pci_setup_iommu_table(struct iommu_table *tbl, + void *tce_mem, u64 tce_size, + u64 dma_offset, unsigned int page_shift) +{ + tbl->it_blocksize = 16; + tbl->it_base = (unsigned long)tce_mem; + tbl->it_page_shift = page_shift; + tbl->it_offset = dma_offset >> tbl->it_page_shift; + tbl->it_index = 0; + tbl->it_size = tce_size >> 3; + tbl->it_busno = 0; + tbl->it_type = TCE_PCI; +} + +static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift) +{ + struct page *tce_mem = NULL; + __be64 *addr; + + tce_mem = alloc_pages_node(nid, GFP_KERNEL, shift - PAGE_SHIFT); + if (!tce_mem) { + pr_err("Failed to allocate a TCE memory, level shift=%d\n", + shift); + return NULL; + } + addr = page_address(tce_mem); + memset(addr, 0, 1UL << shift); + + return addr; +} + +static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc) +{ + __be64 *tmp = user ? 
tbl->it_userspace : (__be64 *) tbl->it_base; + int level = tbl->it_indirect_levels; + const long shift = ilog2(tbl->it_level_size); + unsigned long mask = (tbl->it_level_size - 1) << (level * shift); + + while (level) { + int n = (idx & mask) >> (level * shift); + unsigned long tce; + + if (tmp[n] == 0) { + __be64 *tmp2; + + if (!alloc) + return NULL; + + tmp2 = pnv_alloc_tce_level(tbl->it_nid, + ilog2(tbl->it_level_size) + 3); + if (!tmp2) + return NULL; + + tmp[n] = cpu_to_be64(__pa(tmp2) | + TCE_PCI_READ | TCE_PCI_WRITE); + } + tce = be64_to_cpu(tmp[n]); + + tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE)); + idx &= ~mask; + mask >>= shift; + --level; + } + + return tmp + idx; +} + +int pnv_tce_build(struct iommu_table *tbl, long index, long npages, + unsigned long uaddr, enum dma_data_direction direction, + unsigned long attrs) +{ + u64 proto_tce = iommu_direction_to_tce_perm(direction); + u64 rpn = __pa(uaddr) >> tbl->it_page_shift; + long i; + + if (proto_tce & TCE_PCI_WRITE) + proto_tce |= TCE_PCI_READ; + + for (i = 0; i < npages; i++) { + unsigned long newtce = proto_tce | + ((rpn + i) << tbl->it_page_shift); + unsigned long idx = index - tbl->it_offset + i; + + *(pnv_tce(tbl, false, idx, true)) = cpu_to_be64(newtce); + } + + return 0; +} + +#ifdef CONFIG_IOMMU_API +int pnv_tce_xchg(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction, + bool alloc) +{ + u64 proto_tce = iommu_direction_to_tce_perm(*direction); + unsigned long newtce = *hpa | proto_tce, oldtce; + unsigned long idx = index - tbl->it_offset; + __be64 *ptce = NULL; + + BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl)); + + if (*direction == DMA_NONE) { + ptce = pnv_tce(tbl, false, idx, false); + if (!ptce) { + *hpa = 0; + return 0; + } + } + + if (!ptce) { + ptce = pnv_tce(tbl, false, idx, alloc); + if (!ptce) + return alloc ? 
H_HARDWARE : H_TOO_HARD; + } + + if (newtce & TCE_PCI_WRITE) + newtce |= TCE_PCI_READ; + + oldtce = be64_to_cpu(xchg(ptce, cpu_to_be64(newtce))); + *hpa = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE); + *direction = iommu_tce_direction(oldtce); + + return 0; +} + +__be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index, bool alloc) +{ + if (WARN_ON_ONCE(!tbl->it_userspace)) + return NULL; + + return pnv_tce(tbl, true, index - tbl->it_offset, alloc); +} +#endif + +void pnv_tce_free(struct iommu_table *tbl, long index, long npages) +{ + long i; + + for (i = 0; i < npages; i++) { + unsigned long idx = index - tbl->it_offset + i; + __be64 *ptce = pnv_tce(tbl, false, idx, false); + + if (ptce) + *ptce = cpu_to_be64(0); + } +} + +unsigned long pnv_tce_get(struct iommu_table *tbl, long index) +{ + __be64 *ptce = pnv_tce(tbl, false, index - tbl->it_offset, false); + + if (!ptce) + return 0; + + return be64_to_cpu(*ptce); +} + +static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr, + unsigned long size, unsigned int levels) +{ + const unsigned long addr_ul = (unsigned long) addr & + ~(TCE_PCI_READ | TCE_PCI_WRITE); + + if (levels) { + long i; + u64 *tmp = (u64 *) addr_ul; + + for (i = 0; i < size; ++i) { + unsigned long hpa = be64_to_cpu(tmp[i]); + + if (!(hpa & (TCE_PCI_READ | TCE_PCI_WRITE))) + continue; + + pnv_pci_ioda2_table_do_free_pages(__va(hpa), size, + levels - 1); + } + } + + free_pages(addr_ul, get_order(size << 3)); +} + +void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl) +{ + const unsigned long size = tbl->it_indirect_levels ? 
+ tbl->it_level_size : tbl->it_size; + + if (!tbl->it_size) + return; + + pnv_pci_ioda2_table_do_free_pages((__be64 *)tbl->it_base, size, + tbl->it_indirect_levels); + if (tbl->it_userspace) { + pnv_pci_ioda2_table_do_free_pages(tbl->it_userspace, size, + tbl->it_indirect_levels); + } +} + +static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned int shift, + unsigned int levels, unsigned long limit, + unsigned long *current_offset, unsigned long *total_allocated) +{ + __be64 *addr, *tmp; + unsigned long allocated = 1UL << shift; + unsigned int entries = 1UL << (shift - 3); + long i; + + addr = pnv_alloc_tce_level(nid, shift); + *total_allocated += allocated; + + --levels; + if (!levels) { + *current_offset += allocated; + return addr; + } + + for (i = 0; i < entries; ++i) { + tmp = pnv_pci_ioda2_table_do_alloc_pages(nid, shift, + levels, limit, current_offset, total_allocated); + if (!tmp) + break; + + addr[i] = cpu_to_be64(__pa(tmp) | + TCE_PCI_READ | TCE_PCI_WRITE); + + if (*current_offset >= limit) + break; + } + + return addr; +} + +long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, + __u32 page_shift, __u64 window_size, __u32 levels, + bool alloc_userspace_copy, struct iommu_table *tbl) +{ + void *addr, *uas = NULL; + unsigned long offset = 0, level_shift, total_allocated = 0; + unsigned long total_allocated_uas = 0; + const unsigned int window_shift = ilog2(window_size); + unsigned int entries_shift = window_shift - page_shift; + unsigned int table_shift = max_t(unsigned int, entries_shift + 3, + PAGE_SHIFT); + const unsigned long tce_table_size = 1UL << table_shift; + unsigned int tmplevels = levels; + + if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS)) + return -EINVAL; + + if (!is_power_of_2(window_size)) + return -EINVAL; + + if (alloc_userspace_copy && (window_size > (1ULL << 32))) + tmplevels = 1; + + /* Adjust direct table size from window_size and levels */ + entries_shift = (entries_shift + levels - 1) / levels; + 
level_shift = entries_shift + 3; + level_shift = max_t(unsigned int, level_shift, PAGE_SHIFT); + + if ((level_shift - 3) * levels + page_shift >= 60) + return -EINVAL; + + /* Allocate TCE table */ + addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, + tmplevels, tce_table_size, &offset, &total_allocated); + + /* addr==NULL means that the first level allocation failed */ + if (!addr) + return -ENOMEM; + + /* + * First level was allocated but some lower level failed as + * we did not allocate as much as we wanted, + * release partially allocated table. + */ + if (tmplevels == levels && offset < tce_table_size) + goto free_tces_exit; + + /* Allocate userspace view of the TCE table */ + if (alloc_userspace_copy) { + offset = 0; + uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, + levels, tce_table_size, &offset, + &total_allocated_uas); + if (!uas) + goto free_tces_exit; + if (tmplevels == levels && (offset < tce_table_size || + total_allocated_uas != total_allocated)) + goto free_uas_exit; + } + + /* Setup linux iommu table */ + pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset, + page_shift); + tbl->it_level_size = 1ULL << (level_shift - 3); + tbl->it_indirect_levels = levels - 1; + tbl->it_allocated_size = total_allocated; + tbl->it_userspace = uas; + tbl->it_nid = nid; + + pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d/%d\n", + window_size, tce_table_size, bus_offset, tbl->it_base, + tbl->it_userspace, tmplevels, levels); + + return 0; + +free_uas_exit: + pnv_pci_ioda2_table_do_free_pages(uas, + 1ULL << (level_shift - 3), levels - 1); +free_tces_exit: + pnv_pci_ioda2_table_do_free_pages(addr, + 1ULL << (level_shift - 3), levels - 1); + + return -ENOMEM; +} + +static void pnv_iommu_table_group_link_free(struct rcu_head *head) +{ + struct iommu_table_group_link *tgl = container_of(head, + struct iommu_table_group_link, rcu); + + kfree(tgl); +} + +void pnv_pci_unlink_table_and_group(struct iommu_table 
*tbl, + struct iommu_table_group *table_group) +{ + long i; + bool found; + struct iommu_table_group_link *tgl; + + if (!tbl || !table_group) + return; + + /* Remove link to a group from table's list of attached groups */ + found = false; + list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { + if (tgl->table_group == table_group) { + list_del_rcu(&tgl->next); + call_rcu(&tgl->rcu, pnv_iommu_table_group_link_free); + found = true; + break; + } + } + if (WARN_ON(!found)) + return; + + /* Clean a pointer to iommu_table in iommu_table_group::tables[] */ + found = false; + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + if (table_group->tables[i] == tbl) { + table_group->tables[i] = NULL; + found = true; + break; + } + } + WARN_ON(!found); +} + +long pnv_pci_link_table_and_group(int node, int num, + struct iommu_table *tbl, + struct iommu_table_group *table_group) +{ + struct iommu_table_group_link *tgl = NULL; + + if (WARN_ON(!tbl || !table_group)) + return -EINVAL; + + tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL, + node); + if (!tgl) + return -ENOMEM; + + tgl->table_group = table_group; + list_add_rcu(&tgl->next, &tbl->it_group_list); + + table_group->tables[num] = tbl; + + return 0; +} diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 5bd0eb6681bc..4e6302bf4073 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -46,17 +46,14 @@ #include "powernv.h" #include "pci.h" +#include "../../../../drivers/pci/pci.h" #define PNV_IODA1_M64_NUM 16 /* Number of M64 BARs */ #define PNV_IODA1_M64_SEGS 8 /* Segments per M64 BAR */ #define PNV_IODA1_DMA32_SEGSIZE 0x10000000 -#define POWERNV_IOMMU_DEFAULT_LEVELS 1 -#define POWERNV_IOMMU_MAX_LEVELS 5 - static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK", "NPU_OCAPI" }; -static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl); void pe_level_printk(const 
struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...) @@ -2007,7 +2004,7 @@ static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index, static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index, unsigned long *hpa, enum dma_data_direction *direction) { - long ret = pnv_tce_xchg(tbl, index, hpa, direction); + long ret = pnv_tce_xchg(tbl, index, hpa, direction, true); if (!ret) pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false); @@ -2018,7 +2015,7 @@ static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index, static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index, unsigned long *hpa, enum dma_data_direction *direction) { - long ret = pnv_tce_xchg(tbl, index, hpa, direction); + long ret = pnv_tce_xchg(tbl, index, hpa, direction, false); if (!ret) pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true); @@ -2040,6 +2037,7 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = { #ifdef CONFIG_IOMMU_API .exchange = pnv_ioda1_tce_xchg, .exchange_rm = pnv_ioda1_tce_xchg_rm, + .useraddrptr = pnv_tce_useraddrptr, #endif .clear = pnv_ioda1_tce_free, .get = pnv_tce_get, @@ -2171,7 +2169,7 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index, unsigned long *hpa, enum dma_data_direction *direction) { - long ret = pnv_tce_xchg(tbl, index, hpa, direction); + long ret = pnv_tce_xchg(tbl, index, hpa, direction, true); if (!ret) pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false); @@ -2182,7 +2180,7 @@ static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index, static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index, unsigned long *hpa, enum dma_data_direction *direction) { - long ret = pnv_tce_xchg(tbl, index, hpa, direction); + long ret = pnv_tce_xchg(tbl, index, hpa, direction, false); if (!ret) pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true); @@ -2199,20 +2197,16 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, 
pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false); } -static void pnv_ioda2_table_free(struct iommu_table *tbl) -{ - pnv_pci_ioda2_table_free_pages(tbl); -} - static struct iommu_table_ops pnv_ioda2_iommu_ops = { .set = pnv_ioda2_tce_build, #ifdef CONFIG_IOMMU_API .exchange = pnv_ioda2_tce_xchg, .exchange_rm = pnv_ioda2_tce_xchg_rm, + .useraddrptr = pnv_tce_useraddrptr, #endif .clear = pnv_ioda2_tce_free, .get = pnv_tce_get, - .free = pnv_ioda2_table_free, + .free = pnv_pci_ioda2_table_free_pages, }; static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data) @@ -2462,13 +2456,9 @@ void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) pe->tce_bypass_enabled = enable; } -static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, - __u32 page_shift, __u64 window_size, __u32 levels, - struct iommu_table *tbl); - static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group, int num, __u32 page_shift, __u64 window_size, __u32 levels, - struct iommu_table **ptbl) + bool alloc_userspace_copy, struct iommu_table **ptbl) { struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, table_group); @@ -2485,7 +2475,7 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group, ret = pnv_pci_ioda2_table_alloc_pages(nid, bus_offset, page_shift, window_size, - levels, tbl); + levels, alloc_userspace_copy, tbl); if (ret) { iommu_tce_table_put(tbl); return ret; @@ -2518,7 +2508,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, IOMMU_PAGE_SHIFT_4K, window_size, - POWERNV_IOMMU_DEFAULT_LEVELS, &tbl); + POWERNV_IOMMU_DEFAULT_LEVELS, false, &tbl); if (rc) { pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc); @@ -2605,7 +2595,16 @@ static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, tce_table_size, direct_table_size); } - return bytes; + return bytes + bytes; /* one for HW table, one for 
userspace copy */ +} + +static long pnv_pci_ioda2_create_table_userspace( + struct iommu_table_group *table_group, + int num, __u32 page_shift, __u64 window_size, __u32 levels, + struct iommu_table **ptbl) +{ + return pnv_pci_ioda2_create_table(table_group, + num, page_shift, window_size, levels, true, ptbl); } static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) @@ -2634,7 +2633,7 @@ static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) static struct iommu_table_group_ops pnv_pci_ioda2_ops = { .get_table_size = pnv_pci_ioda2_get_table_size, - .create_table = pnv_pci_ioda2_create_table, + .create_table = pnv_pci_ioda2_create_table_userspace, .set_window = pnv_pci_ioda2_set_window, .unset_window = pnv_pci_ioda2_unset_window, .take_ownership = pnv_ioda2_take_ownership, @@ -2739,7 +2738,7 @@ static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group) static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = { .get_table_size = pnv_pci_ioda2_get_table_size, - .create_table = pnv_pci_ioda2_create_table, + .create_table = pnv_pci_ioda2_create_table_userspace, .set_window = pnv_pci_ioda2_npu_set_window, .unset_window = pnv_pci_ioda2_npu_unset_window, .take_ownership = pnv_ioda2_npu_take_ownership, @@ -2773,144 +2772,6 @@ static void pnv_pci_ioda_setup_iommu_api(void) static void pnv_pci_ioda_setup_iommu_api(void) { }; #endif -static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift, - unsigned levels, unsigned long limit, - unsigned long *current_offset, unsigned long *total_allocated) -{ - struct page *tce_mem = NULL; - __be64 *addr, *tmp; - unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT; - unsigned long allocated = 1UL << (order + PAGE_SHIFT); - unsigned entries = 1UL << (shift - 3); - long i; - - tce_mem = alloc_pages_node(nid, GFP_KERNEL, order); - if (!tce_mem) { - pr_err("Failed to allocate a TCE memory, order=%d\n", order); - return NULL; - } - addr = 
page_address(tce_mem); - memset(addr, 0, allocated); - *total_allocated += allocated; - - --levels; - if (!levels) { - *current_offset += allocated; - return addr; - } - - for (i = 0; i < entries; ++i) { - tmp = pnv_pci_ioda2_table_do_alloc_pages(nid, shift, - levels, limit, current_offset, total_allocated); - if (!tmp) - break; - - addr[i] = cpu_to_be64(__pa(tmp) | - TCE_PCI_READ | TCE_PCI_WRITE); - - if (*current_offset >= limit) - break; - } - - return addr; -} - -static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr, - unsigned long size, unsigned level); - -static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, - __u32 page_shift, __u64 window_size, __u32 levels, - struct iommu_table *tbl) -{ - void *addr; - unsigned long offset = 0, level_shift, total_allocated = 0; - const unsigned window_shift = ilog2(window_size); - unsigned entries_shift = window_shift - page_shift; - unsigned table_shift = max_t(unsigned, entries_shift + 3, PAGE_SHIFT); - const unsigned long tce_table_size = 1UL << table_shift; - - if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS)) - return -EINVAL; - - if (!is_power_of_2(window_size)) - return -EINVAL; - - /* Adjust direct table size from window_size and levels */ - entries_shift = (entries_shift + levels - 1) / levels; - level_shift = entries_shift + 3; - level_shift = max_t(unsigned, level_shift, PAGE_SHIFT); - - if ((level_shift - 3) * levels + page_shift >= 60) - return -EINVAL; - - /* Allocate TCE table */ - addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, - levels, tce_table_size, &offset, &total_allocated); - - /* addr==NULL means that the first level allocation failed */ - if (!addr) - return -ENOMEM; - - /* - * First level was allocated but some lower level failed as - * we did not allocate as much as we wanted, - * release partially allocated table. 
- */ - if (offset < tce_table_size) { - pnv_pci_ioda2_table_do_free_pages(addr, - 1ULL << (level_shift - 3), levels - 1); - return -ENOMEM; - } - - /* Setup linux iommu table */ - pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset, - page_shift); - tbl->it_level_size = 1ULL << (level_shift - 3); - tbl->it_indirect_levels = levels - 1; - tbl->it_allocated_size = total_allocated; - - pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n", - window_size, tce_table_size, bus_offset); - - return 0; -} - -static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr, - unsigned long size, unsigned level) -{ - const unsigned long addr_ul = (unsigned long) addr & - ~(TCE_PCI_READ | TCE_PCI_WRITE); - - if (level) { - long i; - u64 *tmp = (u64 *) addr_ul; - - for (i = 0; i < size; ++i) { - unsigned long hpa = be64_to_cpu(tmp[i]); - - if (!(hpa & (TCE_PCI_READ | TCE_PCI_WRITE))) - continue; - - pnv_pci_ioda2_table_do_free_pages(__va(hpa), size, - level - 1); - } - } - - free_pages(addr_ul, get_order(size << 3)); -} - -static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl) -{ - const unsigned long size = tbl->it_indirect_levels ? 
- tbl->it_level_size : tbl->it_size; - - if (!tbl->it_size) - return; - - pnv_pci_ioda2_table_do_free_pages((__be64 *)tbl->it_base, size, - tbl->it_indirect_levels); -} - static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb) { struct pci_controller *hose = phb->hose; @@ -2925,7 +2786,7 @@ static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb) /* Add 16M for POWER8 by default */ if (cpu_has_feature(CPU_FTR_ARCH_207S) && !cpu_has_feature(CPU_FTR_ARCH_300)) - mask |= SZ_16M; + mask |= SZ_16M | SZ_256M; return mask; } @@ -3138,7 +2999,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) struct pci_dn *pdn; int mul, total_vfs; - if (!pdev->is_physfn || pdev->is_added) + if (!pdev->is_physfn || pci_dev_is_added(pdev)) return; pdn = pci_get_pdn(pdev); @@ -3575,7 +3436,7 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, /* Prevent enabling devices for which we couldn't properly * assign a PE */ -bool pnv_pci_enable_device_hook(struct pci_dev *dev) +static bool pnv_pci_enable_device_hook(struct pci_dev *dev) { struct pci_controller *hose = pci_bus_to_host(dev->bus); struct pnv_phb *phb = hose->private_data; @@ -3843,26 +3704,6 @@ static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = { .shutdown = pnv_pci_ioda_shutdown, }; -#ifdef CONFIG_CXL_BASE -const struct pci_controller_ops pnv_cxl_cx4_ioda_controller_ops = { - .dma_dev_setup = pnv_pci_dma_dev_setup, - .dma_bus_setup = pnv_pci_dma_bus_setup, -#ifdef CONFIG_PCI_MSI - .setup_msi_irqs = pnv_cxl_cx4_setup_msi_irqs, - .teardown_msi_irqs = pnv_cxl_cx4_teardown_msi_irqs, -#endif - .enable_device_hook = pnv_cxl_enable_device_hook, - .disable_device = pnv_cxl_disable_device, - .release_device = pnv_pci_release_device, - .window_alignment = pnv_pci_window_alignment, - .setup_bridge = pnv_pci_setup_bridge, - .reset_secondary_bus = pnv_pci_reset_secondary_bus, - .dma_set_mask = pnv_pci_ioda_dma_set_mask, - .dma_get_required_mask = 
pnv_pci_ioda_dma_get_required_mask, - .shutdown = pnv_pci_ioda_shutdown, -}; -#endif - static void __init pnv_pci_init_ioda_phb(struct device_node *np, u64 hub_id, int ioda_type) { diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index b265ecc0836a..13aef2323bbc 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -802,85 +802,6 @@ struct pci_ops pnv_pci_ops = { .write = pnv_pci_write_config, }; -static __be64 *pnv_tce(struct iommu_table *tbl, long idx) -{ - __be64 *tmp = ((__be64 *)tbl->it_base); - int level = tbl->it_indirect_levels; - const long shift = ilog2(tbl->it_level_size); - unsigned long mask = (tbl->it_level_size - 1) << (level * shift); - - while (level) { - int n = (idx & mask) >> (level * shift); - unsigned long tce = be64_to_cpu(tmp[n]); - - tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE)); - idx &= ~mask; - mask >>= shift; - --level; - } - - return tmp + idx; -} - -int pnv_tce_build(struct iommu_table *tbl, long index, long npages, - unsigned long uaddr, enum dma_data_direction direction, - unsigned long attrs) -{ - u64 proto_tce = iommu_direction_to_tce_perm(direction); - u64 rpn = __pa(uaddr) >> tbl->it_page_shift; - long i; - - if (proto_tce & TCE_PCI_WRITE) - proto_tce |= TCE_PCI_READ; - - for (i = 0; i < npages; i++) { - unsigned long newtce = proto_tce | - ((rpn + i) << tbl->it_page_shift); - unsigned long idx = index - tbl->it_offset + i; - - *(pnv_tce(tbl, idx)) = cpu_to_be64(newtce); - } - - return 0; -} - -#ifdef CONFIG_IOMMU_API -int pnv_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction) -{ - u64 proto_tce = iommu_direction_to_tce_perm(*direction); - unsigned long newtce = *hpa | proto_tce, oldtce; - unsigned long idx = index - tbl->it_offset; - - BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl)); - - if (newtce & TCE_PCI_WRITE) - newtce |= TCE_PCI_READ; - - oldtce = be64_to_cpu(xchg(pnv_tce(tbl, idx), 
cpu_to_be64(newtce))); - *hpa = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE); - *direction = iommu_tce_direction(oldtce); - - return 0; -} -#endif - -void pnv_tce_free(struct iommu_table *tbl, long index, long npages) -{ - long i; - - for (i = 0; i < npages; i++) { - unsigned long idx = index - tbl->it_offset + i; - - *(pnv_tce(tbl, idx)) = cpu_to_be64(0); - } -} - -unsigned long pnv_tce_get(struct iommu_table *tbl, long index) -{ - return be64_to_cpu(*(pnv_tce(tbl, index - tbl->it_offset))); -} - struct iommu_table *pnv_pci_table_alloc(int nid) { struct iommu_table *tbl; @@ -895,85 +816,6 @@ struct iommu_table *pnv_pci_table_alloc(int nid) return tbl; } -long pnv_pci_link_table_and_group(int node, int num, - struct iommu_table *tbl, - struct iommu_table_group *table_group) -{ - struct iommu_table_group_link *tgl = NULL; - - if (WARN_ON(!tbl || !table_group)) - return -EINVAL; - - tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL, - node); - if (!tgl) - return -ENOMEM; - - tgl->table_group = table_group; - list_add_rcu(&tgl->next, &tbl->it_group_list); - - table_group->tables[num] = tbl; - - return 0; -} - -static void pnv_iommu_table_group_link_free(struct rcu_head *head) -{ - struct iommu_table_group_link *tgl = container_of(head, - struct iommu_table_group_link, rcu); - - kfree(tgl); -} - -void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, - struct iommu_table_group *table_group) -{ - long i; - bool found; - struct iommu_table_group_link *tgl; - - if (!tbl || !table_group) - return; - - /* Remove link to a group from table's list of attached groups */ - found = false; - list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { - if (tgl->table_group == table_group) { - list_del_rcu(&tgl->next); - call_rcu(&tgl->rcu, pnv_iommu_table_group_link_free); - found = true; - break; - } - } - if (WARN_ON(!found)) - return; - - /* Clean a pointer to iommu_table in iommu_table_group::tables[] */ - found = false; - for (i = 0; i < 
IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { - if (table_group->tables[i] == tbl) { - table_group->tables[i] = NULL; - found = true; - break; - } - } - WARN_ON(!found); -} - -void pnv_pci_setup_iommu_table(struct iommu_table *tbl, - void *tce_mem, u64 tce_size, - u64 dma_offset, unsigned page_shift) -{ - tbl->it_blocksize = 16; - tbl->it_base = (unsigned long)tce_mem; - tbl->it_page_shift = page_shift; - tbl->it_offset = dma_offset >> tbl->it_page_shift; - tbl->it_index = 0; - tbl->it_size = tce_size >> 3; - tbl->it_busno = 0; - tbl->it_type = TCE_PCI; -} - void pnv_pci_dma_dev_setup(struct pci_dev *pdev) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index eada4b6068cb..8b37b28e3831 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -88,7 +88,6 @@ struct pnv_ioda_pe { }; #define PNV_PHB_FLAG_EEH (1 << 0) -#define PNV_PHB_FLAG_CXL (1 << 1) /* Real PHB supporting the cxl kernel API */ struct pnv_phb { struct pci_controller *hose; @@ -194,20 +193,10 @@ struct pnv_phb { bool nmmu_flush; } npu; -#ifdef CONFIG_CXL_BASE - struct cxl_afu *cxl_afu; -#endif int p2p_target_count; }; extern struct pci_ops pnv_pci_ops; -extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, - unsigned long uaddr, enum dma_data_direction direction, - unsigned long attrs); -extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); -extern int pnv_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction); -extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index); void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, unsigned char *log_buff); @@ -217,14 +206,6 @@ int pnv_pci_cfg_write(struct pci_dn *pdn, int where, int size, u32 val); extern struct iommu_table *pnv_pci_table_alloc(int nid); -extern long pnv_pci_link_table_and_group(int node, int num, - struct 
iommu_table *tbl, - struct iommu_table_group *table_group); -extern void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, - struct iommu_table_group *table_group); -extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, - void *tce_mem, u64 tce_size, - u64 dma_offset, unsigned page_shift); extern void pnv_pci_init_ioda_hub(struct device_node *np); extern void pnv_pci_init_ioda2_phb(struct device_node *np); extern void pnv_pci_init_npu_phb(struct device_node *np); @@ -238,7 +219,6 @@ extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type); extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev); extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq); -extern bool pnv_pci_enable_device_hook(struct pci_dev *dev); extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); extern int pnv_eeh_post_init(void); @@ -262,14 +242,33 @@ extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe); extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe); extern int pnv_npu2_init(struct pnv_phb *phb); -/* cxl functions */ -extern bool pnv_cxl_enable_device_hook(struct pci_dev *dev); -extern void pnv_cxl_disable_device(struct pci_dev *dev); -extern int pnv_cxl_cx4_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type); -extern void pnv_cxl_cx4_teardown_msi_irqs(struct pci_dev *pdev); +/* pci-ioda-tce.c */ +#define POWERNV_IOMMU_DEFAULT_LEVELS 1 +#define POWERNV_IOMMU_MAX_LEVELS 5 +extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, + unsigned long uaddr, enum dma_data_direction direction, + unsigned long attrs); +extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); +extern int pnv_tce_xchg(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction, + bool alloc); +extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index, + bool alloc); +extern 
unsigned long pnv_tce_get(struct iommu_table *tbl, long index); + +extern long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, + __u32 page_shift, __u64 window_size, __u32 levels, + bool alloc_userspace_copy, struct iommu_table *tbl); +extern void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl); -/* phb ops (cxl switches these when enabling the kernel api on the phb) */ -extern const struct pci_controller_ops pnv_cxl_cx4_ioda_controller_ops; +extern long pnv_pci_link_table_and_group(int node, int num, + struct iommu_table *tbl, + struct iommu_table_group *table_group); +extern void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, + struct iommu_table_group *table_group); +extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, + void *tce_mem, u64 tce_size, + u64 dma_offset, unsigned int page_shift); #endif /* __POWERNV_PCI_H */ diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index f96df0a25d05..adddde023622 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -78,6 +78,12 @@ static void init_fw_feat_flags(struct device_node *np) if (fw_feature_is("enabled", "fw-count-cache-disabled", np)) security_ftr_set(SEC_FTR_COUNT_CACHE_DISABLED); + if (fw_feature_is("enabled", "fw-count-cache-flush-bcctr2,0,0", np)) + security_ftr_set(SEC_FTR_BCCTR_FLUSH_ASSIST); + + if (fw_feature_is("enabled", "needs-count-cache-flush-on-context-switch", np)) + security_ftr_set(SEC_FTR_FLUSH_COUNT_CACHE); + /* * The features below are enabled by default, so we instead look to see * if firmware has *disabled* them, and clear them if so. 
@@ -124,7 +130,7 @@ static void pnv_setup_rfi_flush(void) security_ftr_enabled(SEC_FTR_L1D_FLUSH_HV)); setup_rfi_flush(type, enable); - setup_barrier_nospec(); + setup_count_cache_flush(); } static void __init pnv_setup_arch(void) @@ -314,7 +320,7 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) u64 reinit_flags; if (xive_enabled()) - xive_kexec_teardown_cpu(secondary); + xive_teardown_cpu(); else xics_kexec_teardown_cpu(secondary); diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index b80909957792..0d354e19ef92 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -283,23 +283,6 @@ static void pnv_cause_ipi(int cpu) ic_cause_ipi(cpu); } -static void pnv_p9_dd1_cause_ipi(int cpu) -{ - int this_cpu = get_cpu(); - - /* - * POWER9 DD1 has a global addressed msgsnd, but for now we restrict - * IPIs to same core, because it requires additional synchronization - * for inter-core doorbells which we do not implement. 
- */ - if (cpumask_test_cpu(cpu, cpu_sibling_mask(this_cpu))) - doorbell_global_ipi(cpu); - else - ic_cause_ipi(cpu); - - put_cpu(); -} - static void __init pnv_smp_probe(void) { if (xive_enabled()) @@ -311,14 +294,10 @@ static void __init pnv_smp_probe(void) ic_cause_ipi = smp_ops->cause_ipi; WARN_ON(!ic_cause_ipi); - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) - smp_ops->cause_ipi = pnv_p9_dd1_cause_ipi; - else - smp_ops->cause_ipi = doorbell_global_ipi; - } else { + if (cpu_has_feature(CPU_FTR_ARCH_300)) + smp_ops->cause_ipi = doorbell_global_ipi; + else smp_ops->cause_ipi = pnv_cause_ipi; - } } } diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index ae0100fd35bb..f5493dbdd7ff 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -15,6 +15,7 @@ #include <linux/io.h> #include <linux/dcache.h> #include <linux/mutex.h> +#include <linux/stringify.h> /* * Overview of Virtual Accelerator Switchboard (VAS). |