summaryrefslogtreecommitdiff
path: root/arch/powerpc/platforms/powernv
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-11-16 23:47:46 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2017-11-16 23:47:46 +0300
commit5b0e2cb020085efe202123162502e0b551e49a0e (patch)
tree534bbb4c9f98c2ed9a520e11107029e5df38c3c2 /arch/powerpc/platforms/powernv
parent758f875848d78148cf9a9cdb3ff1ddf29b234056 (diff)
parent3ffa9d9e2a7c10127d8cbf91ea2be15390b450ed (diff)
downloadlinux-5b0e2cb020085efe202123162502e0b551e49a0e.tar.xz
Merge tag 'powerpc-4.15-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux
Pull powerpc updates from Michael Ellerman: "A bit of a small release, I suspect in part due to me travelling for KS. But my backlog of patches to review is smaller than usual, so I think in part folks just didn't send as much this cycle. Non-highlights: - Five fixes for the >128T address space handling, both to fix bugs in our implementation and to bring the semantics exactly into line with x86. Highlights: - Support for a new OPAL call on bare metal machines which gives us a true NMI (ie. is not masked by MSR[EE]=0) for debugging etc. - Support for Power9 DD2 in the CXL driver. - Improvements to machine check handling so that uncorrectable errors can be reported into the generic memory_failure() machinery. - Some fixes and improvements for VPHN, which is used under PowerVM to notify the Linux partition of topology changes. - Plumbing to enable TM (transactional memory) without suspend on some Power9 processors (PPC_FEATURE2_HTM_NO_SUSPEND). - Support for emulating vector loads form cache-inhibited memory, on some Power9 revisions. - Disable the fast-endian switch "syscall" by default (behind a CONFIG), we believe it has never had any users. - A major rework of the API drivers use when initiating and waiting for long running operations performed by OPAL firmware, and changes to the powernv_flash driver to use the new API. - Several fixes for the handling of FP/VMX/VSX while processes are using transactional memory. - Optimisations of TLB range flushes when using the radix MMU on Power9. - Improvements to the VAS facility used to access coprocessors on Power9, and related improvements to the way the NX crypto driver handles requests. - Implementation of PMEM_API and UACCESS_FLUSHCACHE for 64-bit. Thanks to: Alexey Kardashevskiy, Alistair Popple, Allen Pais, Andrew Donnellan, Aneesh Kumar K.V, Arnd Bergmann, Balbir Singh, Benjamin Herrenschmidt, Breno Leitao, Christophe Leroy, Christophe Lombard, Cyril Bur, Frederic Barrat, Gautham R. Shenoy, Geert Uytterhoeven, Guilherme G. Piccoli, Gustavo Romero, Haren Myneni, Joel Stanley, Kamalesh Babulal, Kautuk Consul, Markus Elfring, Masami Hiramatsu, Michael Bringmann, Michael Neuling, Michal Suchanek, Naveen N. Rao, Nicholas Piggin, Oliver O'Halloran, Paul Mackerras, Pedro Miraglia Franco de Carvalho, Philippe Bergheaud, Sandipan Das, Seth Forshee, Shriya, Stephen Rothwell, Stewart Smith, Sukadev Bhattiprolu, Tyrel Datwyler, Vaibhav Jain, Vaidyanathan Srinivasan, and William A. Kennington III" * tag 'powerpc-4.15-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (151 commits) powerpc/64s: Fix Power9 DD2.0 workarounds by adding DD2.1 feature powerpc/64s: Fix masking of SRR1 bits on instruction fault powerpc/64s: mm_context.addr_limit is only used on hash powerpc/64s/radix: Fix 128TB-512TB virtual address boundary case allocation powerpc/64s/hash: Allow MAP_FIXED allocations to cross 128TB boundary powerpc/64s/hash: Fix fork() with 512TB process address space powerpc/64s/hash: Fix 128TB-512TB virtual address boundary case allocation powerpc/64s/hash: Fix 512T hint detection to use >= 128T powerpc: Fix DABR match on hash based systems powerpc/signal: Properly handle return value from uprobe_deny_signal() powerpc/fadump: use kstrtoint to handle sysfs store powerpc/lib: Implement UACCESS_FLUSHCACHE API powerpc/lib: Implement PMEM API powerpc/powernv/npu: Don't explicitly flush nmmu tlb powerpc/powernv/npu: Use flush_all_mm() instead of flush_tlb_mm() powerpc/powernv/idle: Round up latency and residency values powerpc/kprobes: refactor kprobe_lookup_name for safer string operations powerpc/kprobes: Blacklist emulate_update_regs() from kprobes powerpc/kprobes: Do not disable interrupts for optprobes and kprobes_on_ftrace powerpc/kprobes: Disable preemption before invoking probe handler for optprobes ...
Diffstat (limited to 'arch/powerpc/platforms/powernv')
-rw-r--r--arch/powerpc/platforms/powernv/Makefile3
-rw-r--r--arch/powerpc/platforms/powernv/eeh-powernv.c42
-rw-r--r--arch/powerpc/platforms/powernv/npu-dma.c28
-rw-r--r--arch/powerpc/platforms/powernv/opal-async.c180
-rw-r--r--arch/powerpc/platforms/powernv/opal-hmi.c2
-rw-r--r--arch/powerpc/platforms/powernv/opal-irqchip.c8
-rw-r--r--arch/powerpc/platforms/powernv/opal-memory-errors.c2
-rw-r--r--arch/powerpc/platforms/powernv/opal-sensor.c17
-rw-r--r--arch/powerpc/platforms/powernv/opal-wrappers.S5
-rw-r--r--arch/powerpc/platforms/powernv/opal.c2
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda.c29
-rw-r--r--arch/powerpc/platforms/powernv/pci.h4
-rw-r--r--arch/powerpc/platforms/powernv/setup.c26
-rw-r--r--arch/powerpc/platforms/powernv/smp.c59
-rw-r--r--arch/powerpc/platforms/powernv/vas-debug.c209
-rw-r--r--arch/powerpc/platforms/powernv/vas-window.c242
-rw-r--r--arch/powerpc/platforms/powernv/vas.c31
-rw-r--r--arch/powerpc/platforms/powernv/vas.h93
18 files changed, 795 insertions, 187 deletions
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 7a31c26500e6..3732118a0482 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -15,4 +15,5 @@ obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o
obj-$(CONFIG_OPAL_PRD) += opal-prd.o
obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o
-obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o
+obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o
+obj-$(CONFIG_PPC_FTW) += nx-ftw.o
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 8864065eba22..4650fb294e7a 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -41,7 +41,6 @@
#include "powernv.h"
#include "pci.h"
-static bool pnv_eeh_nb_init = false;
static int eeh_event_irq = -EINVAL;
static int pnv_eeh_init(void)
@@ -197,31 +196,31 @@ PNV_EEH_DBGFS_ENTRY(inbB, 0xE10);
* been built. If the I/O cache staff has been built, EEH is
* ready to supply service.
*/
-static int pnv_eeh_post_init(void)
+int pnv_eeh_post_init(void)
{
struct pci_controller *hose;
struct pnv_phb *phb;
int ret = 0;
- /* Register OPAL event notifier */
- if (!pnv_eeh_nb_init) {
- eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR));
- if (eeh_event_irq < 0) {
- pr_err("%s: Can't register OPAL event interrupt (%d)\n",
- __func__, eeh_event_irq);
- return eeh_event_irq;
- }
+ /* Probe devices & build address cache */
+ eeh_probe_devices();
+ eeh_addr_cache_build();
- ret = request_irq(eeh_event_irq, pnv_eeh_event,
- IRQ_TYPE_LEVEL_HIGH, "opal-eeh", NULL);
- if (ret < 0) {
- irq_dispose_mapping(eeh_event_irq);
- pr_err("%s: Can't request OPAL event interrupt (%d)\n",
- __func__, eeh_event_irq);
- return ret;
- }
+ /* Register OPAL event notifier */
+ eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR));
+ if (eeh_event_irq < 0) {
+ pr_err("%s: Can't register OPAL event interrupt (%d)\n",
+ __func__, eeh_event_irq);
+ return eeh_event_irq;
+ }
- pnv_eeh_nb_init = true;
+ ret = request_irq(eeh_event_irq, pnv_eeh_event,
+ IRQ_TYPE_LEVEL_HIGH, "opal-eeh", NULL);
+ if (ret < 0) {
+ irq_dispose_mapping(eeh_event_irq);
+ pr_err("%s: Can't request OPAL event interrupt (%d)\n",
+ __func__, eeh_event_irq);
+ return ret;
}
if (!eeh_enabled())
@@ -367,6 +366,10 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA)
return NULL;
+ /* Skip if we haven't probed yet */
+ if (phb->ioda.pe_rmap[config_addr] == IODA_INVALID_PE)
+ return NULL;
+
/* Initialize eeh device */
edev->class_code = pdn->class_code;
edev->mode &= 0xFFFFFF00;
@@ -1731,7 +1734,6 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn)
static struct eeh_ops pnv_eeh_ops = {
.name = "powernv",
.init = pnv_eeh_init,
- .post_init = pnv_eeh_post_init,
.probe = pnv_eeh_probe,
.set_option = pnv_eeh_set_option,
.get_pe_addr = pnv_eeh_get_pe_addr,
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 2cb6cbea4b3b..f6cbc1a71472 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -395,6 +395,7 @@ struct npu_context {
struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS];
struct mmu_notifier mn;
struct kref kref;
+ bool nmmu_flush;
/* Callback to stop translation requests on a given GPU */
struct npu_context *(*release_cb)(struct npu_context *, void *);
@@ -545,11 +546,13 @@ static void mmio_invalidate(struct npu_context *npu_context, int va,
struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
unsigned long pid = npu_context->mm->context.id;
- /*
- * Unfortunately the nest mmu does not support flushing specific
- * addresses so we have to flush the whole mm.
- */
- flush_tlb_mm(npu_context->mm);
+ if (npu_context->nmmu_flush)
+ /*
+ * Unfortunately the nest mmu does not support flushing specific
+ * addresses so we have to flush the whole mm once before
+ * shooting down the GPU translation.
+ */
+ flush_all_mm(npu_context->mm);
/*
* Loop over all the NPUs this process is active on and launch
@@ -722,6 +725,16 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
return ERR_PTR(-ENODEV);
npu_context->npdev[npu->index][nvlink_index] = npdev;
+ if (!nphb->npu.nmmu_flush) {
+ /*
+ * If we're not explicitly flushing ourselves we need to mark
+ * the thread for global flushes
+ */
+ npu_context->nmmu_flush = false;
+ mm_context_add_copro(mm);
+ } else
+ npu_context->nmmu_flush = true;
+
return npu_context;
}
EXPORT_SYMBOL(pnv_npu2_init_context);
@@ -731,6 +744,9 @@ static void pnv_npu2_release_context(struct kref *kref)
struct npu_context *npu_context =
container_of(kref, struct npu_context, kref);
+ if (!npu_context->nmmu_flush)
+ mm_context_remove_copro(npu_context->mm);
+
npu_context->mm->context.npu_context = NULL;
mmu_notifier_unregister(&npu_context->mn,
npu_context->mm);
@@ -819,6 +835,8 @@ int pnv_npu2_init(struct pnv_phb *phb)
static int npu_index;
uint64_t rc = 0;
+ phb->npu.nmmu_flush =
+ of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush");
for_each_child_of_node(phb->hose->dn, dn) {
gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn));
if (gpdev) {
diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c
index cf33769a7b72..18a355fa15e8 100644
--- a/arch/powerpc/platforms/powernv/opal-async.c
+++ b/arch/powerpc/platforms/powernv/opal-async.c
@@ -1,7 +1,7 @@
/*
* PowerNV OPAL asynchronous completion interfaces
*
- * Copyright 2013 IBM Corp.
+ * Copyright 2013-2017 IBM Corp.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -23,40 +23,50 @@
#include <asm/machdep.h>
#include <asm/opal.h>
-#define N_ASYNC_COMPLETIONS 64
+enum opal_async_token_state {
+ ASYNC_TOKEN_UNALLOCATED = 0,
+ ASYNC_TOKEN_ALLOCATED,
+ ASYNC_TOKEN_DISPATCHED,
+ ASYNC_TOKEN_ABANDONED,
+ ASYNC_TOKEN_COMPLETED
+};
+
+struct opal_async_token {
+ enum opal_async_token_state state;
+ struct opal_msg response;
+};
-static DECLARE_BITMAP(opal_async_complete_map, N_ASYNC_COMPLETIONS) = {~0UL};
-static DECLARE_BITMAP(opal_async_token_map, N_ASYNC_COMPLETIONS);
static DECLARE_WAIT_QUEUE_HEAD(opal_async_wait);
static DEFINE_SPINLOCK(opal_async_comp_lock);
static struct semaphore opal_async_sem;
-static struct opal_msg *opal_async_responses;
static unsigned int opal_max_async_tokens;
+static struct opal_async_token *opal_async_tokens;
-int __opal_async_get_token(void)
+static int __opal_async_get_token(void)
{
unsigned long flags;
- int token;
+ int i, token = -EBUSY;
spin_lock_irqsave(&opal_async_comp_lock, flags);
- token = find_first_bit(opal_async_complete_map, opal_max_async_tokens);
- if (token >= opal_max_async_tokens) {
- token = -EBUSY;
- goto out;
- }
- if (__test_and_set_bit(token, opal_async_token_map)) {
- token = -EBUSY;
- goto out;
+ for (i = 0; i < opal_max_async_tokens; i++) {
+ if (opal_async_tokens[i].state == ASYNC_TOKEN_UNALLOCATED) {
+ opal_async_tokens[i].state = ASYNC_TOKEN_ALLOCATED;
+ token = i;
+ break;
+ }
}
- __clear_bit(token, opal_async_complete_map);
-
-out:
spin_unlock_irqrestore(&opal_async_comp_lock, flags);
return token;
}
+/*
+ * Note: If the returned token is used in an opal call and opal returns
+ * OPAL_ASYNC_COMPLETION you MUST call one of opal_async_wait_response() or
+ * opal_async_wait_response_interruptible() at least once before calling another
+ * opal_async_* function
+ */
int opal_async_get_token_interruptible(void)
{
int token;
@@ -73,9 +83,10 @@ int opal_async_get_token_interruptible(void)
}
EXPORT_SYMBOL_GPL(opal_async_get_token_interruptible);
-int __opal_async_release_token(int token)
+static int __opal_async_release_token(int token)
{
unsigned long flags;
+ int rc;
if (token < 0 || token >= opal_max_async_tokens) {
pr_err("%s: Passed token is out of range, token %d\n",
@@ -84,11 +95,26 @@ int __opal_async_release_token(int token)
}
spin_lock_irqsave(&opal_async_comp_lock, flags);
- __set_bit(token, opal_async_complete_map);
- __clear_bit(token, opal_async_token_map);
+ switch (opal_async_tokens[token].state) {
+ case ASYNC_TOKEN_COMPLETED:
+ case ASYNC_TOKEN_ALLOCATED:
+ opal_async_tokens[token].state = ASYNC_TOKEN_UNALLOCATED;
+ rc = 0;
+ break;
+ /*
+ * DISPATCHED and ABANDONED tokens must wait for OPAL to respond.
+ * Mark a DISPATCHED token as ABANDONED so that the response handling
+ * code knows no one cares and that it can free it then.
+ */
+ case ASYNC_TOKEN_DISPATCHED:
+ opal_async_tokens[token].state = ASYNC_TOKEN_ABANDONED;
+ /* Fall through */
+ default:
+ rc = 1;
+ }
spin_unlock_irqrestore(&opal_async_comp_lock, flags);
- return 0;
+ return rc;
}
int opal_async_release_token(int token)
@@ -96,12 +122,10 @@ int opal_async_release_token(int token)
int ret;
ret = __opal_async_release_token(token);
- if (ret)
- return ret;
-
- up(&opal_async_sem);
+ if (!ret)
+ up(&opal_async_sem);
- return 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(opal_async_release_token);
@@ -117,22 +141,83 @@ int opal_async_wait_response(uint64_t token, struct opal_msg *msg)
return -EINVAL;
}
- /* Wakeup the poller before we wait for events to speed things
+ /*
+ * There is no need to mark the token as dispatched, wait_event()
+ * will block until the token completes.
+ *
+ * Wakeup the poller before we wait for events to speed things
* up on platforms or simulators where the interrupts aren't
* functional.
*/
opal_wake_poller();
- wait_event(opal_async_wait, test_bit(token, opal_async_complete_map));
- memcpy(msg, &opal_async_responses[token], sizeof(*msg));
+ wait_event(opal_async_wait, opal_async_tokens[token].state
+ == ASYNC_TOKEN_COMPLETED);
+ memcpy(msg, &opal_async_tokens[token].response, sizeof(*msg));
return 0;
}
EXPORT_SYMBOL_GPL(opal_async_wait_response);
+int opal_async_wait_response_interruptible(uint64_t token, struct opal_msg *msg)
+{
+ unsigned long flags;
+ int ret;
+
+ if (token >= opal_max_async_tokens) {
+ pr_err("%s: Invalid token passed\n", __func__);
+ return -EINVAL;
+ }
+
+ if (!msg) {
+ pr_err("%s: Invalid message pointer passed\n", __func__);
+ return -EINVAL;
+ }
+
+ /*
+ * The first time this gets called we mark the token as DISPATCHED
+ * so that if wait_event_interruptible() returns not zero and the
+ * caller frees the token, we know not to actually free the token
+ * until the response comes.
+ *
+ * Only change if the token is ALLOCATED - it may have been
+ * completed even before the caller gets around to calling this
+ * the first time.
+ *
+ * There is also a dirty great comment at the token allocation
+ * function that if the opal call returns OPAL_ASYNC_COMPLETION to
+ * the caller then the caller *must* call this or the not
+ * interruptible version before doing anything else with the
+ * token.
+ */
+ if (opal_async_tokens[token].state == ASYNC_TOKEN_ALLOCATED) {
+ spin_lock_irqsave(&opal_async_comp_lock, flags);
+ if (opal_async_tokens[token].state == ASYNC_TOKEN_ALLOCATED)
+ opal_async_tokens[token].state = ASYNC_TOKEN_DISPATCHED;
+ spin_unlock_irqrestore(&opal_async_comp_lock, flags);
+ }
+
+ /*
+ * Wakeup the poller before we wait for events to speed things
+ * up on platforms or simulators where the interrupts aren't
+ * functional.
+ */
+ opal_wake_poller();
+ ret = wait_event_interruptible(opal_async_wait,
+ opal_async_tokens[token].state ==
+ ASYNC_TOKEN_COMPLETED);
+ if (!ret)
+ memcpy(msg, &opal_async_tokens[token].response, sizeof(*msg));
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(opal_async_wait_response_interruptible);
+
+/* Called from interrupt context */
static int opal_async_comp_event(struct notifier_block *nb,
unsigned long msg_type, void *msg)
{
struct opal_msg *comp_msg = msg;
+ enum opal_async_token_state state;
unsigned long flags;
uint64_t token;
@@ -140,11 +225,17 @@ static int opal_async_comp_event(struct notifier_block *nb,
return 0;
token = be64_to_cpu(comp_msg->params[0]);
- memcpy(&opal_async_responses[token], comp_msg, sizeof(*comp_msg));
spin_lock_irqsave(&opal_async_comp_lock, flags);
- __set_bit(token, opal_async_complete_map);
+ state = opal_async_tokens[token].state;
+ opal_async_tokens[token].state = ASYNC_TOKEN_COMPLETED;
spin_unlock_irqrestore(&opal_async_comp_lock, flags);
+ if (state == ASYNC_TOKEN_ABANDONED) {
+ /* Free the token, no one else will */
+ opal_async_release_token(token);
+ return 0;
+ }
+ memcpy(&opal_async_tokens[token].response, comp_msg, sizeof(*comp_msg));
wake_up(&opal_async_wait);
return 0;
@@ -178,32 +269,23 @@ int __init opal_async_comp_init(void)
}
opal_max_async_tokens = be32_to_cpup(async);
- if (opal_max_async_tokens > N_ASYNC_COMPLETIONS)
- opal_max_async_tokens = N_ASYNC_COMPLETIONS;
+ opal_async_tokens = kcalloc(opal_max_async_tokens,
+ sizeof(*opal_async_tokens), GFP_KERNEL);
+ if (!opal_async_tokens) {
+ err = -ENOMEM;
+ goto out_opal_node;
+ }
err = opal_message_notifier_register(OPAL_MSG_ASYNC_COMP,
&opal_async_comp_nb);
if (err) {
pr_err("%s: Can't register OPAL event notifier (%d)\n",
__func__, err);
+ kfree(opal_async_tokens);
goto out_opal_node;
}
- opal_async_responses = kzalloc(
- sizeof(*opal_async_responses) * opal_max_async_tokens,
- GFP_KERNEL);
- if (!opal_async_responses) {
- pr_err("%s: Out of memory, failed to do asynchronous "
- "completion init\n", __func__);
- err = -ENOMEM;
- goto out_opal_node;
- }
-
- /* Initialize to 1 less than the maximum tokens available, as we may
- * require to pop one during emergency through synchronous call to
- * __opal_async_get_token()
- */
- sema_init(&opal_async_sem, opal_max_async_tokens - 1);
+ sema_init(&opal_async_sem, opal_max_async_tokens);
out_opal_node:
of_node_put(opal_node);
diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c
index d78fed728cdf..c9e1a4ff295c 100644
--- a/arch/powerpc/platforms/powernv/opal-hmi.c
+++ b/arch/powerpc/platforms/powernv/opal-hmi.c
@@ -1,5 +1,5 @@
/*
- * OPAL hypervisor Maintenance interrupt handling support in PowreNV.
+ * OPAL hypervisor Maintenance interrupt handling support in PowerNV.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c
index ecdcba9d1220..9d1b8c0aaf93 100644
--- a/arch/powerpc/platforms/powernv/opal-irqchip.c
+++ b/arch/powerpc/platforms/powernv/opal-irqchip.c
@@ -174,8 +174,14 @@ void opal_event_shutdown(void)
/* First free interrupts, which will also mask them */
for (i = 0; i < opal_irq_count; i++) {
- if (opal_irqs[i])
+ if (!opal_irqs[i])
+ continue;
+
+ if (in_interrupt())
+ disable_irq_nosync(opal_irqs[i]);
+ else
free_irq(opal_irqs[i], NULL);
+
opal_irqs[i] = 0;
}
}
diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c
index 4495f428b500..d9916ea62305 100644
--- a/arch/powerpc/platforms/powernv/opal-memory-errors.c
+++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c
@@ -1,5 +1,5 @@
/*
- * OPAL asynchronus Memory error handling support in PowreNV.
+ * OPAL asynchronus Memory error handling support in PowerNV.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c
index aa267f120033..0a7074bb91dc 100644
--- a/arch/powerpc/platforms/powernv/opal-sensor.c
+++ b/arch/powerpc/platforms/powernv/opal-sensor.c
@@ -19,13 +19,10 @@
*/
#include <linux/delay.h>
-#include <linux/mutex.h>
#include <linux/of_platform.h>
#include <asm/opal.h>
#include <asm/machdep.h>
-static DEFINE_MUTEX(opal_sensor_mutex);
-
/*
* This will return sensor information to driver based on the requested sensor
* handle. A handle is an opaque id for the powernv, read by the driver from the
@@ -38,13 +35,9 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data)
__be32 data;
token = opal_async_get_token_interruptible();
- if (token < 0) {
- pr_err("%s: Couldn't get the token, returning\n", __func__);
- ret = token;
- goto out;
- }
+ if (token < 0)
+ return token;
- mutex_lock(&opal_sensor_mutex);
ret = opal_sensor_read(sensor_hndl, token, &data);
switch (ret) {
case OPAL_ASYNC_COMPLETION:
@@ -52,7 +45,7 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data)
if (ret) {
pr_err("%s: Failed to wait for the async response, %d\n",
__func__, ret);
- goto out_token;
+ goto out;
}
ret = opal_error_code(opal_get_async_rc(msg));
@@ -73,10 +66,8 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data)
break;
}
-out_token:
- mutex_unlock(&opal_sensor_mutex);
- opal_async_release_token(token);
out:
+ opal_async_release_token(token);
return ret;
}
EXPORT_SYMBOL_GPL(opal_get_sensor_data);
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 8c1ede2d3f7e..6f4b00a2ac46 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -94,7 +94,7 @@ opal_return:
* bytes (always BE) since MSR:LE will end up fixed up as a side
* effect of the rfid.
*/
- FIXUP_ENDIAN
+ FIXUP_ENDIAN_HV
ld r2,PACATOC(r13);
lwz r4,8(r1);
ld r5,PPC_LR_STKOFF(r1);
@@ -120,7 +120,7 @@ opal_real_call:
hrfid
opal_return_realmode:
- FIXUP_ENDIAN
+ FIXUP_ENDIAN_HV
ld r2,PACATOC(r13);
lwz r11,8(r1);
ld r12,PPC_LR_STKOFF(r1)
@@ -307,6 +307,7 @@ OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO);
OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO);
OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC);
OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP);
+OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET);
OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT);
OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT);
OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 65c79ecf5a4d..041ddbd1fc57 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -998,6 +998,7 @@ int opal_error_code(int rc)
case OPAL_PARAMETER: return -EINVAL;
case OPAL_ASYNC_COMPLETION: return -EINPROGRESS;
+ case OPAL_BUSY:
case OPAL_BUSY_EVENT: return -EBUSY;
case OPAL_NO_MEM: return -ENOMEM;
case OPAL_PERMISSION: return -EPERM;
@@ -1037,3 +1038,4 @@ EXPORT_SYMBOL_GPL(opal_write_oppanel_async);
/* Export this for KVM */
EXPORT_SYMBOL_GPL(opal_int_set_mfrr);
EXPORT_SYMBOL_GPL(opal_int_eoi);
+EXPORT_SYMBOL_GPL(opal_error_code);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 57f9e55f4352..749055553064 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1002,9 +1002,12 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
}
/*
- * After doing so, there would be a "hole" in the /proc/iomem when
- * offset is a positive value. It looks like the device return some
- * mmio back to the system, which actually no one could use it.
+ * Since M64 BAR shares segments among all possible 256 PEs,
+ * we have to shift the beginning of PF IOV BAR to make it start from
+ * the segment which belongs to the PE number assigned to the first VF.
+ * This creates a "hole" in the /proc/iomem which could be used for
+ * allocating other resources so we reserve this area below and
+ * release when IOV is released.
*/
for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
res = &dev->resource[i + PCI_IOV_RESOURCES];
@@ -1018,7 +1021,22 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n",
i, &res2, res, (offset > 0) ? "En" : "Dis",
num_vfs, offset);
+
+ if (offset < 0) {
+ devm_release_resource(&dev->dev, &pdn->holes[i]);
+ memset(&pdn->holes[i], 0, sizeof(pdn->holes[i]));
+ }
+
pci_update_resource(dev, i + PCI_IOV_RESOURCES);
+
+ if (offset > 0) {
+ pdn->holes[i].start = res2.start;
+ pdn->holes[i].end = res2.start + size * offset - 1;
+ pdn->holes[i].flags = IORESOURCE_BUS;
+ pdn->holes[i].name = "pnv_iov_reserved";
+ devm_request_resource(&dev->dev, res->parent,
+ &pdn->holes[i]);
+ }
}
return 0;
}
@@ -2779,7 +2797,7 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
return -EINVAL;
- if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size))
+ if (!is_power_of_2(window_size))
return -EINVAL;
/* Adjust direct table size from window_size and levels */
@@ -3293,8 +3311,7 @@ static void pnv_pci_ioda_fixup(void)
pnv_pci_ioda_create_dbgfs();
#ifdef CONFIG_EEH
- eeh_init();
- eeh_addr_cache_build();
+ pnv_eeh_post_init();
#endif
}
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index b47f9406d97e..b772d7473896 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -188,6 +188,9 @@ struct pnv_phb {
/* Bitmask for MMIO register usage */
unsigned long mmio_atsd_usage;
+
+ /* Do we need to explicitly flush the nest mmu? */
+ bool nmmu_flush;
} npu;
#ifdef CONFIG_CXL_BASE
@@ -235,6 +238,7 @@ extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
extern bool pnv_pci_enable_device_hook(struct pci_dev *dev);
extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
+extern int pnv_eeh_post_init(void);
extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
const char *fmt, ...);
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index bbb73aa0eb8f..1edfbc1e40f4 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -36,6 +36,7 @@
#include <asm/opal.h>
#include <asm/kexec.h>
#include <asm/smp.h>
+#include <asm/tm.h>
#include "powernv.h"
@@ -290,6 +291,7 @@ static void __init pnv_setup_machdep_opal(void)
ppc_md.restart = pnv_restart;
pm_power_off = pnv_power_off;
ppc_md.halt = pnv_halt;
+ /* ppc_md.system_reset_exception gets filled in by pnv_smp_init() */
ppc_md.machine_check_exception = opal_machine_check;
ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery;
ppc_md.hmi_exception_early = opal_hmi_exception_early;
@@ -311,6 +313,28 @@ static int __init pnv_probe(void)
return 1;
}
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+void __init pnv_tm_init(void)
+{
+ if (!firmware_has_feature(FW_FEATURE_OPAL) ||
+ !pvr_version_is(PVR_POWER9) ||
+ early_cpu_has_feature(CPU_FTR_TM))
+ return;
+
+ if (opal_reinit_cpus(OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED) != OPAL_SUCCESS)
+ return;
+
+ pr_info("Enabling TM (Transactional Memory) with Suspend Disabled\n");
+ cur_cpu_spec->cpu_features |= CPU_FTR_TM;
+ /* Make sure "normal" HTM is off (it should be) */
+ cur_cpu_spec->cpu_user_features2 &= ~PPC_FEATURE2_HTM;
+ /* Turn on no suspend mode, and HTM no SC */
+ cur_cpu_spec->cpu_user_features2 |= PPC_FEATURE2_HTM_NO_SUSPEND | \
+ PPC_FEATURE2_HTM_NOSC;
+ tm_suspend_disabled = true;
+}
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
/*
* Returns the cpu frequency for 'cpu' in Hz. This is used by
* /proc/cpuinfo
@@ -319,7 +343,7 @@ static unsigned long pnv_get_proc_freq(unsigned int cpu)
{
unsigned long ret_freq;
- ret_freq = cpufreq_quick_get(cpu) * 1000ul;
+ ret_freq = cpufreq_get(cpu) * 1000ul;
/*
* If the backend cpufreq driver does not exist,
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index c17f81e433f7..ba030669eca1 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -49,6 +49,13 @@
static void pnv_smp_setup_cpu(int cpu)
{
+ /*
+ * P9 workaround for CI vector load (see traps.c),
+ * enable the corresponding HMI interrupt
+ */
+ if (pvr_version_is(PVR_POWER9))
+ mtspr(SPRN_HMEER, mfspr(SPRN_HMEER) | PPC_BIT(17));
+
if (xive_enabled())
xive_smp_setup_cpu();
else if (cpu != boot_cpuid)
@@ -290,6 +297,54 @@ static void __init pnv_smp_probe(void)
}
}
+static int pnv_system_reset_exception(struct pt_regs *regs)
+{
+ if (smp_handle_nmi_ipi(regs))
+ return 1;
+ return 0;
+}
+
+static int pnv_cause_nmi_ipi(int cpu)
+{
+ int64_t rc;
+
+ if (cpu >= 0) {
+ rc = opal_signal_system_reset(get_hard_smp_processor_id(cpu));
+ if (rc != OPAL_SUCCESS)
+ return 0;
+ return 1;
+
+ } else if (cpu == NMI_IPI_ALL_OTHERS) {
+ bool success = true;
+ int c;
+
+
+ /*
+ * We do not use broadcasts (yet), because it's not clear
+ * exactly what semantics Linux wants or the firmware should
+ * provide.
+ */
+ for_each_online_cpu(c) {
+ if (c == smp_processor_id())
+ continue;
+
+ rc = opal_signal_system_reset(
+ get_hard_smp_processor_id(c));
+ if (rc != OPAL_SUCCESS)
+ success = false;
+ }
+ if (success)
+ return 1;
+
+ /*
+ * Caller will fall back to doorbells, which may pick
+ * up the remainders.
+ */
+ }
+
+ return 0;
+}
+
static struct smp_ops_t pnv_smp_ops = {
.message_pass = NULL, /* Use smp_muxed_ipi_message_pass */
.cause_ipi = NULL, /* Filled at runtime by pnv_smp_probe() */
@@ -308,6 +363,10 @@ static struct smp_ops_t pnv_smp_ops = {
/* This is called very early during platform setup_arch */
void __init pnv_smp_init(void)
{
+ if (opal_check_token(OPAL_SIGNAL_SYSTEM_RESET)) {
+ ppc_md.system_reset_exception = pnv_system_reset_exception;
+ pnv_smp_ops.cause_nmi_ipi = pnv_cause_nmi_ipi;
+ }
smp_ops = &pnv_smp_ops;
#ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/powerpc/platforms/powernv/vas-debug.c b/arch/powerpc/platforms/powernv/vas-debug.c
new file mode 100644
index 000000000000..ca22f1eae050
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas-debug.c
@@ -0,0 +1,209 @@
+/*
+ * Copyright 2016-17 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "vas: " fmt
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include "vas.h"
+
+static struct dentry *vas_debugfs;
+
+static char *cop_to_str(int cop)
+{
+ switch (cop) {
+ case VAS_COP_TYPE_FAULT: return "Fault";
+ case VAS_COP_TYPE_842: return "NX-842 Normal Priority";
+ case VAS_COP_TYPE_842_HIPRI: return "NX-842 High Priority";
+ case VAS_COP_TYPE_GZIP: return "NX-GZIP Normal Priority";
+ case VAS_COP_TYPE_GZIP_HIPRI: return "NX-GZIP High Priority";
+ case VAS_COP_TYPE_FTW: return "Fast Thread-wakeup";
+ default: return "Unknown";
+ }
+}
+
+static int info_dbg_show(struct seq_file *s, void *private)
+{
+ struct vas_window *window = s->private;
+
+ mutex_lock(&vas_mutex);
+
+ /* ensure window is not unmapped */
+ if (!window->hvwc_map)
+ goto unlock;
+
+ seq_printf(s, "Type: %s, %s\n", cop_to_str(window->cop),
+ window->tx_win ? "Send" : "Receive");
+ seq_printf(s, "Pid : %d\n", window->pid);
+
+unlock:
+ mutex_unlock(&vas_mutex);
+ return 0;
+}
+
+static int info_dbg_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, info_dbg_show, inode->i_private);
+}
+
+static const struct file_operations info_fops = {
+ .open = info_dbg_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static inline void print_reg(struct seq_file *s, struct vas_window *win,
+ char *name, u32 reg)
+{
+ seq_printf(s, "0x%016llx %s\n", read_hvwc_reg(win, name, reg), name);
+}
+
+static int hvwc_dbg_show(struct seq_file *s, void *private)
+{
+ struct vas_window *window = s->private;
+
+ mutex_lock(&vas_mutex);
+
+ /* ensure window is not unmapped */
+ if (!window->hvwc_map)
+ goto unlock;
+
+ print_reg(s, window, VREG(LPID));
+ print_reg(s, window, VREG(PID));
+ print_reg(s, window, VREG(XLATE_MSR));
+ print_reg(s, window, VREG(XLATE_LPCR));
+ print_reg(s, window, VREG(XLATE_CTL));
+ print_reg(s, window, VREG(AMR));
+ print_reg(s, window, VREG(SEIDR));
+ print_reg(s, window, VREG(FAULT_TX_WIN));
+ print_reg(s, window, VREG(OSU_INTR_SRC_RA));
+ print_reg(s, window, VREG(HV_INTR_SRC_RA));
+ print_reg(s, window, VREG(PSWID));
+ print_reg(s, window, VREG(LFIFO_BAR));
+ print_reg(s, window, VREG(LDATA_STAMP_CTL));
+ print_reg(s, window, VREG(LDMA_CACHE_CTL));
+ print_reg(s, window, VREG(LRFIFO_PUSH));
+ print_reg(s, window, VREG(CURR_MSG_COUNT));
+ print_reg(s, window, VREG(LNOTIFY_AFTER_COUNT));
+ print_reg(s, window, VREG(LRX_WCRED));
+ print_reg(s, window, VREG(LRX_WCRED_ADDER));
+ print_reg(s, window, VREG(TX_WCRED));
+ print_reg(s, window, VREG(TX_WCRED_ADDER));
+ print_reg(s, window, VREG(LFIFO_SIZE));
+ print_reg(s, window, VREG(WINCTL));
+ print_reg(s, window, VREG(WIN_STATUS));
+ print_reg(s, window, VREG(WIN_CTX_CACHING_CTL));
+ print_reg(s, window, VREG(TX_RSVD_BUF_COUNT));
+ print_reg(s, window, VREG(LRFIFO_WIN_PTR));
+ print_reg(s, window, VREG(LNOTIFY_CTL));
+ print_reg(s, window, VREG(LNOTIFY_PID));
+ print_reg(s, window, VREG(LNOTIFY_LPID));
+ print_reg(s, window, VREG(LNOTIFY_TID));
+ print_reg(s, window, VREG(LNOTIFY_SCOPE));
+ print_reg(s, window, VREG(NX_UTIL_ADDER));
+unlock:
+ mutex_unlock(&vas_mutex);
+ return 0;
+}
+
+static int hvwc_dbg_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, hvwc_dbg_show, inode->i_private);
+}
+
+static const struct file_operations hvwc_fops = {
+ .open = hvwc_dbg_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+void vas_window_free_dbgdir(struct vas_window *window)
+{
+ if (window->dbgdir) {
+ debugfs_remove_recursive(window->dbgdir);
+ kfree(window->dbgname);
+ window->dbgdir = NULL;
+ window->dbgname = NULL;
+ }
+}
+
+void vas_window_init_dbgdir(struct vas_window *window)
+{
+ struct dentry *f, *d;
+
+ if (!window->vinst->dbgdir)
+ return;
+
+ window->dbgname = kzalloc(16, GFP_KERNEL);
+ if (!window->dbgname)
+ return;
+
+ snprintf(window->dbgname, 16, "w%d", window->winid);
+
+ d = debugfs_create_dir(window->dbgname, window->vinst->dbgdir);
+ if (IS_ERR(d))
+ goto free_name;
+
+ window->dbgdir = d;
+
+ f = debugfs_create_file("info", 0444, d, window, &info_fops);
+ if (IS_ERR(f))
+ goto remove_dir;
+
+ f = debugfs_create_file("hvwc", 0444, d, window, &hvwc_fops);
+ if (IS_ERR(f))
+ goto remove_dir;
+
+ return;
+
+free_name:
+ kfree(window->dbgname);
+ window->dbgname = NULL;
+
+remove_dir:
+ debugfs_remove_recursive(window->dbgdir);
+ window->dbgdir = NULL;
+}
+
+void vas_instance_init_dbgdir(struct vas_instance *vinst)
+{
+ struct dentry *d;
+
+ if (!vas_debugfs)
+ return;
+
+ vinst->dbgname = kzalloc(16, GFP_KERNEL);
+ if (!vinst->dbgname)
+ return;
+
+ snprintf(vinst->dbgname, 16, "v%d", vinst->vas_id);
+
+ d = debugfs_create_dir(vinst->dbgname, vas_debugfs);
+ if (IS_ERR(d))
+ goto free_name;
+
+ vinst->dbgdir = d;
+ return;
+
+free_name:
+ kfree(vinst->dbgname);
+ vinst->dbgname = NULL;
+ vinst->dbgdir = NULL;
+}
+
+void vas_init_dbgdir(void)
+{
+ vas_debugfs = debugfs_create_dir("vas", NULL);
+ if (IS_ERR(vas_debugfs))
+ vas_debugfs = NULL;
+}
diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c
index 5aae845b8cd9..2b3eb01ab110 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -16,7 +16,8 @@
#include <linux/log2.h>
#include <linux/rcupdate.h>
#include <linux/cred.h>
-
+#include <asm/switch_to.h>
+#include <asm/ppc-opcode.h>
#include "vas.h"
#include "copy-paste.h"
@@ -40,6 +41,16 @@ static void compute_paste_address(struct vas_window *window, u64 *addr, int *len
pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr);
}
+u64 vas_win_paste_addr(struct vas_window *win)
+{
+ u64 addr;
+
+ compute_paste_address(win, &addr, NULL);
+
+ return addr;
+}
+EXPORT_SYMBOL(vas_win_paste_addr);
+
static inline void get_hvwc_mmio_bar(struct vas_window *window,
u64 *start, int *len)
{
@@ -145,23 +156,37 @@ static void unmap_paste_region(struct vas_window *window)
}
/*
- * Unmap the MMIO regions for a window.
+ * Unmap the MMIO regions for a window. Hold the vas_mutex so we don't
+ * unmap when the window's debugfs dir is in use. This serializes close
+ * of a window even on another VAS instance but since its not a critical
+ * path, just minimize the time we hold the mutex for now. We can add
+ * a per-instance mutex later if necessary.
*/
static void unmap_winctx_mmio_bars(struct vas_window *window)
{
int len;
+ void *uwc_map;
+ void *hvwc_map;
u64 busaddr_start;
- if (window->hvwc_map) {
+ mutex_lock(&vas_mutex);
+
+ hvwc_map = window->hvwc_map;
+ window->hvwc_map = NULL;
+
+ uwc_map = window->uwc_map;
+ window->uwc_map = NULL;
+
+ mutex_unlock(&vas_mutex);
+
+ if (hvwc_map) {
get_hvwc_mmio_bar(window, &busaddr_start, &len);
- unmap_region(window->hvwc_map, busaddr_start, len);
- window->hvwc_map = NULL;
+ unmap_region(hvwc_map, busaddr_start, len);
}
- if (window->uwc_map) {
+ if (uwc_map) {
get_uwc_mmio_bar(window, &busaddr_start, &len);
- unmap_region(window->uwc_map, busaddr_start, len);
- window->uwc_map = NULL;
+ unmap_region(uwc_map, busaddr_start, len);
}
}
@@ -528,6 +553,9 @@ static void vas_window_free(struct vas_window *window)
struct vas_instance *vinst = window->vinst;
unmap_winctx_mmio_bars(window);
+
+ vas_window_free_dbgdir(window);
+
kfree(window);
vas_release_window_id(&vinst->ida, winid);
@@ -552,6 +580,8 @@ static struct vas_window *vas_window_alloc(struct vas_instance *vinst)
if (map_winctx_mmio_bars(window))
goto out_free;
+ vas_window_init_dbgdir(window);
+
return window;
out_free:
@@ -569,6 +599,32 @@ static void put_rx_win(struct vas_window *rxwin)
}
/*
+ * Find the user space receive window given the @pswid.
+ * - We must have a valid vasid and it must belong to this instance.
+ * (so both send and receive windows are on the same VAS instance)
+ * - The window must refer to an OPEN, FTW, RECEIVE window.
+ *
+ * NOTE: We access ->windows[] table and assume that vinst->mutex is held.
+ */
+static struct vas_window *get_user_rxwin(struct vas_instance *vinst, u32 pswid)
+{
+ int vasid, winid;
+ struct vas_window *rxwin;
+
+ decode_pswid(pswid, &vasid, &winid);
+
+ if (vinst->vas_id != vasid)
+ return ERR_PTR(-EINVAL);
+
+ rxwin = vinst->windows[winid];
+
+ if (!rxwin || rxwin->tx_win || rxwin->cop != VAS_COP_TYPE_FTW)
+ return ERR_PTR(-EINVAL);
+
+ return rxwin;
+}
+
+/*
* Get the VAS receive window associated with NX engine identified
* by @cop and if applicable, @pswid.
*
@@ -581,10 +637,10 @@ static struct vas_window *get_vinst_rxwin(struct vas_instance *vinst,
mutex_lock(&vinst->mutex);
- if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI)
- rxwin = vinst->rxwin[cop] ?: ERR_PTR(-EINVAL);
+ if (cop == VAS_COP_TYPE_FTW)
+ rxwin = get_user_rxwin(vinst, pswid);
else
- rxwin = ERR_PTR(-EINVAL);
+ rxwin = vinst->rxwin[cop] ?: ERR_PTR(-EINVAL);
if (!IS_ERR(rxwin))
atomic_inc(&rxwin->num_txwins);
@@ -674,15 +730,18 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin,
winctx->rx_fifo = rxattr->rx_fifo;
winctx->rx_fifo_size = rxattr->rx_fifo_size;
- winctx->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT;
+ winctx->wcreds_max = rxwin->wcreds_max;
winctx->pin_win = rxattr->pin_win;
winctx->nx_win = rxattr->nx_win;
winctx->fault_win = rxattr->fault_win;
+ winctx->user_win = rxattr->user_win;
+ winctx->rej_no_credit = rxattr->rej_no_credit;
winctx->rx_word_mode = rxattr->rx_win_ord_mode;
winctx->tx_word_mode = rxattr->tx_win_ord_mode;
winctx->rx_wcred_mode = rxattr->rx_wcred_mode;
winctx->tx_wcred_mode = rxattr->tx_wcred_mode;
+ winctx->notify_early = rxattr->notify_early;
if (winctx->nx_win) {
winctx->data_stamp = true;
@@ -723,7 +782,10 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin,
static bool rx_win_args_valid(enum vas_cop_type cop,
struct vas_rx_win_attr *attr)
{
- dump_rx_win_attr(attr);
+ pr_debug("Rxattr: fault %d, notify %d, intr %d, early %d, fifo %d\n",
+ attr->fault_win, attr->notify_disable,
+ attr->intr_disable, attr->notify_early,
+ attr->rx_fifo_size);
if (cop >= VAS_COP_TYPE_MAX)
return false;
@@ -735,6 +797,9 @@ static bool rx_win_args_valid(enum vas_cop_type cop,
if (attr->rx_fifo_size > VAS_RX_FIFO_SIZE_MAX)
return false;
+ if (attr->wcreds_max > VAS_RX_WCREDS_MAX)
+ return false;
+
if (attr->nx_win) {
/* cannot be fault or user window if it is nx */
if (attr->fault_win || attr->user_win)
@@ -835,6 +900,7 @@ struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop,
rxwin->nx_win = rxattr->nx_win;
rxwin->user_win = rxattr->user_win;
rxwin->cop = cop;
+ rxwin->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT;
if (rxattr->user_win)
rxwin->pid = task_pid_vnr(current);
@@ -884,21 +950,23 @@ static void init_winctx_for_txwin(struct vas_window *txwin,
*/
memset(winctx, 0, sizeof(struct vas_winctx));
- winctx->wcreds_max = txattr->wcreds_max ?: VAS_WCREDS_DEFAULT;
+ winctx->wcreds_max = txwin->wcreds_max;
winctx->user_win = txattr->user_win;
winctx->nx_win = txwin->rxwin->nx_win;
winctx->pin_win = txattr->pin_win;
+ winctx->rej_no_credit = txattr->rej_no_credit;
+ winctx->rsvd_txbuf_enable = txattr->rsvd_txbuf_enable;
winctx->rx_wcred_mode = txattr->rx_wcred_mode;
winctx->tx_wcred_mode = txattr->tx_wcred_mode;
winctx->rx_word_mode = txattr->rx_win_ord_mode;
winctx->tx_word_mode = txattr->tx_win_ord_mode;
+ winctx->rsvd_txbuf_count = txattr->rsvd_txbuf_count;
- if (winctx->nx_win) {
+ winctx->intr_disable = true;
+ if (winctx->nx_win)
winctx->data_stamp = true;
- winctx->intr_disable = true;
- }
winctx->lpid = txattr->lpid;
winctx->pidr = txattr->pidr;
@@ -921,6 +989,9 @@ static bool tx_win_args_valid(enum vas_cop_type cop,
if (cop > VAS_COP_TYPE_MAX)
return false;
+ if (attr->wcreds_max > VAS_TX_WCREDS_MAX)
+ return false;
+
if (attr->user_win &&
(cop != VAS_COP_TYPE_FTW || attr->rsvd_txbuf_count))
return false;
@@ -940,6 +1011,14 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop,
if (!tx_win_args_valid(cop, attr))
return ERR_PTR(-EINVAL);
+ /*
+ * If caller did not specify a vasid but specified the PSWID of a
+ * receive window (applicable only to FTW windows), use the vasid
+ * from that receive window.
+ */
+ if (vasid == -1 && attr->pswid)
+ decode_pswid(attr->pswid, &vasid, NULL);
+
vinst = find_vas_instance(vasid);
if (!vinst) {
pr_devel("vasid %d not found!\n", vasid);
@@ -958,11 +1037,13 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop,
goto put_rxwin;
}
+ txwin->cop = cop;
txwin->tx_win = 1;
txwin->rxwin = rxwin;
txwin->nx_win = txwin->rxwin->nx_win;
txwin->pid = attr->pid;
txwin->user_win = attr->user_win;
+ txwin->wcreds_max = attr->wcreds_max ?: VAS_WCREDS_DEFAULT;
init_winctx_for_txwin(txwin, attr, &winctx);
@@ -984,6 +1065,14 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop,
}
}
+ /*
+ * Now that we have a send window, ensure context switch issues
+ * CP_ABORT for this thread.
+ */
+ rc = -EINVAL;
+ if (set_thread_uses_vas() < 0)
+ goto free_window;
+
set_vinst_win(vinst, txwin);
return txwin;
@@ -1038,50 +1127,110 @@ int vas_paste_crb(struct vas_window *txwin, int offset, bool re)
else
rc = -EINVAL;
- print_fifo_msg_count(txwin);
+ pr_debug("Txwin #%d: Msg count %llu\n", txwin->winid,
+ read_hvwc_reg(txwin, VREG(LRFIFO_PUSH)));
return rc;
}
EXPORT_SYMBOL_GPL(vas_paste_crb);
+/*
+ * If credit checking is enabled for this window, poll for the return
+ * of window credits (i.e for NX engines to process any outstanding CRBs).
+ * Since NX-842 waits for the CRBs to be processed before closing the
+ * window, we should not have to wait for too long.
+ *
+ * TODO: We retry in 10ms intervals now. We could/should probably peek at
+ * the VAS_LRFIFO_PUSH_OFFSET register to get an estimate of pending
+ * CRBs on the FIFO and compute the delay dynamically on each retry.
+ * But that is not really needed until we support NX-GZIP access from
+ * user space. (NX-842 driver waits for CSB and Fast thread-wakeup
+ * doesn't use credit checking).
+ */
+static void poll_window_credits(struct vas_window *window)
+{
+ u64 val;
+ int creds, mode;
+
+ val = read_hvwc_reg(window, VREG(WINCTL));
+ if (window->tx_win)
+ mode = GET_FIELD(VAS_WINCTL_TX_WCRED_MODE, val);
+ else
+ mode = GET_FIELD(VAS_WINCTL_RX_WCRED_MODE, val);
+
+ if (!mode)
+ return;
+retry:
+ if (window->tx_win) {
+ val = read_hvwc_reg(window, VREG(TX_WCRED));
+ creds = GET_FIELD(VAS_TX_WCRED, val);
+ } else {
+ val = read_hvwc_reg(window, VREG(LRX_WCRED));
+ creds = GET_FIELD(VAS_LRX_WCRED, val);
+ }
+
+ if (creds < window->wcreds_max) {
+ val = 0;
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(msecs_to_jiffies(10));
+ goto retry;
+ }
+}
+
+/*
+ * Wait for the window to go to "not-busy" state. It should only take a
+ * short time to queue a CRB, so window should not be busy for too long.
+ * Trying 5ms intervals.
+ */
static void poll_window_busy_state(struct vas_window *window)
{
int busy;
u64 val;
retry:
- /*
- * Poll Window Busy flag
- */
val = read_hvwc_reg(window, VREG(WIN_STATUS));
busy = GET_FIELD(VAS_WIN_BUSY, val);
if (busy) {
val = 0;
set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(HZ);
+ schedule_timeout(msecs_to_jiffies(5));
goto retry;
}
}
+/*
+ * Have the hardware cast a window out of cache and wait for it to
+ * be completed.
+ *
+ * NOTE: It can take a relatively long time to cast the window context
+ * out of the cache. It is not strictly necessary to cast out if:
+ *
+ * - we clear the "Pin Window" bit (so hardware is free to evict)
+ *
+ * - we re-initialize the window context when it is reassigned.
+ *
+ * We do the former in vas_win_close() and latter in vas_win_open().
+ * So, ignoring the cast-out for now. We can add it as needed. If
+ * casting out becomes necessary we should consider offloading the
+ * job to a worker thread, so the window close can proceed quickly.
+ */
static void poll_window_castout(struct vas_window *window)
{
- int cached;
- u64 val;
+ /* stub for now */
+}
- /* Cast window context out of the cache */
-retry:
- val = read_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL));
- cached = GET_FIELD(VAS_WIN_CACHE_STATUS, val);
- if (cached) {
- val = 0ULL;
- val = SET_FIELD(VAS_CASTOUT_REQ, val, 1);
- val = SET_FIELD(VAS_PUSH_TO_MEM, val, 0);
- write_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL), val);
+/*
+ * Unpin and close a window so no new requests are accepted and the
+ * hardware can evict this window from cache if necessary.
+ */
+static void unpin_close_window(struct vas_window *window)
+{
+ u64 val;
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(HZ);
- goto retry;
- }
+ val = read_hvwc_reg(window, VREG(WINCTL));
+ val = SET_FIELD(VAS_WINCTL_PIN, val, 0);
+ val = SET_FIELD(VAS_WINCTL_OPEN, val, 0);
+ write_hvwc_reg(window, VREG(WINCTL), val);
}
/*
@@ -1098,8 +1247,6 @@ retry:
*/
int vas_win_close(struct vas_window *window)
{
- u64 val;
-
if (!window)
return 0;
@@ -1115,11 +1262,9 @@ int vas_win_close(struct vas_window *window)
poll_window_busy_state(window);
- /* Unpin window from cache and close it */
- val = read_hvwc_reg(window, VREG(WINCTL));
- val = SET_FIELD(VAS_WINCTL_PIN, val, 0);
- val = SET_FIELD(VAS_WINCTL_OPEN, val, 0);
- write_hvwc_reg(window, VREG(WINCTL), val);
+ unpin_close_window(window);
+
+ poll_window_credits(window);
poll_window_castout(window);
@@ -1132,3 +1277,12 @@ int vas_win_close(struct vas_window *window)
return 0;
}
EXPORT_SYMBOL_GPL(vas_win_close);
+
+/*
+ * Return a system-wide unique window id for the window @win.
+ */
+u32 vas_win_id(struct vas_window *win)
+{
+ return encode_pswid(win->vinst->vas_id, win->winid);
+}
+EXPORT_SYMBOL_GPL(vas_win_id);
diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c
index 565a4878fefa..c488621dbec3 100644
--- a/arch/powerpc/platforms/powernv/vas.c
+++ b/arch/powerpc/platforms/powernv/vas.c
@@ -18,15 +18,18 @@
#include <linux/of_platform.h>
#include <linux/of_address.h>
#include <linux/of.h>
+#include <asm/prom.h>
#include "vas.h"
-static DEFINE_MUTEX(vas_mutex);
+DEFINE_MUTEX(vas_mutex);
static LIST_HEAD(vas_instances);
+static DEFINE_PER_CPU(int, cpu_vas_id);
+
static int init_vas_instance(struct platform_device *pdev)
{
- int rc, vasid;
+ int rc, cpu, vasid;
struct resource *res;
struct vas_instance *vinst;
struct device_node *dn = pdev->dev.of_node;
@@ -74,10 +77,17 @@ static int init_vas_instance(struct platform_device *pdev)
"paste_win_id_shift 0x%llx\n", pdev->name, vasid,
vinst->paste_base_addr, vinst->paste_win_id_shift);
+ for_each_possible_cpu(cpu) {
+ if (cpu_to_chip_id(cpu) == of_get_ibm_chip_id(dn))
+ per_cpu(cpu_vas_id, cpu) = vasid;
+ }
+
mutex_lock(&vas_mutex);
list_add(&vinst->node, &vas_instances);
mutex_unlock(&vas_mutex);
+ vas_instance_init_dbgdir(vinst);
+
dev_set_drvdata(&pdev->dev, vinst);
return 0;
@@ -98,6 +108,10 @@ struct vas_instance *find_vas_instance(int vasid)
struct vas_instance *vinst;
mutex_lock(&vas_mutex);
+
+ if (vasid == -1)
+ vasid = per_cpu(cpu_vas_id, smp_processor_id());
+
list_for_each(ent, &vas_instances) {
vinst = list_entry(ent, struct vas_instance, node);
if (vinst->vas_id == vasid) {
@@ -111,6 +125,17 @@ struct vas_instance *find_vas_instance(int vasid)
return NULL;
}
+int chip_to_vas_id(int chipid)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu_to_chip_id(cpu) == chipid)
+ return per_cpu(cpu_vas_id, cpu);
+ }
+ return -1;
+}
+
static int vas_probe(struct platform_device *pdev)
{
return init_vas_instance(pdev);
@@ -134,6 +159,8 @@ static int __init vas_init(void)
int found = 0;
struct device_node *dn;
+ vas_init_dbgdir();
+
platform_driver_register(&vas_driver);
for_each_compatible_node(dn, NULL, "ibm,vas") {
diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h
index 38dee5d50f31..ae0100fd35bb 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -13,6 +13,8 @@
#include <linux/idr.h>
#include <asm/vas.h>
#include <linux/io.h>
+#include <linux/dcache.h>
+#include <linux/mutex.h>
/*
* Overview of Virtual Accelerator Switchboard (VAS).
@@ -106,8 +108,8 @@
*
* TODO: Needs tuning for per-process credits
*/
-#define VAS_WCREDS_MIN 16
-#define VAS_WCREDS_MAX ((64 << 10) - 1)
+#define VAS_RX_WCREDS_MAX ((64 << 10) - 1)
+#define VAS_TX_WCREDS_MAX ((4 << 10) - 1)
#define VAS_WCREDS_DEFAULT (1 << 10)
/*
@@ -259,6 +261,16 @@
#define VAS_NX_UTIL_ADDER PPC_BITMASK(32, 63)
/*
+ * VREG(x):
+ * Expand a register's short name (eg: LPID) into two parameters:
+ * - the register's short name in string form ("LPID"), and
+ * - the name of the macro (eg: VAS_LPID_OFFSET), defining the
+ * register's offset in the window context
+ */
+#define VREG_SFX(n, s) __stringify(n), VAS_##n##s
+#define VREG(r) VREG_SFX(r, _OFFSET)
+
+/*
* Local Notify Scope Control Register. (Receive windows only).
*/
enum vas_notify_scope {
@@ -307,6 +319,9 @@ struct vas_instance {
struct mutex mutex;
struct vas_window *rxwin[VAS_COP_TYPE_MAX];
struct vas_window *windows[VAS_WINDOWS_PER_CHIP];
+
+ char *dbgname;
+ struct dentry *dbgdir;
};
/*
@@ -322,6 +337,10 @@ struct vas_window {
void *hvwc_map; /* HV window context */
void *uwc_map; /* OS/User window context */
pid_t pid; /* Linux process id of owner */
+ int wcreds_max; /* Window credits */
+
+ char *dbgname;
+ struct dentry *dbgdir;
/* Fields applicable only to send windows */
void *paste_kaddr;
@@ -383,45 +402,23 @@ struct vas_winctx {
enum vas_notify_after_count notify_after_count;
};
-extern struct vas_instance *find_vas_instance(int vasid);
+extern struct mutex vas_mutex;
-/*
- * VREG(x):
- * Expand a register's short name (eg: LPID) into two parameters:
- * - the register's short name in string form ("LPID"), and
- * - the name of the macro (eg: VAS_LPID_OFFSET), defining the
- * register's offset in the window context
- */
-#define VREG_SFX(n, s) __stringify(n), VAS_##n##s
-#define VREG(r) VREG_SFX(r, _OFFSET)
-
-#ifdef vas_debug
-static inline void dump_rx_win_attr(struct vas_rx_win_attr *attr)
-{
- pr_err("fault %d, notify %d, intr %d early %d\n",
- attr->fault_win, attr->notify_disable,
- attr->intr_disable, attr->notify_early);
-
- pr_err("rx_fifo_size %d, max value %d\n",
- attr->rx_fifo_size, VAS_RX_FIFO_SIZE_MAX);
-}
+extern struct vas_instance *find_vas_instance(int vasid);
+extern void vas_init_dbgdir(void);
+extern void vas_instance_init_dbgdir(struct vas_instance *vinst);
+extern void vas_window_init_dbgdir(struct vas_window *win);
+extern void vas_window_free_dbgdir(struct vas_window *win);
static inline void vas_log_write(struct vas_window *win, char *name,
void *regptr, u64 val)
{
if (val)
- pr_err("%swin #%d: %s reg %p, val 0x%016llx\n",
+ pr_debug("%swin #%d: %s reg %p, val 0x%016llx\n",
win->tx_win ? "Tx" : "Rx", win->winid, name,
regptr, val);
}
-#else /* vas_debug */
-
-#define vas_log_write(win, name, reg, val)
-#define dump_rx_win_attr(attr)
-
-#endif /* vas_debug */
-
static inline void write_uwc_reg(struct vas_window *win, char *name,
s32 reg, u64 val)
{
@@ -450,18 +447,32 @@ static inline u64 read_hvwc_reg(struct vas_window *win,
return in_be64(win->hvwc_map+reg);
}
-#ifdef vas_debug
-
-static void print_fifo_msg_count(struct vas_window *txwin)
+/*
+ * Encode/decode the Partition Send Window ID (PSWID) for a window in
+ * a way that we can uniquely identify any window in the system. i.e.
+ * we should be able to locate the 'struct vas_window' given the PSWID.
+ *
+ * Bits Usage
+ * 0:7 VAS id (8 bits)
+ * 8:15 Unused, 0 (3 bits)
+ * 16:31 Window id (16 bits)
+ */
+static inline u32 encode_pswid(int vasid, int winid)
{
- uint64_t read_hvwc_reg(struct vas_window *w, char *n, uint64_t o);
- pr_devel("Winid %d, Msg count %llu\n", txwin->winid,
- (uint64_t)read_hvwc_reg(txwin, VREG(LRFIFO_PUSH)));
-}
-#else /* vas_debug */
+ u32 pswid = 0;
-#define print_fifo_msg_count(window)
+ pswid |= vasid << (31 - 7);
+ pswid |= winid;
-#endif /* vas_debug */
+ return pswid;
+}
+
+static inline void decode_pswid(u32 pswid, int *vasid, int *winid)
+{
+ if (vasid)
+ *vasid = pswid >> (31 - 7) & 0xFF;
+ if (winid)
+ *winid = pswid & 0xFFFF;
+}
#endif /* _VAS_H */