summaryrefslogtreecommitdiff
path: root/arch/powerpc/platforms/pseries
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/platforms/pseries')
-rw-r--r--arch/powerpc/platforms/pseries/Kconfig33
-rw-r--r--arch/powerpc/platforms/pseries/Makefile10
-rw-r--r--arch/powerpc/platforms/pseries/eeh.c1213
-rw-r--r--arch/powerpc/platforms/pseries/eeh_event.c155
-rw-r--r--arch/powerpc/platforms/pseries/hvCall.S131
-rw-r--r--arch/powerpc/platforms/pseries/hvconsole.c74
-rw-r--r--arch/powerpc/platforms/pseries/hvcserver.c251
-rw-r--r--arch/powerpc/platforms/pseries/iommu.c606
-rw-r--r--arch/powerpc/platforms/pseries/lpar.c546
-rw-r--r--arch/powerpc/platforms/pseries/nvram.c148
-rw-r--r--arch/powerpc/platforms/pseries/pci.c141
-rw-r--r--arch/powerpc/platforms/pseries/plpar_wrappers.h110
-rw-r--r--arch/powerpc/platforms/pseries/ras.c352
-rw-r--r--arch/powerpc/platforms/pseries/reconfig.c424
-rw-r--r--arch/powerpc/platforms/pseries/rtasd.c528
-rw-r--r--arch/powerpc/platforms/pseries/scanlog.c235
-rw-r--r--arch/powerpc/platforms/pseries/setup.c642
-rw-r--r--arch/powerpc/platforms/pseries/smp.c474
-rw-r--r--arch/powerpc/platforms/pseries/vio.c274
-rw-r--r--arch/powerpc/platforms/pseries/xics.c748
-rw-r--r--arch/powerpc/platforms/pseries/xics.h34
21 files changed, 7129 insertions, 0 deletions
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
new file mode 100644
index 000000000000..e3fc3407bb1f
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -0,0 +1,33 @@
+
+config PPC_SPLPAR
+ depends on PPC_PSERIES
+ bool "Support for shared-processor logical partitions"
+ default n
+ help
+ Enabling this option will make the kernel run more efficiently
+ on logically-partitioned pSeries systems which use shared
+ processors, that is, which share physical processors between
+ two or more partitions.
+
+config HMT
+ bool "Hardware multithreading"
+ depends on SMP && PPC_PSERIES && BROKEN
+ help
+ This option enables hardware multithreading on RS64 cpus.
+ pSeries systems p620 and p660 have such a cpu type.
+
+config EEH
+ bool "PCI Extended Error Handling (EEH)" if EMBEDDED
+ depends on PPC_PSERIES
+ default y if !EMBEDDED
+
+config SCANLOG
+ tristate "Scanlog dump interface"
+ depends on RTAS_PROC && PPC_PSERIES
+
+config LPARCFG
+ tristate "LPAR Configuration Data"
+ depends on PPC_PSERIES || PPC_ISERIES
+ help
+ Provide system capacity information via human readable
+ <key word>=<value> pairs through a /proc/ppc64/lparcfg interface.
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
new file mode 100644
index 000000000000..06d5ef501218
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -0,0 +1,10 @@
+obj-y := pci.o lpar.o hvCall.o nvram.o reconfig.o \
+ setup.o iommu.o ras.o rtasd.o
+obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_IBMVIO) += vio.o
+obj-$(CONFIG_XICS) += xics.o
+obj-$(CONFIG_SCANLOG) += scanlog.o
+obj-$(CONFIG_EEH) += eeh.o eeh_event.o
+
+obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o
+obj-$(CONFIG_HVCS) += hvcserver.o
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
new file mode 100644
index 000000000000..c8d2a40dc5b4
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -0,0 +1,1213 @@
+/*
+ * eeh.c
+ * Copyright (C) 2001 Dave Engebretsen & Todd Inglett IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/rbtree.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/io.h>
+#include <asm/machdep.h>
+#include <asm/ppc-pci.h>
+#include <asm/rtas.h>
+
+#undef DEBUG
+
+/** Overview:
+ * EEH, or "Extended Error Handling" is a PCI bridge technology for
+ * dealing with PCI bus errors that can't be dealt with within the
+ * usual PCI framework, except by check-stopping the CPU. Systems
+ * that are designed for high-availability/reliability cannot afford
+ * to crash due to a "mere" PCI error, thus the need for EEH.
+ * An EEH-capable bridge operates by converting a detected error
+ * into a "slot freeze", taking the PCI adapter off-line, making
+ * the slot behave, from the OS'es point of view, as if the slot
+ * were "empty": all reads return 0xff's and all writes are silently
+ * ignored. EEH slot isolation events can be triggered by parity
+ * errors on the address or data busses (e.g. during posted writes),
+ * which in turn might be caused by low voltage on the bus, dust,
+ * vibration, humidity, radioactivity or plain-old failed hardware.
+ *
+ * Note, however, that one of the leading causes of EEH slot
+ * freeze events are buggy device drivers, buggy device microcode,
+ * or buggy device hardware. This is because any attempt by the
+ * device to bus-master data to a memory address that is not
+ * assigned to the device will trigger a slot freeze. (The idea
+ * is to prevent devices-gone-wild from corrupting system memory).
+ * Buggy hardware/drivers will have a miserable time co-existing
+ * with EEH.
+ *
+ * Ideally, a PCI device driver, when suspecting that an isolation
+ * event has occurred (e.g. by reading 0xff's), will then ask EEH
+ * whether this is the case, and then take appropriate steps to
+ * reset the PCI slot, the PCI device, and then resume operations.
+ * However, until that day, the checking is done here, with the
+ * eeh_check_failure() routine embedded in the MMIO macros. If
+ * the slot is found to be isolated, an "EEH Event" is synthesized
+ * and sent out for processing.
+ */
+
+/* If a device driver keeps reading an MMIO register in an interrupt
+ * handler after a slot isolation event has occurred, we assume it
+ * is broken and panic. This sets the threshold for how many read
+ * attempts we allow before panicking.
+ */
+#define EEH_MAX_FAILS 100000
+
+/* Misc forward declarations */
+static void eeh_save_bars(struct pci_dev * pdev, struct pci_dn *pdn);
+
+/* RTAS tokens */
+static int ibm_set_eeh_option;
+static int ibm_set_slot_reset;
+static int ibm_read_slot_reset_state;
+static int ibm_read_slot_reset_state2;
+static int ibm_slot_error_detail;
+
+int eeh_subsystem_enabled;
+EXPORT_SYMBOL(eeh_subsystem_enabled);
+
+/* Lock to avoid races due to multiple reports of an error */
+static DEFINE_SPINLOCK(confirm_error_lock);
+
+/* Buffer for reporting slot-error-detail rtas calls */
+static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];
+static DEFINE_SPINLOCK(slot_errbuf_lock);
+static int eeh_error_buf_size;
+
+/* System monitoring statistics */
+static DEFINE_PER_CPU(unsigned long, no_device);
+static DEFINE_PER_CPU(unsigned long, no_dn);
+static DEFINE_PER_CPU(unsigned long, no_cfg_addr);
+static DEFINE_PER_CPU(unsigned long, ignored_check);
+static DEFINE_PER_CPU(unsigned long, total_mmio_ffs);
+static DEFINE_PER_CPU(unsigned long, false_positives);
+static DEFINE_PER_CPU(unsigned long, ignored_failures);
+static DEFINE_PER_CPU(unsigned long, slot_resets);
+
+/**
+ * The pci address cache subsystem. This subsystem places
+ * PCI device address resources into a red-black tree, sorted
+ * according to the address range, so that given only an i/o
+ * address, the corresponding PCI device can be **quickly**
+ * found. It is safe to perform an address lookup in an interrupt
+ * context; this ability is an important feature.
+ *
+ * Currently, the only customer of this code is the EEH subsystem;
+ * thus, this code has been somewhat tailored to suit EEH better.
+ * In particular, the cache does *not* hold the addresses of devices
+ * for which EEH is not enabled.
+ *
+ * (Implementation Note: The RB tree seems to be better/faster
+ * than any hash algo I could think of for this problem, even
+ * with the penalty of slow pointer chases for d-cache misses).
+ */
+struct pci_io_addr_range
+{
+ struct rb_node rb_node; /* linkage into pci_io_addr_cache_root.rb_root */
+ unsigned long addr_lo; /* first address of the range (inclusive) */
+ unsigned long addr_hi; /* last address of the range (inclusive) */
+ struct pci_dev *pcidev; /* device that owns this address range */
+ unsigned int flags; /* resource flags, e.g. IORESOURCE_IO / IORESOURCE_MEM */
+};
+
+static struct pci_io_addr_cache
+{
+ struct rb_root rb_root; /* tree of struct pci_io_addr_range, ordered by address */
+ spinlock_t piar_lock; /* taken (irqsave) around lookup, insert and remove */
+} pci_io_addr_cache_root;
+
+/*
+ * Look up the cached address range that contains @addr and return the
+ * device that owns it.  A reference is taken on the device with
+ * pci_dev_get() before it is returned; NULL is returned when no cached
+ * range covers @addr.  No locking is done here.
+ */
+static inline struct pci_dev *__pci_get_device_by_addr(unsigned long addr)
+{
+	struct rb_node *node = pci_io_addr_cache_root.rb_root.rb_node;
+
+	while (node) {
+		struct pci_io_addr_range *range =
+			rb_entry(node, struct pci_io_addr_range, rb_node);
+
+		if (addr < range->addr_lo) {
+			node = node->rb_left;
+			continue;
+		}
+		if (addr > range->addr_hi) {
+			node = node->rb_right;
+			continue;
+		}
+
+		/* addr_lo <= addr <= addr_hi: this range owns the address */
+		pci_dev_get(range->pcidev);
+		return range->pcidev;
+	}
+
+	return NULL;
+}
+
+/**
+ * pci_get_device_by_addr - Get device, given only address
+ * @addr: mmio (PIO) phys address or i/o port number
+ *
+ * Given an mmio phys address, or a port number, find a pci device
+ * that implements this address. Be sure to pci_dev_put the device
+ * when finished. I/O port numbers are assumed to be offset
+ * from zero (that is, they do *not* have pci_io_addr added in).
+ * It is safe to call this function within an interrupt.
+ *
+ * The lookup runs under piar_lock with interrupts disabled
+ * (spin_lock_irqsave). Returns the matching device, or NULL if no
+ * cached address range contains @addr.
+ */
+static struct pci_dev *pci_get_device_by_addr(unsigned long addr)
+{
+ struct pci_dev *dev;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+ dev = __pci_get_device_by_addr(addr);
+ spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+ return dev;
+}
+
+#ifdef DEBUG
+/*
+ * Handy-dandy debug print routine, does nothing more
+ * than print out the contents of our addr cache.
+ *
+ * Walks @cache from rb_first() with rb_next() and prints, for each
+ * range, whether it is i/o or mem (from the IORESOURCE_IO flag), a
+ * running index, the address bounds and the owning device name.
+ * Only compiled when DEBUG is defined; takes no lock itself.
+ */
+static void pci_addr_cache_print(struct pci_io_addr_cache *cache)
+{
+ struct rb_node *n;
+ int cnt = 0;
+
+ n = rb_first(&cache->rb_root);
+ while (n) {
+ struct pci_io_addr_range *piar;
+ piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+ printk(KERN_DEBUG "PCI: %s addr range %d [%lx-%lx]: %s\n",
+ (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt,
+ piar->addr_lo, piar->addr_hi, pci_name(piar->pcidev));
+ cnt++;
+ n = rb_next(n);
+ }
+}
+#endif
+
+/*
+ * Insert the address range [alo, ahi] belonging to @dev into the rb tree.
+ *
+ * Returns the newly allocated node.  If the range intersects a node that
+ * is already in the tree, that existing node is returned instead (with a
+ * warning unless it is an exact duplicate for the same device).  Returns
+ * NULL if the GFP_ATOMIC allocation fails.
+ */
+static struct pci_io_addr_range *
+pci_addr_cache_insert(struct pci_dev *dev, unsigned long alo,
+		      unsigned long ahi, unsigned int flags)
+{
+	struct rb_node **link = &pci_io_addr_cache_root.rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct pci_io_addr_range *piar;
+
+	/* Descend to the leaf slot where the new range belongs */
+	while (*link) {
+		parent = *link;
+		piar = rb_entry(parent, struct pci_io_addr_range, rb_node);
+
+		if (ahi < piar->addr_lo) {
+			link = &parent->rb_left;
+			continue;
+		}
+		if (alo > piar->addr_hi) {
+			link = &parent->rb_right;
+			continue;
+		}
+
+		/* Ranges intersect: complain unless it is the very same entry */
+		if (dev != piar->pcidev ||
+		    alo != piar->addr_lo || ahi != piar->addr_hi)
+			printk(KERN_WARNING "PIAR: overlapping address range\n");
+		return piar;
+	}
+
+	piar = kmalloc(sizeof(*piar), GFP_ATOMIC);
+	if (!piar)
+		return NULL;
+
+	piar->addr_lo = alo;
+	piar->addr_hi = ahi;
+	piar->pcidev = dev;
+	piar->flags = flags;
+
+#ifdef DEBUG
+	printk(KERN_DEBUG "PIAR: insert range=[%lx:%lx] dev=%s\n",
+	       alo, ahi, pci_name (dev));
+#endif
+
+	rb_link_node(&piar->rb_node, parent, link);
+	rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root);
+
+	return piar;
+}
+
+/*
+ * Add every usable I/O and memory resource of @dev to the address cache.
+ *
+ * Devices without an OF node, or for which EEH is unsupported or
+ * checking is disabled (EEH_MODE_NOCHECK), are skipped.  The cache owns
+ * one reference on the device for as long as at least one of its ranges
+ * is in the tree; __pci_addr_cache_remove_device() drops it again.
+ *
+ * The reference is kept only if an insert actually left a node owned by
+ * @dev in the tree.  A failed allocation (NULL) or a collision with
+ * another device's range does not count, because the remove path would
+ * then never find a node for @dev and the reference would leak.
+ */
+static void __pci_addr_cache_insert_device(struct pci_dev *dev)
+{
+	struct device_node *dn;
+	struct pci_dn *pdn;
+	int i;
+	int inserted = 0;
+
+	dn = pci_device_to_OF_node(dev);
+	if (!dn) {
+		printk(KERN_WARNING "PCI: no pci dn found for dev=%s\n", pci_name(dev));
+		return;
+	}
+
+	/* Skip any devices for which EEH is not enabled. */
+	pdn = PCI_DN(dn);
+	if (!(pdn->eeh_mode & EEH_MODE_SUPPORTED) ||
+	    pdn->eeh_mode & EEH_MODE_NOCHECK) {
+#ifdef DEBUG
+		printk(KERN_INFO "PCI: skip building address cache for=%s - %s\n",
+		       pci_name(dev), pdn->node->full_name);
+#endif
+		return;
+	}
+
+	/* The cache holds a reference to the device... */
+	pci_dev_get(dev);
+
+	/* Walk resources on this device, poke them into the tree */
+	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
+		unsigned long start = pci_resource_start(dev,i);
+		unsigned long end = pci_resource_end(dev,i);
+		unsigned int flags = pci_resource_flags(dev,i);
+		struct pci_io_addr_range *piar;
+
+		/* We are interested only in bus addresses, not dma or other stuff */
+		if (0 == (flags & (IORESOURCE_IO | IORESOURCE_MEM)))
+			continue;
+		if (start == 0 || ~start == 0 || end == 0 || ~end == 0)
+			continue;
+
+		piar = pci_addr_cache_insert(dev, start, end, flags);
+		if (piar && piar->pcidev == dev)
+			inserted = 1;
+	}
+
+	/* If there was nothing to add, the cache has no reference... */
+	if (!inserted)
+		pci_dev_put(dev);
+}
+
+/**
+ * pci_addr_cache_insert_device - Add a device to the address cache
+ * @dev: PCI device whose I/O addresses we are interested in.
+ *
+ * In order to support the fast lookup of devices based on addresses,
+ * we maintain a cache of devices that can be quickly searched.
+ * This routine adds a device to that cache.
+ *
+ * Locked wrapper: the actual work is done by
+ * __pci_addr_cache_insert_device() while holding piar_lock with
+ * interrupts disabled.
+ */
+static void pci_addr_cache_insert_device(struct pci_dev *dev)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+ __pci_addr_cache_insert_device(dev);
+ spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+}
+
+/*
+ * Drop every address range owned by @dev from the cache tree and free it.
+ * After each erase the in-order walk is started over from the first
+ * node.  If at least one range was removed, the reference the cache held
+ * on @dev is released.  No locking is done here.
+ */
+static inline void __pci_addr_cache_remove_device(struct pci_dev *dev)
+{
+	struct rb_root *root = &pci_io_addr_cache_root.rb_root;
+	struct pci_io_addr_range *piar;
+	struct rb_node *n;
+	int removed = 0;
+
+	for (;;) {
+		/* Find the first remaining range that belongs to this device */
+		for (n = rb_first(root); n; n = rb_next(n)) {
+			piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+			if (piar->pcidev == dev)
+				break;
+		}
+		if (!n)
+			break;
+
+		rb_erase(n, root);
+		removed = 1;
+		kfree(piar);
+	}
+
+	/* The cache no longer holds its reference to this device... */
+	if (removed)
+		pci_dev_put(dev);
+}
+
+/**
+ * pci_addr_cache_remove_device - remove pci device from addr cache
+ * @dev: device to remove
+ *
+ * Remove a device from the addr-cache tree.
+ * This is potentially expensive, since it will walk
+ * the tree multiple times (once per resource).
+ * But so what; device removal doesn't need to be that fast.
+ *
+ * Locked wrapper: the removal itself is done by
+ * __pci_addr_cache_remove_device() while holding piar_lock with
+ * interrupts disabled.
+ */
+static void pci_addr_cache_remove_device(struct pci_dev *dev)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+ __pci_addr_cache_remove_device(dev);
+ spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+}
+
+/**
+ * pci_addr_cache_build - Build a cache of I/O addresses
+ *
+ * Build a cache of pci i/o addresses. This cache will be used to
+ * find the pci device that corresponds to a given address.
+ * This routine scans all pci busses to build the cache.
+ * Must be run late in boot process, after the pci controllers
+ * have been scanned for devices (after all device resources are known).
+ *
+ * Does nothing when the EEH subsystem is not enabled.  For every
+ * non-bridge device the BARs are also saved, provided the device has an
+ * OF node to store them in.
+ */
+void __init pci_addr_cache_build(void)
+{
+	struct device_node *dn;
+	struct pci_dev *dev = NULL;
+
+	if (!eeh_subsystem_enabled)
+		return;
+
+	spin_lock_init(&pci_io_addr_cache_root.piar_lock);
+
+	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+		/* Ignore PCI bridges ( XXX why ??) */
+		if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE)
+			continue;
+
+		pci_addr_cache_insert_device(dev);
+
+		/*
+		 * Save the BAR's; firmware doesn't restore these after EEH
+		 * reset.  PCI_DN() dereferences the node, so a device without
+		 * an OF node must be skipped rather than passed on.
+		 */
+		dn = pci_device_to_OF_node(dev);
+		if (!dn)
+			continue;
+		eeh_save_bars(dev, PCI_DN(dn));
+	}
+
+#ifdef DEBUG
+	/* Verify tree built up above, echo back the list of addrs. */
+	pci_addr_cache_print(&pci_io_addr_cache_root);
+#endif
+}
+
+/* --------------------------------------------------------------- */
+/* Above lies the PCI Address Cache. Below lies the EEH event infrastructure */
+
+/*
+ * Issue the slot-error-detail RTAS call for the slot of @pdn with the
+ * given @severity and, if the call returns 0, hand the filled buffer to
+ * log_error().  slot_errbuf is a single shared buffer, so the whole
+ * sequence runs under slot_errbuf_lock with interrupts disabled.
+ */
+void eeh_slot_error_detail (struct pci_dn *pdn, int severity)
+{
+	unsigned long irq_flags;
+	int status;
+
+	spin_lock_irqsave(&slot_errbuf_lock, irq_flags);
+
+	/* Clear the buffer before firmware fills it in */
+	memset(slot_errbuf, 0, eeh_error_buf_size);
+
+	status = rtas_call(ibm_slot_error_detail, 8, 1, NULL,
+			   pdn->eeh_config_addr,
+			   BUID_HI(pdn->phb->buid),
+			   BUID_LO(pdn->phb->buid),
+			   NULL, 0,
+			   virt_to_phys(slot_errbuf),
+			   eeh_error_buf_size,
+			   severity);
+
+	/* Log the error with the rtas logger */
+	if (!status)
+		log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);
+
+	spin_unlock_irqrestore(&slot_errbuf_lock, irq_flags);
+}
+
+/**
+ * read_slot_reset_state - Read the reset state of a device node's slot
+ * @pdn: pci device node; its eeh_config_addr and phb->buid identify the slot
+ * @rets: array to return results in; rets[0]..rets[2] are used by callers
+ *
+ * Uses the ibm,read-slot-reset-state2 RTAS call (4 outputs) when its
+ * token is known. Otherwise falls back to ibm,read-slot-reset-state
+ * (3 outputs) and presets rets[2] to 0 as fake "PE Unavailable" info.
+ *
+ * Returns the value returned by rtas_call().
+ */
+static int read_slot_reset_state(struct pci_dn *pdn, int rets[])
+{
+ int token, outputs;
+
+ if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) {
+ token = ibm_read_slot_reset_state2;
+ outputs = 4;
+ } else {
+ token = ibm_read_slot_reset_state;
+ rets[2] = 0; /* fake PE Unavailable info */
+ outputs = 3;
+ }
+
+ return rtas_call(token, 3, outputs, rets, pdn->eeh_config_addr,
+ BUID_HI(pdn->phb->buid), BUID_LO(pdn->phb->buid));
+}
+
+/**
+ * eeh_token_to_phys - convert EEH address token to phys address
+ * @token i/o token, should be address in the form 0xA....
+ *
+ * Looks @token up in the init_mm page tables. If a PTE is found, the
+ * result is the page frame address from the PTE combined with the
+ * offset of @token within its page. If no PTE is found, @token is
+ * returned unchanged.
+ */
+static inline unsigned long eeh_token_to_phys(unsigned long token)
+{
+ pte_t *ptep;
+ unsigned long pa;
+
+ ptep = find_linux_pte(init_mm.pgd, token);
+ if (!ptep)
+ return token;
+ pa = pte_pfn(*ptep) << PAGE_SHIFT;
+
+ return pa | (token & (PAGE_SIZE-1));
+}
+
+/**
+ * Return the "partitionable endpoint" (pe) under which this device lies
+ *
+ * Walks up from @dn for as long as the parent exists, has a pci_dn and
+ * has EEH_MODE_SUPPORTED set, and returns the topmost node reached.
+ * Returns @dn itself when its parent does not qualify. @dn must not
+ * be NULL.
+ */
+static struct device_node * find_device_pe(struct device_node *dn)
+{
+ while ((dn->parent) && PCI_DN(dn->parent) &&
+ (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) {
+ dn = dn->parent;
+ }
+ return dn;
+}
+
+/** Mark all devices that are peers of this device as failed.
+ * Setting the flag on every pci_dn in the slot lets later checks see
+ * the failure immediately; this is critical, since some drivers poll
+ * status registers in interrupts ... If a driver is polling,
+ * and the slot is frozen, then the driver can deadlock in
+ * an interrupt context, which is bad.
+ *
+ * NOTE(review): only pci_dn->eeh_mode is updated here; nothing in this
+ * routine touches the device driver itself.
+ *
+ * __eeh_mark_slot() ORs @mode_flag into the eeh_mode of @dn, of each of
+ * its siblings, and recursively of their children. A node without a
+ * pci_dn is skipped together with its children.
+ */
+
+static void __eeh_mark_slot (struct device_node *dn, int mode_flag)
+{
+ while (dn) {
+ if (PCI_DN(dn)) {
+ PCI_DN(dn)->eeh_mode |= mode_flag;
+
+ if (dn->child)
+ __eeh_mark_slot (dn->child, mode_flag);
+ }
+ dn = dn->sibling;
+ }
+}
+
+/*
+ * Set @mode_flag on the partitionable endpoint that contains @dn and on
+ * every node below it.  Takes no lock itself.
+ */
+void eeh_mark_slot (struct device_node *dn, int mode_flag)
+{
+	struct device_node *pe = find_device_pe (dn);
+
+	PCI_DN(pe)->eeh_mode |= mode_flag;
+	__eeh_mark_slot (pe->child, mode_flag);
+}
+
+/*
+ * Clear @mode_flag and reset eeh_check_count on @dn, on each of its
+ * siblings, and recursively on their children.  A node without a pci_dn
+ * is skipped together with its children.
+ */
+static void __eeh_clear_slot (struct device_node *dn, int mode_flag)
+{
+	for (; dn; dn = dn->sibling) {
+		struct pci_dn *pdn = PCI_DN(dn);
+
+		if (!pdn)
+			continue;
+
+		pdn->eeh_mode &= ~mode_flag;
+		pdn->eeh_check_count = 0;
+		if (dn->child)
+			__eeh_clear_slot (dn->child, mode_flag);
+	}
+}
+
+/*
+ * Clear @mode_flag and reset the check counter on the partitionable
+ * endpoint that contains @dn and on every node below it.  The update is
+ * done under confirm_error_lock with interrupts disabled.
+ */
+void eeh_clear_slot (struct device_node *dn, int mode_flag)
+{
+	struct device_node *pe;
+	struct pci_dn *pe_pdn;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&confirm_error_lock, irq_flags);
+
+	pe = find_device_pe (dn);
+	pe_pdn = PCI_DN(pe);
+	pe_pdn->eeh_mode &= ~mode_flag;
+	pe_pdn->eeh_check_count = 0;
+	__eeh_clear_slot (pe->child, mode_flag);
+
+	spin_unlock_irqrestore(&confirm_error_lock, irq_flags);
+}
+
+/**
+ * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze
+ * @dn device node
+ * @dev pci device, if known (may be NULL)
+ *
+ * Check for an EEH failure for the given device node. Call this
+ * routine if the result of a read was all 0xff's and you want to
+ * find out if this is due to an EEH slot freeze. This routine
+ * will query firmware for the EEH status.
+ *
+ * Returns 0 if there has not been an EEH error; otherwise returns
+ * a non-zero value and queues up a slot isolation event notification.
+ * If the slot is already marked isolated, 1 is returned without
+ * querying firmware or queueing another event; after EEH_MAX_FAILS such
+ * repeated checks the kernel panics.
+ *
+ * It is safe to call this routine in an interrupt context.
+ */
+int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
+{
+	int ret;
+	int rets[3];
+	unsigned long flags;
+	struct pci_dn *pdn;
+	int rc = 0;
+	/* @dev is optional, so never hand a NULL device to pci_name() */
+	const char *devname = dev ? pci_name(dev) : "<unknown>";
+
+	__get_cpu_var(total_mmio_ffs)++;
+
+	if (!eeh_subsystem_enabled)
+		return 0;
+
+	if (!dn) {
+		__get_cpu_var(no_dn)++;
+		return 0;
+	}
+	pdn = PCI_DN(dn);
+
+	/* Access to IO BARs might get this far and still not want checking. */
+	if (!(pdn->eeh_mode & EEH_MODE_SUPPORTED) ||
+	    pdn->eeh_mode & EEH_MODE_NOCHECK) {
+		__get_cpu_var(ignored_check)++;
+#ifdef DEBUG
+		printk ("EEH:ignored check (%x) for %s %s\n",
+			pdn->eeh_mode, devname, dn->full_name);
+#endif
+		return 0;
+	}
+
+	if (!pdn->eeh_config_addr) {
+		__get_cpu_var(no_cfg_addr)++;
+		return 0;
+	}
+
+	/* If we already have a pending isolation event for this
+	 * slot, we know it's bad already, we don't need to check.
+	 * Do this checking under a lock; as multiple PCI devices
+	 * in one slot might report errors simultaneously, and we
+	 * only want one error recovery routine running.
+	 */
+	spin_lock_irqsave(&confirm_error_lock, flags);
+	rc = 1;
+	if (pdn->eeh_mode & EEH_MODE_ISOLATED) {
+		pdn->eeh_check_count ++;
+		if (pdn->eeh_check_count >= EEH_MAX_FAILS) {
+			printk (KERN_ERR "EEH: Device driver ignored %d bad reads, panicing\n",
+				pdn->eeh_check_count);
+			dump_stack();
+
+			/* re-read the slot reset state */
+			if (read_slot_reset_state(pdn, rets) != 0)
+				rets[0] = -1; /* reset state unknown */
+
+			/* If we are here, then we hit an infinite loop. Stop. */
+			panic("EEH: MMIO halt (%d) on device:%s\n", rets[0], devname);
+		}
+		goto dn_unlock;
+	}
+
+	/*
+	 * Now test for an EEH failure. This is VERY expensive.
+	 * Note that the eeh_config_addr may be a parent device
+	 * in the case of a device behind a bridge, or it may be
+	 * function zero of a multi-function device.
+	 * In any case they must share a common PHB.
+	 */
+	ret = read_slot_reset_state(pdn, rets);
+
+	/* If the call to firmware failed, punt */
+	if (ret != 0) {
+		printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n",
+		       ret, dn->full_name);
+		__get_cpu_var(false_positives)++;
+		rc = 0;
+		goto dn_unlock;
+	}
+
+	/* If EEH is not supported on this device, punt. */
+	if (rets[1] != 1) {
+		printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n",
+		       ret, dn->full_name);
+		__get_cpu_var(false_positives)++;
+		rc = 0;
+		goto dn_unlock;
+	}
+
+	/* If not the kind of error we know about, punt. */
+	if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) {
+		__get_cpu_var(false_positives)++;
+		rc = 0;
+		goto dn_unlock;
+	}
+
+	/* Note that config-io to empty slots may fail;
+	 * we recognize empty because they don't have children. */
+	if ((rets[0] == 5) && (dn->child == NULL)) {
+		__get_cpu_var(false_positives)++;
+		rc = 0;
+		goto dn_unlock;
+	}
+
+	__get_cpu_var(slot_resets)++;
+
+	/* Avoid repeated reports of this failure, including problems
+	 * with other functions on this device, and functions under
+	 * bridges. */
+	eeh_mark_slot (dn, EEH_MODE_ISOLATED);
+	spin_unlock_irqrestore(&confirm_error_lock, flags);
+
+	eeh_send_failure_event (dn, dev, rets[0], rets[2]);
+
+	/* Most EEH events are due to device driver bugs. Having
+	 * a stack trace will help the device-driver authors figure
+	 * out what happened. So print that out. */
+	if (rets[0] != 5) dump_stack();
+	return 1;
+
+dn_unlock:
+	spin_unlock_irqrestore(&confirm_error_lock, flags);
+	return rc;
+}
+
+EXPORT_SYMBOL_GPL(eeh_dn_check_failure);
+
+/**
+ * eeh_check_failure - check if all 1's data is due to EEH slot freeze
+ * @token i/o token, should be address in the form 0xA....
+ * @val value, should be all 1's (XXX why do we need this arg??)
+ *
+ * Check for an EEH failure at the given token address. Call this
+ * routine if the result of a read was all 0xff's and you want to
+ * find out if this is due to an EEH slot freeze event. This routine
+ * will query firmware for the EEH status.
+ *
+ * Always returns @val unchanged; the result of the failure check is
+ * not reported to the caller. If no device is cached for the address,
+ * the no_device counter is bumped and no check is made. The device
+ * reference obtained from the lookup is dropped before returning.
+ *
+ * Note this routine is safe to call in an interrupt context.
+ */
+unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val)
+{
+ unsigned long addr;
+ struct pci_dev *dev;
+ struct device_node *dn;
+
+ /* Finding the phys addr + pci device; this is pretty quick. */
+ addr = eeh_token_to_phys((unsigned long __force) token);
+ dev = pci_get_device_by_addr(addr);
+ if (!dev) {
+ __get_cpu_var(no_device)++;
+ return val;
+ }
+
+ dn = pci_device_to_OF_node(dev);
+ eeh_dn_check_failure (dn, dev);
+
+ pci_dev_put(dev);
+ return val;
+}
+
+EXPORT_SYMBOL(eeh_check_failure);
+
+/* ------------------------------------------------------------- */
+/* The code below deals with error recovery */
+
+/** Return negative value if a permanent error, else return
+ * a number of milliseconds to wait until the PCI slot is
+ * ready to be used.
+ *
+ * Mapping of the read_slot_reset_state() results:
+ *  - a non-zero return code from read_slot_reset_state() is passed
+ *    through unchanged;
+ *  - rets[1] == 0 (EEH is not supported): -1;
+ *  - rets[0] == 0: 0, nothing to wait for;
+ *  - rets[0] == 5: rets[2] as the number of milliseconds to wait, or
+ *    -1 if rets[2] is 0 (permanently unavailable);
+ *  - any other rets[0]: -1.
+ */
+static int
+eeh_slot_availability(struct pci_dn *pdn)
+{
+ int rc;
+ int rets[3];
+
+ rc = read_slot_reset_state(pdn, rets);
+
+ if (rc) return rc;
+
+ if (rets[1] == 0) return -1; /* EEH is not supported */
+ if (rets[0] == 0) return 0; /* Oll Korrect */
+ if (rets[0] == 5) {
+ if (rets[2] == 0) return -1; /* permanently unavailable */
+ return rets[2]; /* number of millisecs to wait */
+ }
+ return -1;
+}
+
+/** rtas_pci_slot_reset raises/lowers the pci #RST line
+ * pdn: pci device node of the slot; must not be NULL (BUG_ON)
+ * state: 1/0 to raise/lower the #RST
+ *
+ * Clear the EEH-frozen condition on a slot. This routine
+ * asserts the PCI #RST line if the 'state' argument is '1',
+ * and drops the #RST line if 'state' is '0'. This routine is
+ * safe to call in an interrupt context.
+ *
+ * Nothing is returned: a node without a phb, or a non-zero result
+ * from the RTAS call, is only reported with a KERN_WARNING message.
+ */
+
+static void
+rtas_pci_slot_reset(struct pci_dn *pdn, int state)
+{
+ int rc;
+
+ BUG_ON (pdn==NULL);
+
+ if (!pdn->phb) {
+ printk (KERN_WARNING "EEH: in slot reset, device node %s has no phb\n",
+ pdn->node->full_name);
+ return;
+ }
+
+ rc = rtas_call(ibm_set_slot_reset,4,1, NULL,
+ pdn->eeh_config_addr,
+ BUID_HI(pdn->phb->buid),
+ BUID_LO(pdn->phb->buid),
+ state);
+ if (rc) {
+ printk (KERN_WARNING "EEH: Unable to reset the failed slot, (%d) #RST=%d dn=%s\n",
+ rc, state, pdn->node->full_name);
+ return;
+ }
+}
+
+/** rtas_set_slot_reset -- assert the pci #RST line for 1/4 second
+ * pdn -- pci device node of the slot to be reset.
+ *
+ * Sleeps (msleep) while the reset is held and while the bus settles,
+ * so this must be called from a context that may sleep. The
+ * EEH_MODE_ISOLATED flag is cleared for the whole slot before the
+ * reset line is released.
+ */
+
+void
+rtas_set_slot_reset(struct pci_dn *pdn)
+{
+ int i, rc;
+
+ rtas_pci_slot_reset (pdn, 1);
+
+ /* The PCI bus requires that the reset be held high for at least
+ * 100 milliseconds. We wait a bit longer 'just in case'. */
+
+#define PCI_BUS_RST_HOLD_TIME_MSEC 250
+ msleep (PCI_BUS_RST_HOLD_TIME_MSEC);
+
+ /* We might get hit with another EEH freeze as soon as the
+ * pci slot reset line is dropped. Make sure we don't miss
+ * these, and clear the flag now. */
+ eeh_clear_slot (pdn->node, EEH_MODE_ISOLATED);
+
+ rtas_pci_slot_reset (pdn, 0);
+
+ /* After a PCI slot has been reset, the PCI Express spec requires
+ * a 1.5 second idle time for the bus to stabilize, before starting
+ * up traffic. We wait 1.8 seconds. */
+#define PCI_BUS_SETTLE_TIME_MSEC 1800
+ msleep (PCI_BUS_SETTLE_TIME_MSEC);
+
+ /* Now double check with the firmware to make sure the device is
+ * ready to be used; if not, wait for recovery. Give up after 10
+ * polls; each wait is the time firmware asked for plus 100 msec. */
+ for (i=0; i<10; i++) {
+ rc = eeh_slot_availability (pdn);
+ if (rc <= 0) break;
+
+ msleep (rc+100);
+ }
+}
+
+/* ------------------------------------------------------- */
+/** Save and restore of PCI BARs
+ *
+ * Although firmware will set up BARs during boot, it doesn't
+ * set up device BAR's after a device reset, although it will,
+ * if requested, set up bridge configuration. Thus, we need to
+ * configure the PCI devices ourselves.
+ */
+
+/**
+ * __restore_bars - Restore the Base Address Registers
+ * Loads the PCI configuration space base address registers,
+ * the expansion ROM base address, the latency timer, and etc.
+ * from the saved values in the device node.
+ *
+ * Does nothing if @pdn has no phb. Written back, in order:
+ * saved dwords 4..9 (config offsets 0x10-0x27), dword 12, the
+ * PCI_CACHE_LINE_SIZE and PCI_LATENCY_TIMER bytes, and dword 15.
+ *
+ * BYTE_SWAP(OFF) evaluates to 4*(OFF/4) + 3 - (OFF%4), i.e. it mirrors
+ * the position of byte OFF within its 32-bit word, so SAVED_BYTE()
+ * picks a single byte out of the dwords saved in config_space[].
+ * NOTE(review): presumably this compensates for the saved dwords being
+ * held in big-endian memory order -- confirm.
+ */
+static inline void __restore_bars (struct pci_dn *pdn)
+{
+ int i;
+
+ if (NULL==pdn->phb) return;
+ for (i=4; i<10; i++) {
+ rtas_write_config(pdn, i*4, 4, pdn->config_space[i]);
+ }
+
+ /* 12 == Expansion ROM Address */
+ rtas_write_config(pdn, 12*4, 4, pdn->config_space[12]);
+
+#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF))
+#define SAVED_BYTE(OFF) (((u8 *)(pdn->config_space))[BYTE_SWAP(OFF)])
+
+ rtas_write_config (pdn, PCI_CACHE_LINE_SIZE, 1,
+ SAVED_BYTE(PCI_CACHE_LINE_SIZE));
+
+ rtas_write_config (pdn, PCI_LATENCY_TIMER, 1,
+ SAVED_BYTE(PCI_LATENCY_TIMER));
+
+ /* max latency, min grant, interrupt pin and line */
+ rtas_write_config(pdn, 15*4, 4, pdn->config_space[15]);
+}
+
+/**
+ * eeh_restore_bars - restore the PCI config space info
+ *
+ * Restores the saved config space of @pdn, unless it is flagged as a
+ * bridge, and then recurses into every child node of the device.
+ * A NULL @pdn is silently ignored.
+ */
+void eeh_restore_bars(struct pci_dn *pdn)
+{
+	struct device_node *child;
+
+	if (!pdn)
+		return;
+
+	if (!pdn->eeh_is_bridge)
+		__restore_bars (pdn);
+
+	for (child = pdn->node->child; child; child = child->sibling)
+		eeh_restore_bars (PCI_DN(child));
+}
+
+/**
+ * eeh_save_bars - save device bars
+ *
+ * Save the values of the device bars. Unlike the restore
+ * routine, this routine is *not* recursive. This is because
+ * PCI devices are added individually; but, for the restore,
+ * an entire slot is reset at a time.
+ *
+ * Reads the first 16 dwords of @pdev's config space into
+ * @pdn->config_space[] and sets @pdn->eeh_is_bridge when the header
+ * type is PCI_HEADER_TYPE_BRIDGE. Does nothing if either argument
+ * is NULL.
+ */
+static void eeh_save_bars(struct pci_dev * pdev, struct pci_dn *pdn)
+{
+ int i;
+
+ if (!pdev || !pdn )
+ return;
+
+ for (i = 0; i < 16; i++)
+ pci_read_config_dword(pdev, i * 4, &pdn->config_space[i]);
+
+ if (pdev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
+ pdn->eeh_is_bridge = 1;
+}
+
+/*
+ * Issue the "ibm,configure-bridge" RTAS call for the slot identified by
+ * @pdn.  Does nothing when RTAS does not provide that call; a non-zero
+ * result from the call is only logged.
+ */
+void
+rtas_configure_bridge(struct pci_dn *pdn)
+{
+	int cfg_token = rtas_token ("ibm,configure-bridge");
+	int status;
+
+	if (cfg_token == RTAS_UNKNOWN_SERVICE)
+		return;
+
+	status = rtas_call(cfg_token, 3, 1, NULL,
+			   pdn->eeh_config_addr,
+			   BUID_HI(pdn->phb->buid),
+			   BUID_LO(pdn->phb->buid));
+	if (!status)
+		return;
+
+	printk (KERN_WARNING "EEH: Unable to configure device bridge (%d) for %s\n",
+		status, pdn->node->full_name);
+}
+
+/* ------------------------------------------------------------- */
+/* The code below deals with enabling EEH for devices during the
+ * early boot sequence. EEH must be enabled before any PCI probing
+ * can be done.
+ */
+
+#define EEH_ENABLE 1
+
+struct eeh_early_enable_info {
+ unsigned int buid_hi; /* passed to the ibm_set_eeh_option RTAS call by early_enable_eeh() */
+ unsigned int buid_lo; /* passed to the same call, right after buid_hi */
+};
+
+/* Enable eeh for the given device node.
+ *
+ * @dn: device node to probe; its pci_dn is used without a NULL check
+ * @data: struct eeh_early_enable_info with the buid_hi/buid_lo values
+ * handed to firmware
+ *
+ * Resets the node's EEH state (mode and counters), then:
+ * - ignores nodes whose "status" property is present and not "ok",
+ * and nodes lacking class-code, vendor-id or device-id;
+ * - flags "isa" nodes EEH_MODE_NOCHECK and stops there;
+ * - flags display-class devices EEH_MODE_NOCHECK but still tries to
+ * enable EEH in firmware for them;
+ * - calls ibm_set_eeh_option with the first "reg" entry; on success
+ * sets eeh_subsystem_enabled, EEH_MODE_SUPPORTED and
+ * eeh_config_addr; on failure inherits EEH_MODE_SUPPORTED and the
+ * config address from the parent if the parent supports EEH.
+ *
+ * Always returns NULL.
+ */
+static void *early_enable_eeh(struct device_node *dn, void *data)
+{
+ struct eeh_early_enable_info *info = data;
+ int ret;
+ char *status = get_property(dn, "status", NULL);
+ u32 *class_code = (u32 *)get_property(dn, "class-code", NULL);
+ u32 *vendor_id = (u32 *)get_property(dn, "vendor-id", NULL);
+ u32 *device_id = (u32 *)get_property(dn, "device-id", NULL);
+ u32 *regs;
+ int enable;
+ struct pci_dn *pdn = PCI_DN(dn);
+
+ pdn->eeh_mode = 0;
+ pdn->eeh_check_count = 0;
+ pdn->eeh_freeze_count = 0;
+
+ if (status && strcmp(status, "ok") != 0)
+ return NULL; /* ignore devices with bad status */
+
+ /* Ignore bad nodes. */
+ if (!class_code || !vendor_id || !device_id)
+ return NULL;
+
+ /* There is nothing to check on PCI to ISA bridges */
+ if (dn->type && !strcmp(dn->type, "isa")) {
+ pdn->eeh_mode |= EEH_MODE_NOCHECK;
+ return NULL;
+ }
+
+ /*
+ * Now decide if we are going to "Disable" EEH checking
+ * for this device. We still run with the EEH hardware active,
+ * but we won't be checking for ff's. This means a driver
+ * could return bad data (very bad!), an interrupt handler could
+ * hang waiting on status bits that won't change, etc.
+ * But there are a few cases like display devices that make sense.
+ */
+ enable = 1; /* i.e. we will do checking */
+ if ((*class_code >> 16) == PCI_BASE_CLASS_DISPLAY)
+ enable = 0;
+
+ if (!enable)
+ pdn->eeh_mode |= EEH_MODE_NOCHECK;
+
+ /* Ok... see if this device supports EEH. Some do, some don't,
+ * and the only way to find out is to check each and every one. */
+ regs = (u32 *)get_property(dn, "reg", NULL);
+ if (regs) {
+ /* First register entry is addr (00BBSS00) */
+ /* Try to enable eeh */
+ ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL,
+ regs[0], info->buid_hi, info->buid_lo,
+ EEH_ENABLE);
+
+ if (ret == 0) {
+ eeh_subsystem_enabled = 1;
+ pdn->eeh_mode |= EEH_MODE_SUPPORTED;
+ pdn->eeh_config_addr = regs[0];
+#ifdef DEBUG
+ printk(KERN_DEBUG "EEH: %s: eeh enabled\n", dn->full_name);
+#endif
+ } else {
+
+ /* This device doesn't support EEH, but it may have an
+ * EEH parent, in which case we mark it as supported. */
+ if (dn->parent && PCI_DN(dn->parent)
+ && (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) {
+ /* Parent supports EEH. */
+ pdn->eeh_mode |= EEH_MODE_SUPPORTED;
+ pdn->eeh_config_addr = PCI_DN(dn->parent)->eeh_config_addr;
+ return NULL;
+ }
+ }
+ } else {
+ printk(KERN_WARNING "EEH: %s: unable to get reg property.\n",
+ dn->full_name);
+ }
+
+ return NULL;
+}
+
+/*
+ * Initialize EEH by trying to enable it for all of the adapters in the system.
+ * As a side effect we can determine here if eeh is supported at all.
+ * Note that we leave EEH on so failed config cycles won't cause a machine
+ * check.  If a user turns off EEH for a particular adapter they are really
+ * telling Linux to ignore errors.  Some hardware (e.g. POWER5) won't
+ * grant access to a slot if EEH isn't enabled, and so we always enable
+ * EEH for all slots/all devices.
+ *
+ * The eeh-force-off option disables EEH checking globally, for all slots.
+ * Even if force-off is set, the EEH hardware is still enabled, so that
+ * newer systems can boot.
+ *
+ * Returns early, leaving EEH untouched, when there is no /rtas node or
+ * when firmware does not provide ibm,set-eeh-option.
+ */
+void __init eeh_init(void)
+{
+	struct device_node *phb, *np;
+	struct eeh_early_enable_info info;
+
+	spin_lock_init(&confirm_error_lock);
+	spin_lock_init(&slot_errbuf_lock);
+
+	np = of_find_node_by_path("/rtas");
+	if (np == NULL)
+		return;
+	/*
+	 * Only the existence of /rtas matters here; release the reference
+	 * taken by of_find_node_by_path() so it is not leaked.
+	 */
+	of_node_put(np);
+
+	ibm_set_eeh_option = rtas_token("ibm,set-eeh-option");
+	ibm_set_slot_reset = rtas_token("ibm,set-slot-reset");
+	ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2");
+	ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state");
+	ibm_slot_error_detail = rtas_token("ibm,slot-error-detail");
+
+	if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE)
+		return;
+
+	/* Size the error buffer from firmware, clamped to what we allocated. */
+	eeh_error_buf_size = rtas_token("rtas-error-log-max");
+	if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) {
+		eeh_error_buf_size = 1024;
+	}
+	if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) {
+		printk(KERN_WARNING "EEH: rtas-error-log-max is bigger than allocated "
+		      "buffer ! (%d vs %d)\n", eeh_error_buf_size, RTAS_ERROR_LOG_MAX);
+		eeh_error_buf_size = RTAS_ERROR_LOG_MAX;
+	}
+
+	/* Enable EEH for all adapters.  Note that eeh requires buid's */
+	for (phb = of_find_node_by_name(NULL, "pci"); phb;
+	     phb = of_find_node_by_name(phb, "pci")) {
+		unsigned long buid;
+
+		buid = get_phb_buid(phb);
+		if (buid == 0 || PCI_DN(phb) == NULL)
+			continue;
+
+		info.buid_lo = BUID_LO(buid);
+		info.buid_hi = BUID_HI(buid);
+		traverse_pci_devices(phb, early_enable_eeh, &info);
+	}
+
+	if (eeh_subsystem_enabled)
+		printk(KERN_INFO "EEH: PCI Enhanced I/O Error Handling Enabled\n");
+	else
+		printk(KERN_WARNING "EEH: No capable adapters found\n");
+}
+
+/**
+ * eeh_add_device_early - enable EEH for the indicated device_node
+ * @dn: device node for which to set up EEH
+ *
+ * Performs EEH initialization for PCI devices that were added after
+ * system boot (e.g. hotplug, dlpar).  It must run before any i/o is
+ * performed to the adapter (including any config-space i/o).
+ * Whether this actually enables EEH or not for this device depends
+ * on the CEC architecture, type of the device, on earlier boot
+ * command-line arguments & etc.
+ *
+ * Nodes without a pci_dn are silently ignored; a node whose PHB is
+ * missing or has no buid is reported with a warning and a stack dump.
+ */
+void eeh_add_device_early(struct device_node *dn)
+{
+	struct eeh_early_enable_info buid_info;
+	struct pci_controller *hose;
+
+	if (dn == NULL || PCI_DN(dn) == NULL)
+		return;
+
+	hose = PCI_DN(dn)->phb;
+	if (hose != NULL && hose->buid != 0) {
+		buid_info.buid_hi = BUID_HI(hose->buid);
+		buid_info.buid_lo = BUID_LO(hose->buid);
+		early_enable_eeh(dn, &buid_info);
+		return;
+	}
+
+	printk(KERN_WARNING "EEH: Expected buid but found none for %s\n",
+	       dn->full_name);
+	dump_stack();
+}
+EXPORT_SYMBOL_GPL(eeh_add_device_early);
+
+/**
+ * eeh_add_device_late - perform EEH initialization for the indicated pci device
+ * @dev: pci device for which to set up EEH
+ *
+ * Completes EEH initialization for PCI devices that were added after
+ * system boot (e.g. hotplug, dlpar): links the device to its pci_dn,
+ * registers it in the PCI address cache and saves its BARs.
+ * Does nothing when @dev is NULL or EEH is not enabled.
+ */
+void eeh_add_device_late(struct pci_dev *dev)
+{
+	struct pci_dn *pdn;
+
+	if (dev == NULL)
+		return;
+	if (!eeh_subsystem_enabled)
+		return;
+
+#ifdef DEBUG
+	printk(KERN_DEBUG "EEH: adding device %s\n", pci_name(dev));
+#endif
+
+	/* Reference is dropped again in eeh_remove_device(). */
+	pci_dev_get(dev);
+	pdn = PCI_DN(pci_device_to_OF_node(dev));
+	pdn->pcidev = dev;
+
+	pci_addr_cache_insert_device(dev);
+	eeh_save_bars(dev, pdn);
+}
+EXPORT_SYMBOL_GPL(eeh_add_device_late);
+
+/**
+ * eeh_remove_device - undo EEH setup for the indicated pci device
+ * @dev: pci device to be removed
+ *
+ * This routine should be called when a device is removed from a running
+ * system (e.g. by hotplug or dlpar).  It unregisters the device from the
+ * PCI address cache, detaches it from its pci_dn and drops the device
+ * reference.  Does nothing when @dev is NULL or EEH is not enabled.
+ */
+void eeh_remove_device(struct pci_dev *dev)
+{
+	struct pci_dn *pdn;
+
+	if (dev == NULL)
+		return;
+	if (!eeh_subsystem_enabled)
+		return;
+
+#ifdef DEBUG
+	printk(KERN_DEBUG "EEH: remove device %s\n", pci_name(dev));
+#endif
+	/* Unregister the device with the EEH/PCI address search system */
+	pci_addr_cache_remove_device(dev);
+
+	pdn = PCI_DN(pci_device_to_OF_node(dev));
+	pdn->pcidev = NULL;
+	pci_dev_put(dev);
+}
+EXPORT_SYMBOL_GPL(eeh_remove_device);
+
+/*
+ * seq_file show callback for the EEH proc entry: sums the per-cpu EEH
+ * statistics over all cpus and prints them.  When EEH is globally
+ * disabled only the MMIO all-ff's counter is reported.  Always returns 0.
+ */
+static int proc_eeh_show(struct seq_file *m, void *v)
+{
+	unsigned long n_ffs = 0, n_false = 0, n_ignored = 0, n_resets = 0;
+	unsigned long n_nodev = 0, n_nodn = 0, n_nocfg = 0, n_nocheck = 0;
+	unsigned int cpu;
+
+	for_each_cpu(cpu) {
+		n_ffs     += per_cpu(total_mmio_ffs, cpu);
+		n_false   += per_cpu(false_positives, cpu);
+		n_ignored += per_cpu(ignored_failures, cpu);
+		n_resets  += per_cpu(slot_resets, cpu);
+		n_nodev   += per_cpu(no_device, cpu);
+		n_nodn    += per_cpu(no_dn, cpu);
+		n_nocfg   += per_cpu(no_cfg_addr, cpu);
+		n_nocheck += per_cpu(ignored_check, cpu);
+	}
+
+	if (eeh_subsystem_enabled == 0) {
+		seq_printf(m, "EEH Subsystem is globally disabled\n");
+		seq_printf(m, "eeh_total_mmio_ffs=%ld\n", n_ffs);
+		return 0;
+	}
+
+	seq_printf(m, "EEH Subsystem is enabled\n");
+	seq_printf(m,
+		   "no device=%ld\n"
+		   "no device node=%ld\n"
+		   "no config address=%ld\n"
+		   "check not wanted=%ld\n"
+		   "eeh_total_mmio_ffs=%ld\n"
+		   "eeh_false_positives=%ld\n"
+		   "eeh_ignored_failures=%ld\n"
+		   "eeh_slot_resets=%ld\n",
+		   n_nodev, n_nodn, n_nocfg, n_nocheck,
+		   n_ffs, n_false, n_ignored, n_resets);
+	return 0;
+}
+
+/* open() for the EEH proc entry: bind proc_eeh_show as a single-record seq_file. */
+static int proc_eeh_open(struct inode *inode, struct file *file)
+{
+	int rc = single_open(file, proc_eeh_show, NULL);
+
+	return rc;
+}
+
+/*
+ * File operations for the EEH proc entry: opened through proc_eeh_open(),
+ * everything else is delegated to the generic seq_file helpers.
+ */
+static struct file_operations proc_eeh_operations = {
+ .open = proc_eeh_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/*
+ * Register the "ppc64/eeh" proc entry on pSeries platforms.  Failure to
+ * create the entry is tolerated silently.  Always returns 0.
+ */
+static int __init eeh_init_proc(void)
+{
+	struct proc_dir_entry *entry;
+
+	if (!platform_is_pseries())
+		return 0;
+
+	entry = create_proc_entry("ppc64/eeh", 0, NULL);
+	if (entry != NULL)
+		entry->proc_fops = &proc_eeh_operations;
+
+	return 0;
+}
+__initcall(eeh_init_proc);
diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c
new file mode 100644
index 000000000000..92497333c2b6
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/eeh_event.c
@@ -0,0 +1,155 @@
+/*
+ * eeh_event.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (c) 2005 Linas Vepstas <linas@linas.org>
+ */
+
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <asm/eeh_event.h>
+
+/** Overview:
+ * EEH error states may be detected within exception handlers;
+ * however, the recovery processing needs to occur asynchronously
+ * in a normal kernel context and not an interrupt context.
+ * This pair of routines creates an event and queues it onto a
+ * work-queue, where a worker thread can drive recovery.
+ */
+
+/*
+ * EEH event workqueue setup.
+ *
+ * eeh_eventlist holds pending struct eeh_event entries and is protected
+ * by eeh_eventlist_lock (taken with interrupts disabled).  Scheduling
+ * eeh_event_wq runs eeh_thread_launcher(), which starts the thread that
+ * drains the list.
+ */
+static spinlock_t eeh_eventlist_lock = SPIN_LOCK_UNLOCKED;
+LIST_HEAD(eeh_eventlist);
+static void eeh_thread_launcher(void *);
+DECLARE_WORK(eeh_event_wq, eeh_thread_launcher, NULL);
+
+/**
+ * eeh_panic - call panic() for an eeh event that cannot be handled.
+ * The philosophy of this routine is that it is better to panic and
+ * halt the OS than it is to risk possible data corruption by
+ * oblivious device drivers that don't know better.
+ *
+ * @dev pci device that had an eeh event; may be NULL, because
+ *      eeh_send_failure_event() accepts events without a pci_dev
+ * @reset_state current reset state of the device slot
+ */
+static void eeh_panic(struct pci_dev *dev, int reset_state)
+{
+	/* Never hand a NULL device to pci_name(). */
+	const char *name = dev ? pci_name(dev) : "(unknown)";
+
+	/*
+	 * Since the panic_on_oops sysctl is used to halt the system
+	 * in light of potential corruption, we can use it here.
+	 */
+	if (panic_on_oops) {
+		panic("EEH: MMIO failure (%d) on device:%s\n", reset_state,
+		      name);
+	}
+	else {
+		printk(KERN_INFO "EEH: Ignored MMIO failure (%d) on device:%s\n",
+		       reset_state, name);
+	}
+}
+
+/**
+ * eeh_event_handler - dispatch EEH events.  The detection of a frozen
+ * slot can occur inside an interrupt, where it can be hard to do
+ * anything about it.  The goal of this routine is to pull these
+ * detection events out of the context of the interrupt handler, and
+ * re-dispatch them for processing at a later time in a normal context.
+ *
+ * Runs as the "eehd" kernel thread: drains eeh_eventlist one event at a
+ * time and exits (returning 0) as soon as the list is found empty.
+ *
+ * @dummy - unused
+ */
+static int eeh_event_handler(void * dummy)
+{
+	unsigned long flags;
+	struct eeh_event	*event;
+
+	daemonize ("eehd");
+
+	while (1) {
+		/*
+		 * NOTE(review): the task state is set but schedule() is never
+		 * called in this loop -- confirm whether this is intentional.
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		/* Detach the first pending event while holding the list lock. */
+		spin_lock_irqsave(&eeh_eventlist_lock, flags);
+		event = NULL;
+		if (!list_empty(&eeh_eventlist)) {
+			event = list_entry(eeh_eventlist.next, struct eeh_event, list);
+			list_del(&event->list);
+		}
+		spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+		if (event == NULL)
+			break;
+
+		/* Events may have been queued without a pci_dev. */
+		printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
+		       event->dev ? pci_name(event->dev) : "(unknown)");
+
+		eeh_panic (event->dev, event->state);
+
+		/* Balance the pci_dev_get() done in eeh_send_failure_event(). */
+		if (event->dev)
+			pci_dev_put(event->dev);
+		kfree(event);
+	}
+
+	return 0;
+}
+
+/**
+ * eeh_thread_launcher - work callback that starts the EEH event thread
+ *
+ * Spawns eeh_event_handler() as a kernel thread and logs an error if the
+ * thread could not be created.
+ *
+ * @dummy - unused
+ */
+static void eeh_thread_launcher(void *dummy)
+{
+	long rc = kernel_thread(eeh_event_handler, NULL, CLONE_KERNEL);
+
+	if (rc >= 0)
+		return;
+
+	printk(KERN_ERR "Failed to start EEH daemon\n");
+}
+
+/**
+ * eeh_send_failure_event - generate a PCI error event
+ * @dn device node recorded in the event
+ * @dev pci device; may be NULL, otherwise a reference is taken on it
+ * @state value recorded in the event's state field
+ * @time_unavail value recorded in the event's time_unavail field
+ *
+ * This routine can be called within an interrupt context;
+ * the actual event will be delivered in a normal context
+ * (from a workqueue).
+ *
+ * Returns 0 once the event is queued, or 1 if no memory was available
+ * for the event (in which case nothing is queued).
+ */
+int eeh_send_failure_event (struct device_node *dn,
+                            struct pci_dev *dev,
+                            int state,
+                            int time_unavail)
+{
+	struct eeh_event *ev;
+	unsigned long irqflags;
+
+	ev = kmalloc(sizeof(*ev), GFP_ATOMIC);
+	if (!ev) {
+		printk (KERN_ERR "EEH: out of memory, event not handled\n");
+		return 1;
+	}
+
+	ev->dn = dn;
+	ev->dev = dev;
+	ev->state = state;
+	ev->time_unavail = time_unavail;
+
+	/* Keep the device around while the event is pending. */
+	if (dev != NULL)
+		pci_dev_get(dev);
+
+	/* We may or may not be called in an interrupt context */
+	spin_lock_irqsave(&eeh_eventlist_lock, irqflags);
+	list_add(&ev->list, &eeh_eventlist);
+	spin_unlock_irqrestore(&eeh_eventlist_lock, irqflags);
+
+	schedule_work(&eeh_event_wq);
+	return 0;
+}
+
+/********************** END OF FILE ******************************/
diff --git a/arch/powerpc/platforms/pseries/hvCall.S b/arch/powerpc/platforms/pseries/hvCall.S
new file mode 100644
index 000000000000..176e8da76466
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/hvCall.S
@@ -0,0 +1,131 @@
+/*
+ * arch/ppc64/kernel/pSeries_hvCall.S
+ *
+ * This file contains the generic code to perform a call to the
+ * pSeries LPAR hypervisor.
+ * NOTE: this file will go away when we move to inline this work.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/hvcall.h>
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+
+#define STK_PARM(i) (48 + ((i)-3)*8)
+
+ .text
+
+/* long plpar_hcall(unsigned long opcode, R3
+ unsigned long arg1, R4
+ unsigned long arg2, R5
+ unsigned long arg3, R6
+ unsigned long arg4, R7
+ unsigned long *out1, R8
+ unsigned long *out2, R9
+ unsigned long *out3); R10
+
+ Hypervisor call with four inputs and three outputs. The values left
+ in r4-r6 after the call are stored through out1-out3; all three
+ pointers are written unconditionally. r3 carries the status back to
+ the caller.
+ */
+_GLOBAL(plpar_hcall)
+ HMT_MEDIUM
+
+ mfcr r0
+
+ /* Park the output pointers at their STK_PARM() offsets from r1. */
+ std r8,STK_PARM(r8)(r1) /* Save out ptrs */
+ std r9,STK_PARM(r9)(r1)
+ std r10,STK_PARM(r10)(r1)
+
+ /* CR is saved at 8(r1) and restored after the call. */
+ stw r0,8(r1)
+
+ HVSC /* invoke the hypervisor */
+
+ lwz r0,8(r1)
+
+ ld r8,STK_PARM(r8)(r1) /* Fetch r4-r6 ret args */
+ ld r9,STK_PARM(r9)(r1)
+ ld r10,STK_PARM(r10)(r1)
+ std r4,0(r8)
+ std r5,0(r9)
+ std r6,0(r10)
+
+ mtcrf 0xff,r0
+ blr /* return r3 = status */
+
+
+/* Simple interface with no output values (other than status).
+ Registers are passed to the hypervisor as set up by the caller; only
+ the condition register is saved (at 8(r1)) and restored around the call. */
+_GLOBAL(plpar_hcall_norets)
+ HMT_MEDIUM
+
+ mfcr r0
+ stw r0,8(r1)
+
+ HVSC /* invoke the hypervisor */
+
+ lwz r0,8(r1)
+ mtcrf 0xff,r0
+ blr /* return r3 = status */
+
+
+/* long plpar_hcall_8arg_2ret(unsigned long opcode, R3
+ unsigned long arg1, R4
+ unsigned long arg2, R5
+ unsigned long arg3, R6
+ unsigned long arg4, R7
+ unsigned long arg5, R8
+ unsigned long arg6, R9
+ unsigned long arg7, R10
+ unsigned long arg8, 112(R1)
+ unsigned long *out1); 120(R1)
+
+ Eight-input hypervisor call. arg8 and out1 do not fit in registers
+ and are fetched from the stack slots given above. Only r4 is stored
+ back (through out1); the "2ret" in the name presumably counts the r3
+ status as the other return value.
+ */
+_GLOBAL(plpar_hcall_8arg_2ret)
+ HMT_MEDIUM
+
+ mfcr r0
+ ld r11,STK_PARM(r11)(r1) /* put arg8 in R11 */
+ stw r0,8(r1)
+
+ HVSC /* invoke the hypervisor */
+
+ lwz r0,8(r1)
+ /* STK_PARM(r12) is the slot holding out1. */
+ ld r10,STK_PARM(r12)(r1) /* Fetch r4 ret arg */
+ std r4,0(r10)
+ mtcrf 0xff,r0
+ blr /* return r3 = status */
+
+
+/* long plpar_hcall_4out(unsigned long opcode, R3
+ unsigned long arg1, R4
+ unsigned long arg2, R5
+ unsigned long arg3, R6
+ unsigned long arg4, R7
+ unsigned long *out1, R8
+ unsigned long *out2, R9
+ unsigned long *out3, R10
+ unsigned long *out4); 112(R1)
+
+ Same as plpar_hcall but with a fourth output: the values left in
+ r4-r7 after the call are stored through out1-out4. out4 is not
+ passed in a register; it is read from its stack slot after the call.
+ */
+_GLOBAL(plpar_hcall_4out)
+ HMT_MEDIUM
+
+ mfcr r0
+ stw r0,8(r1)
+
+ std r8,STK_PARM(r8)(r1) /* Save out ptrs */
+ std r9,STK_PARM(r9)(r1)
+ std r10,STK_PARM(r10)(r1)
+
+ HVSC /* invoke the hypervisor */
+
+ lwz r0,8(r1)
+
+ ld r8,STK_PARM(r8)(r1) /* Fetch r4-r7 ret args */
+ ld r9,STK_PARM(r9)(r1)
+ ld r10,STK_PARM(r10)(r1)
+ /* out4 comes straight from the slot documented as 112(R1) above. */
+ ld r11,STK_PARM(r11)(r1)
+ std r4,0(r8)
+ std r5,0(r9)
+ std r6,0(r10)
+ std r7,0(r11)
+
+ mtcrf 0xff,r0
+ blr /* return r3 = status */
diff --git a/arch/powerpc/platforms/pseries/hvconsole.c b/arch/powerpc/platforms/pseries/hvconsole.c
new file mode 100644
index 000000000000..138e128a3886
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/hvconsole.c
@@ -0,0 +1,74 @@
+/*
+ * hvconsole.c
+ * Copyright (C) 2004 Hollis Blanchard, IBM Corporation
+ * Copyright (C) 2004 IBM Corporation
+ *
+ * Additional Author(s):
+ * Ryan S. Arnold <rsa@us.ibm.com>
+ *
+ * LPAR console support.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/hvcall.h>
+#include <asm/hvconsole.h>
+
+/**
+ * hvc_get_chars - retrieve characters from firmware for denoted vterm adapter
+ * @vtermno: The vtermno or unit_address of the adapter from which to fetch the
+ *	data.
+ * @buf: The character buffer into which to put the character data fetched from
+ *	firmware.  The hcall writes two unsigned longs into it regardless of
+ *	how many characters were received.
+ * @count: not used by this implementation.
+ *
+ * Returns the character count reported by firmware, or 0 if the hcall
+ * did not succeed.
+ */
+int hvc_get_chars(uint32_t vtermno, char *buf, int count)
+{
+	unsigned long *words = (unsigned long *)buf;
+	unsigned long got;
+	long rc;
+
+	rc = plpar_hcall(H_GET_TERM_CHAR, vtermno, 0, 0, 0, &got,
+			 &words[0], &words[1]);
+	if (rc != H_Success)
+		return 0;
+
+	return got;
+}
+
+EXPORT_SYMBOL(hvc_get_chars);
+
+
+/**
+ * hvc_put_chars: send characters to firmware for denoted vterm adapter
+ * @vtermno: The vtermno or unit_address of the adapter from which the data
+ *	originated.
+ * @buf: The character buffer that contains the character data to send to
+ *	firmware.  Two unsigned longs are always read from it, whatever
+ *	@count is.
+ * @count: Send this number of characters.
+ *
+ * Returns @count on success, 0 if firmware reported busy, -EIO otherwise.
+ */
+int hvc_put_chars(uint32_t vtermno, const char *buf, int count)
+{
+	unsigned long *words = (unsigned long *) buf;
+	long rc = plpar_hcall_norets(H_PUT_TERM_CHAR, vtermno, count,
+				     words[0], words[1]);
+
+	switch (rc) {
+	case H_Success:
+		return count;
+	case H_Busy:
+		return 0;
+	default:
+		return -EIO;
+	}
+}
+
+EXPORT_SYMBOL(hvc_put_chars);
diff --git a/arch/powerpc/platforms/pseries/hvcserver.c b/arch/powerpc/platforms/pseries/hvcserver.c
new file mode 100644
index 000000000000..4d584172055a
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/hvcserver.c
@@ -0,0 +1,251 @@
+/*
+ * hvcserver.c
+ * Copyright (C) 2004 Ryan S Arnold, IBM Corporation
+ *
+ * PPC64 virtual I/O console server support.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <asm/hvcall.h>
+#include <asm/hvcserver.h>
+#include <asm/io.h>
+
+#define HVCS_ARCH_VERSION "1.0.0"
+
+MODULE_AUTHOR("Ryan S. Arnold <rsa@us.ibm.com>");
+MODULE_DESCRIPTION("IBM hvcs ppc64 API");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(HVCS_ARCH_VERSION);
+
+/*
+ * Map a hypervisor call status onto an errno.  Success becomes 0, bad
+ * parameters -EINVAL, hardware errors -EIO and every flavour of "busy"
+ * -EBUSY; anything else (H_Function included) is reported as -EPERM.
+ * The hvcs functions aren't performance sensitive, so the cost of this
+ * conversion isn't an issue.
+ */
+int hvcs_convert(long to_convert)
+{
+	int err;
+
+	switch (to_convert) {
+	case H_Success:
+		err = 0;
+		break;
+	case H_Parameter:
+		err = -EINVAL;
+		break;
+	case H_Hardware:
+		err = -EIO;
+		break;
+	case H_Busy:
+	case H_LongBusyOrder1msec:
+	case H_LongBusyOrder10msec:
+	case H_LongBusyOrder100msec:
+	case H_LongBusyOrder1sec:
+	case H_LongBusyOrder10sec:
+	case H_LongBusyOrder100sec:
+		err = -EBUSY;
+		break;
+	case H_Function: /* fall through */
+	default:
+		err = -EPERM;
+		break;
+	}
+
+	return err;
+}
+
+/**
+ * hvcs_free_partner_info - free pi allocated by hvcs_get_partner_info
+ * @head: list_head pointer for an allocated list of partner info structs to
+ *	free.
+ *
+ * Unlinks and frees every entry of the partner info list that was
+ * returned by hvcs_get_partner_info(), leaving @head empty.
+ *
+ * Returns 0, or -EINVAL if @head is NULL.
+ */
+int hvcs_free_partner_info(struct list_head *head)
+{
+	struct list_head *pos, *next;
+
+	if (head == NULL)
+		return -EINVAL;
+
+	list_for_each_safe(pos, next, head) {
+		list_del(pos);
+		kfree(list_entry(pos, struct hvcs_partner_info, node));
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(hvcs_free_partner_info);
+
+/*
+ * Helper function for hvcs_get_partner_info: issue one
+ * H_VTERM_PARTNER_INFO hcall, handing firmware the physical address of
+ * pi_buff, and translate the status into an errno via hvcs_convert().
+ */
+int hvcs_next_partner(uint32_t unit_address,
+		unsigned long last_p_partition_ID,
+		unsigned long last_p_unit_address, unsigned long *pi_buff)
+
+{
+	return hvcs_convert(plpar_hcall_norets(H_VTERM_PARTNER_INFO,
+					       unit_address,
+					       last_p_partition_ID,
+					       last_p_unit_address,
+					       virt_to_phys(pi_buff)));
+}
+
+/**
+ * hvcs_get_partner_info - Get all of the partner info for a vty-server adapter
+ * @unit_address: The unit_address of the vty-server adapter for which this
+ *	function is fetching partner info.
+ * @head: A list_head pointer used to return the list of partner info fetched
+ *	from the hypervisor to the caller.  It is (re)initialized here.
+ * @pi_buff: A page sized buffer pre-allocated prior to calling this function
+ *	that is to be used by firmware as an iterator to keep track of the
+ *	partner info retrieval.
+ *
+ * This function returns 0 on success, or if there is no partner info.  It
+ * also returns 0 when a later hcall fails after at least one partner has
+ * already been collected.  Otherwise it returns -EINVAL for NULL arguments,
+ * -ENOMEM on allocation failure (the partial list is freed), or the errno
+ * produced by hvcs_next_partner().
+ *
+ * The pi_buff is pre-allocated prior to calling this function because this
+ * function may be called with a spin_lock held and kmalloc of a page is not
+ * recommended as GFP_ATOMIC.
+ *
+ * After each hcall the code reads pi_buff[0] as the partner partition ID and
+ * pi_buff[1] as the partner unit address; starting at pi_buff[2] is the
+ * Converged Location Code string (diff size than the unsigned longs, hence
+ * the casting mumbo jumbo you see later).
+ *
+ * Invocation of this function should always be followed by an invocation of
+ * hvcs_free_partner_info() using a pointer to the SAME list head instance
+ * that was passed as a parameter to this function.
+ */
+int hvcs_get_partner_info(uint32_t unit_address, struct list_head *head,
+		unsigned long *pi_buff)
+{
+	/*
+	 * Dealt with as longs because of the hcall interface even though the
+	 * values are uint32_t.
+	 */
+	unsigned long last_p_partition_ID;
+	unsigned long last_p_unit_address;
+	struct hvcs_partner_info *next_partner_info;
+	int retval;
+
+	/* invalid parameters -- must be checked before pi_buff is touched */
+	if (!head || !pi_buff)
+		return -EINVAL;
+
+	memset(pi_buff, 0x00, PAGE_SIZE);
+
+	/* ~0UL/~0UL asks firmware to start from the first partner. */
+	last_p_partition_ID = last_p_unit_address = ~0UL;
+	INIT_LIST_HEAD(head);
+
+	for (;;) {
+		retval = hvcs_next_partner(unit_address, last_p_partition_ID,
+				last_p_unit_address, pi_buff);
+		if (retval) {
+			/*
+			 * Don't indicate that we've failed if we have
+			 * any list elements.
+			 */
+			if (!list_empty(head))
+				return 0;
+			return retval;
+		}
+
+		last_p_partition_ID = pi_buff[0];
+		last_p_unit_address = pi_buff[1];
+
+		/* This indicates that there are no further partners */
+		if (last_p_partition_ID == ~0UL
+				&& last_p_unit_address == ~0UL)
+			break;
+
+		/* This is a very small struct and will be freed soon in
+		 * hvcs_free_partner_info(). */
+		next_partner_info = kmalloc(sizeof(struct hvcs_partner_info),
+				GFP_ATOMIC);
+
+		if (!next_partner_info) {
+			printk(KERN_WARNING "HVCONSOLE: kmalloc() failed to"
+				" allocate partner info struct.\n");
+			hvcs_free_partner_info(head);
+			return -ENOMEM;
+		}
+
+		next_partner_info->unit_address
+			= (unsigned int)last_p_unit_address;
+		next_partner_info->partition_ID
+			= (unsigned int)last_p_partition_ID;
+
+		/*
+		 * Bounded, always NUL-terminated copy: the string comes from
+		 * firmware, so its length must not be trusted to fit.
+		 */
+		strlcpy(next_partner_info->location_code,
+			(char *)&pi_buff[2],
+			sizeof(next_partner_info->location_code));
+
+		list_add_tail(&(next_partner_info->node), head);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(hvcs_get_partner_info);
+
+/**
+ * hvcs_register_connection - establish a connection between this vty-server and
+ *	a vty.
+ * @unit_address: The unit address of the vty-server adapter that is to
+ *	establish a connection.
+ * @p_partition_ID: The partition ID of the vty adapter that is to be connected.
+ * @p_unit_address: The unit address of the vty adapter to which the vty-server
+ *	is to be connected.
+ *
+ * Returns the H_REGISTER_VTERM status translated by hvcs_convert().
+ *
+ * If this function is called once and -EINVAL is returned it may
+ * indicate that the partner info needs to be refreshed for the
+ * target unit address at which point the caller must invoke
+ * hvcs_get_partner_info() and then call this function again.  If,
+ * for a second time, -EINVAL is returned then it indicates that
+ * there is probably already a partner connection registered to a
+ * different vty-server adapter.  It is also possible that a second
+ * -EINVAL may indicate that one of the parms is not valid, for
+ * instance if the link was removed between the vty-server adapter
+ * and the vty adapter that you are trying to open.  Don't shoot the
+ * messenger.  Firmware implemented it this way.
+ */
+int hvcs_register_connection( uint32_t unit_address,
+		uint32_t p_partition_ID, uint32_t p_unit_address)
+{
+	return hvcs_convert(plpar_hcall_norets(H_REGISTER_VTERM,
+					       unit_address,
+					       p_partition_ID,
+					       p_unit_address));
+}
+EXPORT_SYMBOL(hvcs_register_connection);
+
+/**
+ * hvcs_free_connection - free the connection between a vty-server and vty
+ * @unit_address: The unit address of the vty-server that is to have its
+ *	connection severed.
+ *
+ * Issues H_FREE_VTERM to sever the partner connection between a
+ * vty-server adapter and a vty adapter, and returns the status
+ * translated by hvcs_convert().
+ *
+ * If -EBUSY is returned continue to call this function until 0 is returned.
+ */
+int hvcs_free_connection(uint32_t unit_address)
+{
+	return hvcs_convert(plpar_hcall_norets(H_FREE_VTERM, unit_address));
+}
+EXPORT_SYMBOL(hvcs_free_connection);
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
new file mode 100644
index 000000000000..c78f2b290a73
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -0,0 +1,606 @@
+/*
+ * arch/ppc64/kernel/pSeries_iommu.c
+ *
+ * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
+ *
+ * Rewrite, cleanup:
+ *
+ * Copyright (C) 2004 Olof Johansson <olof@lixom.net>, IBM Corporation
+ *
+ * Dynamic DMA mapping support, pSeries-specific parts, both SMP and LPAR.
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/iommu.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/abs_addr.h>
+#include <asm/pSeries_reconfig.h>
+#include <asm/firmware.h>
+#include <asm/tce.h>
+#include <asm/ppc-pci.h>
+#include <asm/udbg.h>
+
+#include "plpar_wrappers.h"
+
+#define DBG(fmt...)
+
+extern int is_python(struct device_node *);
+
+/*
+ * Fill TCE entries directly in the table at tbl->it_base.
+ *
+ * @index and @npages are scaled by TCE_PAGE_FACTOR before use; one entry
+ * is written per TCE_PAGE_SIZE of memory starting at @uaddr.  Every entry
+ * allows reads; PCI writes are allowed unless @direction is DMA_TO_DEVICE.
+ */
+static void tce_build_pSeries(struct iommu_table *tbl, long index,
+ long npages, unsigned long uaddr,
+ enum dma_data_direction direction)
+{
+ union tce_entry t;
+ union tce_entry *tp;
+
+ index <<= TCE_PAGE_FACTOR;
+ npages <<= TCE_PAGE_FACTOR;
+
+ t.te_word = 0;
+ t.te_rdwr = 1; // Read allowed
+
+ if (direction != DMA_TO_DEVICE)
+ t.te_pciwr = 1;
+
+ tp = ((union tce_entry *)tbl->it_base) + index;
+
+ while (npages--) {
+ /* can't move this out since we might cross LMB boundary */
+ t.te_rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
+
+ /* Publish the fully built entry in one word-sized store. */
+ tp->te_word = t.te_word;
+
+ uaddr += TCE_PAGE_SIZE;
+ tp++;
+ }
+}
+
+
+/*
+ * Clear TCE entries directly in the table at tbl->it_base: after scaling
+ * @index and @npages by TCE_PAGE_FACTOR, every covered entry is zeroed.
+ */
+static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages)
+{
+	union tce_entry *entry;
+	union tce_entry cleared;
+
+	cleared.te_word = 0;
+
+	index <<= TCE_PAGE_FACTOR;
+	npages <<= TCE_PAGE_FACTOR;
+
+	entry = (union tce_entry *)tbl->it_base;
+	entry += index;
+
+	while (npages--) {
+		entry->te_word = cleared.te_word;
+		entry++;
+	}
+}
+
+
+/*
+ * Map @npages TCEs starting at @tcenum by issuing one plpar_tce_put() per
+ * entry.  The real page number is derived once from @uaddr and then simply
+ * incremented for each following entry.  Failures are logged (rate
+ * limited) and the loop carries on with the next entry.
+ *
+ * NOTE(review): unlike tce_build_pSeries() and tce_free_pSeriesLP(), this
+ * function does not scale tcenum/npages by TCE_PAGE_FACTOR itself.  The
+ * calls made from tce_buildmulti_pSeriesLP() pass already-scaled values;
+ * confirm that no caller passes unscaled ones.
+ */
+static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
+ long npages, unsigned long uaddr,
+ enum dma_data_direction direction)
+{
+ u64 rc;
+ union tce_entry tce;
+
+ tce.te_word = 0;
+ tce.te_rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
+ tce.te_rdwr = 1;
+ if (direction != DMA_TO_DEVICE)
+ tce.te_pciwr = 1;
+
+ while (npages--) {
+ rc = plpar_tce_put((u64)tbl->it_index,
+ (u64)tcenum << 12,
+ tce.te_word );
+
+ if (rc && printk_ratelimit()) {
+ printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
+ printk("\tindex = 0x%lx\n", (u64)tbl->it_index);
+ printk("\ttcenum = 0x%lx\n", (u64)tcenum);
+ printk("\ttce val = 0x%lx\n", tce.te_word );
+ show_stack(current, (unsigned long *)__get_SP());
+ }
+
+ tcenum++;
+ tce.te_rpn++;
+ }
+}
+
+/* Per-cpu scratch page for tce_buildmulti_pSeriesLP(); allocated on first use and kept. */
+static DEFINE_PER_CPU(void *, tce_page) = NULL;
+
+/*
+ * Map @npages TCEs starting at @tcenum, batching them through
+ * plpar_tce_put_indirect(): the entries are staged in the per-cpu tce_page
+ * and handed to the hypervisor at most one pageful (4096 bytes) at a time.
+ *
+ * Falls back to the one-hcall-per-entry tce_build_pSeriesLP() when only a
+ * single entry is requested or when the scratch page cannot be allocated.
+ * Both fallbacks receive the already-scaled tcenum/npages.
+ *
+ * The loop stops at the first failing hcall; the failure is logged (rate
+ * limited) with npages as it stands after the failing batch was subtracted.
+ */
+static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
+ long npages, unsigned long uaddr,
+ enum dma_data_direction direction)
+{
+ u64 rc;
+ union tce_entry tce, *tcep;
+ long l, limit;
+
+ tcenum <<= TCE_PAGE_FACTOR;
+ npages <<= TCE_PAGE_FACTOR;
+
+ if (npages == 1)
+ return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
+ direction);
+
+ tcep = __get_cpu_var(tce_page);
+
+ /* This is safe to do since interrupts are off when we're called
+ * from iommu_alloc{,_sg}()
+ */
+ if (!tcep) {
+ tcep = (void *)__get_free_page(GFP_ATOMIC);
+ /* If allocation fails, fall back to the loop implementation */
+ if (!tcep)
+ return tce_build_pSeriesLP(tbl, tcenum, npages,
+ uaddr, direction);
+ __get_cpu_var(tce_page) = tcep;
+ }
+
+ tce.te_word = 0;
+ tce.te_rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
+ tce.te_rdwr = 1;
+ if (direction != DMA_TO_DEVICE)
+ tce.te_pciwr = 1;
+
+ /* We can map max one pageful of TCEs at a time */
+ do {
+ /*
+ * Set up the page with TCE data, looping through and setting
+ * the values.
+ */
+ limit = min_t(long, npages, 4096/sizeof(union tce_entry));
+
+ for (l = 0; l < limit; l++) {
+ tcep[l] = tce;
+ tce.te_rpn++;
+ }
+
+ rc = plpar_tce_put_indirect((u64)tbl->it_index,
+ (u64)tcenum << 12,
+ (u64)virt_to_abs(tcep),
+ limit);
+
+ npages -= limit;
+ tcenum += limit;
+ } while (npages > 0 && !rc);
+
+ if (rc && printk_ratelimit()) {
+ printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
+ printk("\tindex = 0x%lx\n", (u64)tbl->it_index);
+ printk("\tnpages = 0x%lx\n", (u64)npages);
+ printk("\ttce[0] val = 0x%lx\n", tcep[0].te_word);
+ show_stack(current, (unsigned long *)__get_SP());
+ }
+}
+
+/*
+ * Unmap @npages TCEs starting at @tcenum (both scaled by TCE_PAGE_FACTOR
+ * here) by writing a zero entry with one plpar_tce_put() per TCE.
+ * Failures are logged (rate limited) and the loop continues.
+ */
+static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
+{
+ u64 rc;
+ union tce_entry tce;
+
+ tcenum <<= TCE_PAGE_FACTOR;
+ npages <<= TCE_PAGE_FACTOR;
+
+ tce.te_word = 0;
+
+ while (npages--) {
+ rc = plpar_tce_put((u64)tbl->it_index,
+ (u64)tcenum << 12,
+ tce.te_word);
+
+ if (rc && printk_ratelimit()) {
+ printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
+ printk("\tindex = 0x%lx\n", (u64)tbl->it_index);
+ printk("\ttcenum = 0x%lx\n", (u64)tcenum);
+ printk("\ttce val = 0x%lx\n", tce.te_word );
+ show_stack(current, (unsigned long *)__get_SP());
+ }
+
+ tcenum++;
+ }
+}
+
+
+/*
+ * Unmap @npages TCEs starting at @tcenum (both scaled by TCE_PAGE_FACTOR
+ * here) with a single plpar_tce_stuff() call that writes a zero entry
+ * across the whole range.  A failure is logged (rate limited).
+ */
+static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
+{
+ u64 rc;
+ union tce_entry tce;
+
+ tcenum <<= TCE_PAGE_FACTOR;
+ npages <<= TCE_PAGE_FACTOR;
+
+ tce.te_word = 0;
+
+ rc = plpar_tce_stuff((u64)tbl->it_index,
+ (u64)tcenum << 12,
+ tce.te_word,
+ npages);
+
+ if (rc && printk_ratelimit()) {
+ printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n");
+ printk("\trc = %ld\n", rc);
+ printk("\tindex = 0x%lx\n", (u64)tbl->it_index);
+ printk("\tnpages = 0x%lx\n", (u64)npages);
+ printk("\ttce val = 0x%lx\n", tce.te_word );
+ show_stack(current, (unsigned long *)__get_SP());
+ }
+}
+
+/*
+ * Fill in an iommu_table for a non-LPAR pSeries PHB.
+ *
+ * The table location and size come from the "linux,tce-base" and
+ * "linux,tce-size" properties of the node stored in phb->arch_data; the
+ * table memory is zeroed.  The DMA window is carved out of the PHB's
+ * running allocation: it starts at phb->dma_window_base_cur and spans
+ * phb->dma_window_size, after which dma_window_base_cur is advanced past it.
+ *
+ * @phb: controller whose window bookkeeping is read and updated
+ * @dn:  node used only to name the device in the error message
+ * @tbl: table to fill in
+ *
+ * NOTE(review): when either property is missing this returns after logging
+ * and leaves *tbl untouched; the function is void, so callers cannot tell.
+ */
+static void iommu_table_setparms(struct pci_controller *phb,
+ struct device_node *dn,
+ struct iommu_table *tbl)
+{
+ struct device_node *node;
+ unsigned long *basep;
+ unsigned int *sizep;
+
+ node = (struct device_node *)phb->arch_data;
+
+ basep = (unsigned long *)get_property(node, "linux,tce-base", NULL);
+ sizep = (unsigned int *)get_property(node, "linux,tce-size", NULL);
+ if (basep == NULL || sizep == NULL) {
+ printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %s has "
+ "missing tce entries !\n", dn->full_name);
+ return;
+ }
+
+ /* *basep is passed through __va() to get a kernel address; clear the table */
+ tbl->it_base = (unsigned long)__va(*basep);
+ memset((void *)tbl->it_base, 0, *sizep);
+
+ tbl->it_busno = phb->bus->number;
+
+ /* Units of tce entries */
+ tbl->it_offset = phb->dma_window_base_cur >> PAGE_SHIFT;
+
+ /* Test if we are going over 2GB of DMA space */
+ if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) {
+ udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
+ panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
+ }
+
+ /* Must happen after it_offset was taken from the old base above */
+ phb->dma_window_base_cur += phb->dma_window_size;
+
+ /* Set the tce table size - measured in entries */
+ tbl->it_size = phb->dma_window_size >> PAGE_SHIFT;
+
+ tbl->it_index = 0;
+ tbl->it_blocksize = 16;
+ tbl->it_type = TCE_PCI;
+}
+
+/*
+ * iommu_table_setparms_lpar
+ *
+ * Fill in an iommu_table for a pSeries LPAR bus from the words of its
+ * "ibm,dma-window" property.
+ *
+ * The words are consumed at fixed positions: word 0 becomes it_index,
+ * words 2-3 form the 64-bit window start and words 4-5 the 64-bit window
+ * size; both are converted to page units with PAGE_SHIFT.  Word 1 is not
+ * read.
+ *
+ * ToDo: properly interpret the property instead of hard coding the field
+ * positions.  The definition is:
+ *	logical-bus-number	(1 word)
+ *	phys-address		(#address-cells words)
+ *	size			(#cell-size words)
+ */
+static void iommu_table_setparms_lpar(struct pci_controller *phb,
+				      struct device_node *dn,
+				      struct iommu_table *tbl,
+				      unsigned int *dma_window)
+{
+	unsigned long win_start, win_bytes;
+
+	/* TODO: Parse field size properties properly. */
+	win_start = ((unsigned long)dma_window[2] << 32) |
+		    (unsigned long)dma_window[3];
+	win_bytes = ((unsigned long)dma_window[4] << 32) |
+		    (unsigned long)dma_window[5];
+
+	tbl->it_busno = PCI_DN(dn)->bussubno;
+	tbl->it_index = dma_window[0];
+	tbl->it_base = 0;
+	tbl->it_offset = win_start >> PAGE_SHIFT;
+	tbl->it_size = win_bytes >> PAGE_SHIFT;
+	tbl->it_blocksize = 16;
+	tbl->it_type = TCE_PCI;
+}
+
+/*
+ * Per-bus IOMMU setup for non-LPAR pSeries.
+ *
+ * Only root buses are handled here; buses behind a bridge are set up from
+ * iommu_dev_setup_pSeries().  For a root bus this decides how large each
+ * child's DMA window will be (stored in phb->dma_window_size) and where the
+ * first window starts (phb->dma_window_base_cur).  When the ISA node lives
+ * under this PHB, a 128MB table starting at 128MB is additionally created
+ * for the PHB itself.
+ */
+static void iommu_bus_setup_pSeries(struct pci_bus *bus)
+{
+	struct device_node *dn;
+	struct iommu_table *tbl;
+	struct device_node *isa_dn, *isa_dn_orig;
+	struct device_node *tmp;
+	struct pci_dn *pci;
+	int children;
+
+	DBG("iommu_bus_setup_pSeries, bus %p, bus->self %p\n", bus, bus->self);
+
+	dn = pci_bus_to_OF_node(bus);
+	pci = PCI_DN(dn);
+
+	if (bus->self) {
+		/* This is not a root bus, any setup will be done for the
+		 * device-side of the bridge in iommu_dev_setup_pSeries().
+		 */
+		return;
+	}
+
+	/* Check if the ISA bus on the system is under
+	 * this PHB.
+	 */
+	isa_dn = isa_dn_orig = of_find_node_by_type(NULL, "isa");
+
+	/* Walk up from the ISA node; isa_dn ends up == dn if it is below us,
+	 * NULL otherwise.
+	 */
+	while (isa_dn && isa_dn != dn)
+		isa_dn = isa_dn->parent;
+
+	if (isa_dn_orig)
+		of_node_put(isa_dn_orig);
+
+	/* Count number of direct PCI children of the PHB.
+	 * All PCI device nodes have class-code property, so it's
+	 * an easy way to find them.
+	 */
+	for (children = 0, tmp = dn->child; tmp; tmp = tmp->sibling)
+		if (get_property(tmp, "class-code", NULL))
+			children++;
+
+	DBG("Children: %d\n", children);
+
+	/* Calculate amount of DMA window per slot. Each window must be
+	 * a power of two (due to pci_alloc_consistent requirements).
+	 *
+	 * Keep 256MB aside for PHBs with ISA.
+	 */
+
+	if (!isa_dn) {
+		/* No ISA/IDE - just set window size and return */
+		pci->phb->dma_window_size = 0x80000000ul; /* To be divided */
+
+		while (pci->phb->dma_window_size * children > 0x80000000ul)
+			pci->phb->dma_window_size >>= 1;
+		DBG("No ISA/IDE, window size is 0x%lx\n",
+		    pci->phb->dma_window_size);
+		pci->phb->dma_window_base_cur = 0;
+
+		return;
+	}
+
+	/* If we have ISA, then we probably have an IDE
+	 * controller too. Allocate a 128MB table but
+	 * skip the first 128MB to avoid stepping on ISA
+	 * space.
+	 */
+	pci->phb->dma_window_size = 0x8000000ul;
+	pci->phb->dma_window_base_cur = 0x8000000ul;
+
+	tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
+	if (tbl != NULL) {
+		iommu_table_setparms(pci->phb, dn, tbl);
+		pci->iommu_table = iommu_init_table(tbl);
+	} else {
+		/* Do not hand a NULL table to the helpers.  The PHB simply
+		 * gets no table of its own; the children's windows are still
+		 * sized below and will start right after the skipped 128MB.
+		 */
+		printk(KERN_ERR "PCI_DMA: iommu_bus_setup_pSeries: out of "
+		       "memory allocating iommu table for %s\n",
+		       dn->full_name);
+	}
+
+	/* Divide the rest (1.75GB) among the children */
+	pci->phb->dma_window_size = 0x80000000ul;
+	while (pci->phb->dma_window_size * children > 0x70000000ul)
+		pci->phb->dma_window_size >>= 1;
+
+	DBG("ISA/IDE, window size is 0x%lx\n", pci->phb->dma_window_size);
+
+}
+
+
+/*
+ * Per-bus IOMMU setup for pSeries LPAR.
+ *
+ * Finds the closest node at or above the bus that carries an
+ * "ibm,dma-window" property, creates that node's iommu table if it does
+ * not have one yet, and lets the bus node share the same table.
+ */
+static void iommu_bus_setup_pSeriesLP(struct pci_bus *bus)
+{
+	struct iommu_table *tbl;
+	struct device_node *dn, *pdn;
+	struct pci_dn *ppci;
+	unsigned int *dma_window = NULL;
+
+	DBG("iommu_bus_setup_pSeriesLP, bus %p, bus->self %p\n", bus, bus->self);
+
+	dn = pci_bus_to_OF_node(bus);
+
+	/* Find nearest ibm,dma-window, walking up the device tree */
+	for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
+		dma_window = (unsigned int *)get_property(pdn, "ibm,dma-window", NULL);
+		if (dma_window != NULL)
+			break;
+	}
+
+	if (dma_window == NULL) {
+		DBG("iommu_bus_setup_pSeriesLP: bus %s seems to have no ibm,dma-window property\n", dn->full_name);
+		return;
+	}
+
+	ppci = pdn->data;
+	if (!ppci->iommu_table) {
+		/* Bussubno hasn't been copied yet.
+		 * Do it now because iommu_table_setparms_lpar needs it.
+		 */
+
+		ppci->bussubno = bus->number;
+
+		tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
+		if (tbl == NULL) {
+			/* setparms would write through the NULL pointer */
+			printk(KERN_ERR "PCI_DMA: iommu_bus_setup_pSeriesLP: "
+			       "out of memory allocating iommu table for %s\n",
+			       pdn->full_name);
+			return;
+		}
+
+		iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
+
+		ppci->iommu_table = iommu_init_table(tbl);
+	}
+
+	/* The window belongs to an ancestor: share its table with this bus */
+	if (pdn != dn)
+		PCI_DN(dn)->iommu_table = ppci->iommu_table;
+}
+
+
+/*
+ * Per-device IOMMU setup for non-LPAR pSeries.
+ *
+ * A device sitting directly on a root bus gets a freshly allocated table
+ * (window parameters were prepared by the bus setup code).  Any other
+ * device inherits the first table found while walking up the device tree.
+ */
+static void iommu_dev_setup_pSeries(struct pci_dev *dev)
+{
+	struct device_node *dn, *mydn;
+	struct iommu_table *tbl;
+
+	DBG("iommu_dev_setup_pSeries, dev %p (%s)\n", dev, pci_name(dev));
+
+	mydn = dn = pci_device_to_OF_node(dev);
+
+	/* If we're the direct child of a root bus, then we need to allocate
+	 * an iommu table ourselves. The bus setup code should have setup
+	 * the window sizes already.
+	 */
+	if (!dev->bus->self) {
+		DBG(" --> first child, no bridge. Allocating iommu table.\n");
+		tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
+		if (tbl == NULL) {
+			/* setparms would write through the NULL pointer */
+			printk(KERN_ERR "PCI_DMA: iommu_dev_setup_pSeries: "
+			       "out of memory allocating iommu table for %s\n",
+			       pci_name(dev));
+			return;
+		}
+		iommu_table_setparms(PCI_DN(dn)->phb, dn, tbl);
+		PCI_DN(mydn)->iommu_table = iommu_init_table(tbl);
+
+		return;
+	}
+
+	/* If this device is further down the bus tree, search upwards until
+	 * an already allocated iommu table is found and use that.
+	 */
+
+	while (dn && dn->data && PCI_DN(dn)->iommu_table == NULL)
+		dn = dn->parent;
+
+	if (dn && dn->data) {
+		PCI_DN(mydn)->iommu_table = PCI_DN(dn)->iommu_table;
+	} else {
+		DBG("iommu_dev_setup_pSeries, dev %p (%s) has no iommu table\n", dev, pci_name(dev));
+	}
+}
+
+/*
+ * Reconfiguration notifier callback: when a device node is being removed
+ * and it both owns an iommu table and carries "ibm,dma-window", free the
+ * table.
+ *
+ * Returns NOTIFY_OK for PSERIES_RECONFIG_REMOVE, NOTIFY_DONE for every
+ * other action.
+ */
+static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node)
+{
+	struct device_node *np = node;
+	struct pci_dn *pci = np->data;
+
+	if (action != PSERIES_RECONFIG_REMOVE)
+		return NOTIFY_DONE;
+
+	if (pci && pci->iommu_table &&
+	    get_property(np, "ibm,dma-window", NULL))
+		iommu_free_table(np);
+
+	return NOTIFY_OK;
+}
+
+/* Notifier block handed to pSeries_reconfig_notifier_register() below. */
+static struct notifier_block iommu_reconfig_nb = {
+ .notifier_call = iommu_reconfig_notifier,
+};
+
+/*
+ * Per-device IOMMU setup for pSeries LPAR.
+ *
+ * Walks from the device's node towards the root until it meets either a
+ * node with an "ibm,dma-window" property or a node that already owns an
+ * iommu table.  In the first case a table is created for that node (unless
+ * it has one); in both cases the device ends up sharing the table of the
+ * node where the walk stopped.
+ */
+static void iommu_dev_setup_pSeriesLP(struct pci_dev *dev)
+{
+	struct device_node *pdn, *dn;
+	struct iommu_table *tbl;
+	unsigned int *dma_window = NULL;
+	struct pci_dn *pci;
+
+	DBG("iommu_dev_setup_pSeriesLP, dev %p (%s)\n", dev, pci_name(dev));
+
+	/* dev setup for LPAR is a little tricky, since the device tree might
+	 * contain the dma-window properties per-device and not necessarily
+	 * for the bus. So we need to search upwards in the tree until we
+	 * either hit a dma-window property, OR find a parent with a table
+	 * already allocated.
+	 */
+	dn = pci_device_to_OF_node(dev);
+
+	for (pdn = dn; pdn && pdn->data && !PCI_DN(pdn)->iommu_table;
+	     pdn = pdn->parent) {
+		dma_window = (unsigned int *)
+			get_property(pdn, "ibm,dma-window", NULL);
+		if (dma_window)
+			break;
+	}
+
+	/* The walk can also end by running off the top of the tree or on a
+	 * node without pci data.  There is then neither a window nor a table
+	 * to use, and pdn / pdn->data must not be dereferenced.
+	 */
+	if (pdn == NULL || pdn->data == NULL) {
+		printk(KERN_WARNING "iommu_dev_setup_pSeriesLP: no DMA window "
+		       "or iommu table found for %s\n", pci_name(dev));
+		return;
+	}
+
+	/* Check for parent == NULL so we don't try to setup the empty EADS
+	 * slots on POWER4 machines.
+	 */
+	if (dma_window == NULL || pdn->parent == NULL) {
+		DBG("No dma window for device, linking to parent\n");
+		PCI_DN(dn)->iommu_table = PCI_DN(pdn)->iommu_table;
+		return;
+	}
+
+	DBG("Found DMA window, allocating table\n");
+
+	pci = pdn->data;
+	if (!pci->iommu_table) {
+		/* iommu_table_setparms_lpar needs bussubno. */
+		pci->bussubno = pci->phb->bus->number;
+
+		tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
+		if (tbl == NULL) {
+			/* setparms would write through the NULL pointer */
+			printk(KERN_ERR "PCI_DMA: iommu_dev_setup_pSeriesLP: "
+			       "out of memory allocating iommu table for %s\n",
+			       pdn->full_name);
+			return;
+		}
+
+		iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
+
+		pci->iommu_table = iommu_init_table(tbl);
+	}
+
+	/* The window belongs to an ancestor: share its table */
+	if (pdn != dn)
+		PCI_DN(dn)->iommu_table = pci->iommu_table;
+}
+
+/* No-op bus/device setup hooks, installed when "linux,iommu-off" is set. */
+static void iommu_bus_setup_null(struct pci_bus *b) { }
+static void iommu_dev_setup_null(struct pci_dev *d) { }
+
+/*
+ * Called very early: install the ppc_md IOMMU hooks for this platform.
+ *
+ * With "linux,iommu-off" in /chosen the no-op setup hooks are installed and
+ * direct PCI DMA is initialised.  Otherwise the TCE build/free and bus/dev
+ * setup hooks are chosen (LPAR with or without multi-TCE firmware support,
+ * or non-LPAR), the reconfig notifier is registered and pci_iommu_init()
+ * is called.
+ */
+void iommu_init_early_pSeries(void)
+{
+	if (of_chosen && get_property(of_chosen, "linux,iommu-off", NULL)) {
+		/* Direct I/O, IOMMU off */
+		ppc_md.iommu_dev_setup = iommu_dev_setup_null;
+		ppc_md.iommu_bus_setup = iommu_bus_setup_null;
+		pci_direct_iommu_init();
+
+		return;
+	}
+
+	if (!platform_is_lpar()) {
+		ppc_md.tce_build = tce_build_pSeries;
+		ppc_md.tce_free = tce_free_pSeries;
+		ppc_md.iommu_bus_setup = iommu_bus_setup_pSeries;
+		ppc_md.iommu_dev_setup = iommu_dev_setup_pSeries;
+	} else {
+		/* Prefer the batched hypervisor calls when firmware has them */
+		if (firmware_has_feature(FW_FEATURE_MULTITCE)) {
+			ppc_md.tce_build = tce_buildmulti_pSeriesLP;
+			ppc_md.tce_free = tce_freemulti_pSeriesLP;
+		} else {
+			ppc_md.tce_build = tce_build_pSeriesLP;
+			ppc_md.tce_free = tce_free_pSeriesLP;
+		}
+		ppc_md.iommu_bus_setup = iommu_bus_setup_pSeriesLP;
+		ppc_md.iommu_dev_setup = iommu_dev_setup_pSeriesLP;
+	}
+
+	pSeries_reconfig_notifier_register(&iommu_reconfig_nb);
+
+	pci_iommu_init();
+}
+
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
new file mode 100644
index 000000000000..a50e5f3f396d
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -0,0 +1,546 @@
+/*
+ * lpar.c (formerly pSeries_lpar.c)
+ * Copyright (C) 2001 Todd Inglett, IBM Corporation
+ *
+ * pSeries LPAR support.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#undef DEBUG_LOW
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/dma-mapping.h>
+#include <asm/processor.h>
+#include <asm/mmu.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/machdep.h>
+#include <asm/abs_addr.h>
+#include <asm/mmu_context.h>
+#include <asm/iommu.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/prom.h>
+#include <asm/abs_addr.h>
+#include <asm/cputable.h>
+#include <asm/udbg.h>
+#include <asm/smp.h>
+
+#include "plpar_wrappers.h"
+
+#ifdef DEBUG_LOW
+#define DBG_LOW(fmt...) do { udbg_printf(fmt); } while(0)
+#else
+#define DBG_LOW(fmt...) do { } while(0)
+#endif
+
+/* in hvCall.S */
+EXPORT_SYMBOL(plpar_hcall);
+EXPORT_SYMBOL(plpar_hcall_4out);
+EXPORT_SYMBOL(plpar_hcall_norets);
+EXPORT_SYMBOL(plpar_hcall_8arg_2ret);
+
+extern void pSeries_find_serial_port(void);
+
+
+int vtermno; /* virtual terminal# for udbg */
+
+#define __ALIGNED__ __attribute__((__aligned__(sizeof(long))))
+
+/*
+ * Send one byte to terminal vtermno wrapped in a 5-byte packet, retrying
+ * for as long as the hypervisor answers H_Busy.
+ */
+static void udbg_hvsi_send_byte(unsigned char c)
+{
+	/* packet's seqno isn't used anyways */
+	uint8_t packet[] __ALIGNED__ = { 0xff, 5, 0, 0, c };
+	int status;
+
+	for (;;) {
+		status = plpar_put_term_char(vtermno, sizeof(packet), packet);
+		if (status != H_Busy)
+			break;
+	}
+}
+
+/* udbg putc hook for the HVSI console; '\n' is sent as "\r\n". */
+static void udbg_hvsi_putc(unsigned char c)
+{
+	if (c == '\n')
+		udbg_hvsi_send_byte('\r');
+
+	udbg_hvsi_send_byte(c);
+}
+
+/* Bytes received from the HVSI console but not yet handed out */
+static long hvsi_udbg_buf_len;
+static uint8_t hvsi_udbg_buf[256];
+
+/*
+ * Non-blocking read of one character from the HVSI console.
+ *
+ * When the local buffer is empty a new packet is fetched; packets whose
+ * first byte is not 0xff, and failed reads, are discarded.  The first four
+ * bytes of an accepted packet are stripped as header.
+ *
+ * Returns the next buffered character, or -1 when none is available.
+ */
+static int udbg_hvsi_getc_poll(void)
+{
+	unsigned char first;
+	int status, pos;
+
+	if (hvsi_udbg_buf_len == 0) {
+		status = plpar_get_term_char(vtermno, &hvsi_udbg_buf_len,
+					     hvsi_udbg_buf);
+		if (status == H_Success && hvsi_udbg_buf[0] == 0xff) {
+			/* remove the packet header */
+			for (pos = 4; pos < hvsi_udbg_buf_len; pos++)
+				hvsi_udbg_buf[pos - 4] = hvsi_udbg_buf[pos];
+			hvsi_udbg_buf_len -= 4;
+		} else {
+			/* bad read or non-data packet */
+			hvsi_udbg_buf_len = 0;
+		}
+	}
+
+	if (hvsi_udbg_buf_len <= 0 || hvsi_udbg_buf_len > 256) {
+		/* no data ready */
+		hvsi_udbg_buf_len = 0;
+		return -1;
+	}
+
+	first = hvsi_udbg_buf[0];
+
+	/* shift remaining data down */
+	pos = 1;
+	while (pos < hvsi_udbg_buf_len) {
+		hvsi_udbg_buf[pos - 1] = hvsi_udbg_buf[pos];
+		pos++;
+	}
+	hvsi_udbg_buf_len--;
+
+	return first;
+}
+
+/*
+ * Blocking read of one character from the HVSI console: polls
+ * udbg_hvsi_getc_poll() and busy-waits between unsuccessful polls.
+ */
+static unsigned char udbg_hvsi_getc(void)
+{
+	int ch;
+
+	while ((ch = udbg_hvsi_getc_poll()) == -1) {
+		/* This shouldn't be needed...but... */
+		volatile unsigned long delay;
+
+		for (delay=0; delay < 2000000; delay++)
+			;
+	}
+
+	return ch;
+}
+
+/*
+ * udbg putc hook for the plain vterm console.  Writes one character to
+ * terminal vtermno, retrying while the hypervisor answers H_Busy; '\n' is
+ * preceded by '\r'.
+ */
+static void udbg_putcLP(unsigned char c)
+{
+	char out[16];
+	unsigned long status;
+
+	if (c == '\n')
+		udbg_putcLP('\r');
+
+	/* Only the first byte of the buffer is ever sent */
+	out[0] = c;
+	for (;;) {
+		status = plpar_put_term_char(vtermno, 1, out);
+		if (status != H_Busy)
+			break;
+	}
+}
+
+/* Buffered chars getc */
+static long inbuflen;
+static long inbuf[2]; /* must be 2 longs */
+
+/*
+ * Non-blocking read of one character from the vterm console.
+ *
+ * The interface is tricky because one read may return up to 16 chars;
+ * they are kept in inbuf and handed out one per call.
+ *
+ * Returns the next character, or -1 when nothing is available.
+ */
+static int udbg_getc_pollLP(void)
+{
+	char *bytes = (char *)inbuf;
+	char next;
+	long status;
+	int pos;
+
+	if (inbuflen == 0) {
+		/* get some more chars. */
+		status = plpar_get_term_char(vtermno, &inbuflen, bytes);
+		if (status != H_Success)
+			inbuflen = 0; /* otherwise inbuflen is garbage */
+	}
+
+	if (inbuflen <= 0 || inbuflen > 16) {
+		/* Catch error case as well as other oddities (corruption) */
+		inbuflen = 0;
+		return -1;
+	}
+
+	next = bytes[0];
+
+	/* shuffle them down. */
+	pos = 1;
+	while (pos < inbuflen) {
+		bytes[pos - 1] = bytes[pos];
+		pos++;
+	}
+	inbuflen--;
+
+	return next;
+}
+
+/*
+ * Blocking read of one character from the vterm console: polls
+ * udbg_getc_pollLP() and busy-waits between unsuccessful polls.
+ */
+static unsigned char udbg_getcLP(void)
+{
+	int ch;
+
+	while ((ch = udbg_getc_pollLP()) == -1) {
+		/* This shouldn't be needed...but... */
+		volatile unsigned long delay;
+
+		for (delay=0; delay < 2000000; delay++)
+			;
+	}
+
+	return ch;
+}
+
+/*
+ * Point the udbg hooks at the vterm console on terminal 0.
+ *
+ * call this from early_init() for a working debug console on
+ * vterm capable LPAR machines
+ */
+void udbg_init_debug_lpar(void)
+{
+	udbg_getc_poll = udbg_getc_pollLP;
+	udbg_getc = udbg_getcLP;
+	udbg_putc = udbg_putcLP;
+	vtermno = 0;
+}
+
+/*
+ * Select the udbg console hooks from the device named by
+ * /chosen "linux,stdout-path".
+ *
+ * A node whose name starts with "vty" is accepted when it is compatible
+ * with "hvterm1" (plain vterm hooks) or "hvterm-protocol" (HVSI hooks); its
+ * first "reg" word becomes vtermno.  A "serial" node, or anything else,
+ * only produces a warning.
+ *
+ * returns 0 if couldn't find or use /chosen/stdout as console, 1 otherwise
+ */
+int find_udbg_vterm(void)
+{
+	struct device_node *stdout_node;
+	u32 *termno;
+	char *name;
+	int found = 0;
+
+	/* find the boot console from /chosen/stdout */
+	if (!of_chosen)
+		return 0;
+	name = (char *)get_property(of_chosen, "linux,stdout-path", NULL);
+	if (name == NULL)
+		return 0;
+	stdout_node = of_find_node_by_path(name);
+	if (!stdout_node)
+		return 0;
+
+	/* now we have the stdout node; figure out what type of device it is. */
+	name = (char *)get_property(stdout_node, "name", NULL);
+	if (!name) {
+		printk(KERN_WARNING "stdout node missing 'name' property!\n");
+		goto out;
+	}
+
+	if (strncmp(name, "vty", 3) == 0) {
+		if (device_is_compatible(stdout_node, "hvterm1")) {
+			termno = (u32 *)get_property(stdout_node, "reg", NULL);
+			if (termno) {
+				vtermno = termno[0];
+				udbg_putc = udbg_putcLP;
+				udbg_getc = udbg_getcLP;
+				udbg_getc_poll = udbg_getc_pollLP;
+				found = 1;
+			}
+		} else if (device_is_compatible(stdout_node, "hvterm-protocol")) {
+			termno = (u32 *)get_property(stdout_node, "reg", NULL);
+			if (termno) {
+				vtermno = termno[0];
+				udbg_putc = udbg_hvsi_putc;
+				udbg_getc = udbg_hvsi_getc;
+				udbg_getc_poll = udbg_hvsi_getc_poll;
+				found = 1;
+			}
+		}
+	} else if (strncmp(name, "serial", 6) == 0) {
+		/* strncmp() returns 0 on a match: this branch is for nodes
+		 * that really are named "serial...".
+		 */
+		/* XXX fix ISA serial console */
+		printk(KERN_WARNING "serial stdout on LPAR ('%s')! "
+		       "can't print udbg messages\n",
+		       stdout_node->full_name);
+	} else {
+		printk(KERN_WARNING "don't know how to print to stdout '%s'\n",
+		       stdout_node->full_name);
+	}
+
+out:
+	of_node_put(stdout_node);
+	return found;
+}
+
+/*
+ * Register the lppaca of logical cpu @cpu as its VPA with the hypervisor.
+ * vmxregs_in_use is set beforehand when the CPU has AltiVec.  A failed
+ * registration is only logged.
+ */
+void vpa_init(int cpu)
+{
+	int hwcpu = get_hard_smp_processor_id(cpu);
+	unsigned long vpa = __pa(&paca[cpu].lppaca);
+	long status;
+
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		paca[cpu].lppaca.vmxregs_in_use = 1;
+
+	status = register_vpa(hwcpu, vpa);
+	if (!status)
+		return;
+
+	printk(KERN_ERR "WARNING: vpa_init: VPA registration for "
+	       "cpu %d (hw %d) of area %lx returns %ld\n",
+	       cpu, hwcpu, vpa, status);
+}
+
+/*
+ * Insert a hash page table entry into group @hpte_group via H_ENTER.
+ *
+ * The two HPTE words are built from @va/@vflags and @pa/@rflags with
+ * hpte_encode_v()/hpte_encode_r(); HPTE_V_VALID is always set.
+ *
+ * Returns -1 when the hypervisor reports H_PTEG_Full, -2 on any other
+ * failure, otherwise the low three bits of the slot returned by the
+ * hypervisor with bit 3 set when @vflags contains HPTE_V_SECONDARY.
+ * Debug output is suppressed for bolted entries.
+ */
+long pSeries_lpar_hpte_insert(unsigned long hpte_group,
+ unsigned long va, unsigned long pa,
+ unsigned long rflags, unsigned long vflags,
+ int psize)
+{
+ unsigned long lpar_rc;
+ unsigned long flags;
+ unsigned long slot;
+ unsigned long hpte_v, hpte_r;
+ unsigned long dummy0, dummy1;
+
+ if (!(vflags & HPTE_V_BOLTED))
+ DBG_LOW("hpte_insert(group=%lx, va=%016lx, pa=%016lx, "
+ "rflags=%lx, vflags=%lx, psize=%d)\n",
+ hpte_group, va, pa, rflags, vflags, psize);
+
+ hpte_v = hpte_encode_v(va, psize) | vflags | HPTE_V_VALID;
+ hpte_r = hpte_encode_r(pa, psize) | rflags;
+
+ if (!(vflags & HPTE_V_BOLTED))
+ DBG_LOW(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
+
+ /*
+ * Debug check: BUG if a valid entry matching hpte_v is read back.
+ * NOTE(review): every iteration reads index hpte_group, never
+ * hpte_group + i, so only one slot is actually examined, and the return
+ * value of plpar_pte_read() is ignored.  Presumably all 8 slots of the
+ * group were meant; confirm before changing, since HPTE_V_COMPARE is
+ * applied here without a separate HPTE_V_SECONDARY check.
+ */
+#if 1
+ {
+ int i;
+ for (i=0;i<8;i++) {
+ unsigned long w0, w1;
+ plpar_pte_read(0, hpte_group, &w0, &w1);
+ BUG_ON (HPTE_V_COMPARE(hpte_v, w0)
+ && (w0 & HPTE_V_VALID));
+ }
+ }
+#endif
+
+ /* Now fill in the actual HPTE */
+ /* Set CEC cookie to 0 */
+ /* Zero page = 0 */
+ /* I-cache Invalidate = 0 */
+ /* I-cache synchronize = 0 */
+ /* Exact = 0 */
+ flags = 0;
+
+ /* Make pHyp happy */
+ if (rflags & (_PAGE_GUARDED|_PAGE_NO_CACHE))
+ hpte_r &= ~_PAGE_COHERENT;
+
+ lpar_rc = plpar_hcall(H_ENTER, flags, hpte_group, hpte_v,
+ hpte_r, &slot, &dummy0, &dummy1);
+ if (unlikely(lpar_rc == H_PTEG_Full)) {
+ if (!(vflags & HPTE_V_BOLTED))
+ DBG_LOW(" full\n");
+ return -1;
+ }
+
+ /*
+ * Since we try and ioremap PHBs we don't own, the pte insert
+ * will fail. However we must catch the failure in hash_page
+ * or we will loop forever, so return -2 in this case.
+ */
+ if (unlikely(lpar_rc != H_Success)) {
+ if (!(vflags & HPTE_V_BOLTED))
+ DBG_LOW(" lpar err %d\n", lpar_rc);
+ return -2;
+ }
+ if (!(vflags & HPTE_V_BOLTED))
+ DBG_LOW(" -> slot: %d\n", slot & 7);
+
+ /* Because of iSeries, we have to pass down the secondary
+ * bucket bit here as well
+ */
+ return (slot & 7) | (!!(vflags & HPTE_V_SECONDARY) << 3);
+}
+
+static DEFINE_SPINLOCK(pSeries_lpar_tlbie_lock);
+
+/*
+ * Evict one entry from group @hpte_group.
+ *
+ * Starts at a slot chosen from the low timebase bits and tries each of the
+ * HPTES_PER_GROUP slots in turn, wrapping within the group.
+ *
+ * Returns the number of slots tried before one was removed, or -1 when no
+ * slot could be removed.  Any hypervisor answer other than H_Success or
+ * H_Not_Found is a BUG.
+ */
+static long pSeries_lpar_hpte_remove(unsigned long hpte_group)
+{
+	unsigned long unused_pteh, unused_ptel;
+	unsigned long victim;
+	unsigned long status;
+	int attempt;
+
+	/* pick a random slot to start at */
+	victim = mftb() & 0x7;
+
+	for (attempt = 0; attempt < HPTES_PER_GROUP; attempt++) {
+		/* don't remove a bolted entry */
+		status = plpar_pte_remove(H_ANDCOND, hpte_group + victim,
+					  (0x1UL << 4),
+					  &unused_pteh, &unused_ptel);
+		if (status == H_Success)
+			return attempt;
+		BUG_ON(status != H_Not_Found);
+
+		victim = (victim + 1) & 0x7;
+	}
+
+	return -1;
+}
+
+/*
+ * Remove every entry of the hash page table, one plpar_pte_remove() call
+ * per entry.  The table is 2^ppc64_pft_size bytes with 16 bytes per entry.
+ * Results of the individual removals are ignored.
+ */
+static void pSeries_lpar_hptab_clear(void)
+{
+	unsigned long size_bytes = 1UL << ppc64_pft_size;
+	unsigned long hpte_count = size_bytes >> 4;
+	unsigned long dummy1, dummy2;
+	unsigned long i;	/* same width as hpte_count: an int would
+				 * overflow once the table holds more than
+				 * INT_MAX entries */
+
+	/* TODO: Use bulk call */
+	for (i = 0; i < hpte_count; i++)
+		plpar_pte_remove(0, i, 0, &dummy1, &dummy2);
+}
+
+/*
+ * Change the protection bits of the entry in @slot, provided its AVPN
+ * still matches @va.
+ *
+ * NOTE: for updatepp ops we are fortunate that the linux "newpp" bits and
+ * the low 3 bits of flags happen to line up. So no transform is needed.
+ * We can probably optimize here and assume the high bits of newpp are
+ * already zero. For now I am paranoid.
+ *
+ * Returns 0 on success and -1 when the hypervisor reports H_Not_Found;
+ * any other failure is a BUG.
+ */
+static long pSeries_lpar_hpte_updatepp(unsigned long slot,
+				       unsigned long newpp,
+				       unsigned long va,
+				       int psize, int local)
+{
+	unsigned long flags = (newpp & 7) | H_AVPN;
+	unsigned long avpn = hpte_encode_v(va, psize) & HPTE_V_AVPN;
+	unsigned long status;
+
+	DBG_LOW(" update: avpnv=%016lx, hash=%016lx, f=%x, psize: %d ... ",
+		avpn, slot, flags, psize);
+
+	status = plpar_pte_protect(flags, slot, avpn);
+	if (status == H_Not_Found) {
+		DBG_LOW("not found !\n");
+		return -1;
+	}
+
+	DBG_LOW("ok\n");
+
+	BUG_ON(status != H_Success);
+
+	return 0;
+}
+
+/*
+ * Read the entry at @slot and return its first doubleword.  A failed read
+ * is a BUG.
+ */
+static unsigned long pSeries_lpar_hpte_getword0(unsigned long slot)
+{
+	unsigned long first_dword, second_dword;
+	unsigned long status;
+
+	/*
+	 * flags == 0:
+	 *   Read 1 pte at a time
+	 *   Do not need RPN to logical page translation
+	 *   No cross CEC PFT access
+	 */
+	status = plpar_pte_read(0, slot, &first_dword, &second_dword);
+
+	BUG_ON(status != H_Success);
+
+	return first_dword;
+}
+
+/*
+ * Locate the valid hash table entry for @va.
+ *
+ * The group selected by the hash is searched first (j == 0), then the
+ * group selected by the complemented hash (j == 1).  An entry matches when
+ * HPTE_V_COMPARE agrees with the wanted first doubleword, it is valid, and
+ * its HPTE_V_SECONDARY bit equals j.
+ *
+ * Returns the slot index for a match found in the first pass, the negated
+ * slot index for a match found in the second pass, or -1 when nothing
+ * matches.
+ *
+ * NOTE(review): with this encoding a second-pass match at slot 1 is
+ * indistinguishable from "not found", and one at slot 0 from a first-pass
+ * match; confirm whether callers can ever hit those cases.
+ */
+static long pSeries_lpar_hpte_find(unsigned long va, int psize)
+{
+ unsigned long hash;
+ unsigned long i, j;
+ long slot;
+ unsigned long want_v, hpte_v;
+
+ hash = hpt_hash(va, mmu_psize_defs[psize].shift);
+ want_v = hpte_encode_v(va, psize);
+
+ for (j = 0; j < 2; j++) {
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ for (i = 0; i < HPTES_PER_GROUP; i++) {
+ hpte_v = pSeries_lpar_hpte_getword0(slot);
+
+ if (HPTE_V_COMPARE(hpte_v, want_v)
+ && (hpte_v & HPTE_V_VALID)
+ && (!!(hpte_v & HPTE_V_SECONDARY) == j)) {
+ /* HPTE matches */
+ if (j)
+ slot = -slot;
+ return slot;
+ }
+ ++slot;
+ }
+ /* second pass searches the group of the complemented hash */
+ hash = ~hash;
+ }
+
+ return -1;
+}
+
+/*
+ * Change the protection bits of the entry mapping kernel address @ea.
+ *
+ * The entry is looked up with pSeries_lpar_hpte_find(); not finding it, or
+ * a failing protect call, is a BUG.
+ *
+ * NOTE(review): pSeries_lpar_hpte_find() returns a negated slot for
+ * matches in its second pass, and that value is passed on unchanged here;
+ * presumably the entries handled here are only ever found in the first
+ * pass - confirm.
+ */
+static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
+					     unsigned long ea,
+					     int psize)
+{
+	unsigned long vsid = get_kernel_vsid(ea);
+	unsigned long va = (vsid << 28) | (ea & 0x0fffffff);
+	unsigned long slot = pSeries_lpar_hpte_find(va, psize);
+	unsigned long status;
+
+	BUG_ON(slot == -1);
+
+	/* Only the low three bits of newpp are used as flags */
+	status = plpar_pte_protect(newpp & 7, slot, 0);
+
+	BUG_ON(status != H_Success);
+}
+
+/*
+ * Remove the entry in @slot provided its AVPN still matches @va.  An entry
+ * that is no longer there (H_Not_Found) is silently accepted; any other
+ * failure is a BUG.
+ */
+static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long va,
+					 int psize, int local)
+{
+	unsigned long unused_pteh, unused_ptel;
+	unsigned long avpn;
+	unsigned long status;
+
+	DBG_LOW(" inval : slot=%lx, va=%016lx, psize: %d, local: %d",
+		slot, va, psize, local);
+
+	avpn = hpte_encode_v(va, psize) & HPTE_V_AVPN;
+	status = plpar_pte_remove(H_AVPN, slot, avpn,
+				  &unused_pteh, &unused_ptel);
+
+	BUG_ON(status != H_Not_Found && status != H_Success);
+}
+
+/*
+ * Flush the first @number entries of this CPU's ppc64_tlb_batch.
+ *
+ * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
+ * lock.  The lock is skipped when the CPU has CPU_FTR_LOCKLESS_TLBIE.
+ */
+void pSeries_lpar_flush_hash_range(unsigned long number, int local)
+{
+	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+	const int need_lock = !cpu_has_feature(CPU_FTR_LOCKLESS_TLBIE);
+	unsigned long irqflags = 0;
+	int idx = 0;
+
+	if (need_lock)
+		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, irqflags);
+
+	while (idx < number) {
+		flush_hash_page(batch->vaddr[idx], batch->pte[idx],
+				batch->psize, local);
+		idx++;
+	}
+
+	if (need_lock)
+		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, irqflags);
+}
+
+/*
+ * Install the LPAR implementations of the hash table operations in ppc_md,
+ * then call htab_finish_init().
+ */
+void hpte_init_lpar(void)
+{
+	/* entry creation / eviction */
+	ppc_md.hpte_insert = pSeries_lpar_hpte_insert;
+	ppc_md.hpte_remove = pSeries_lpar_hpte_remove;
+
+	/* protection updates */
+	ppc_md.hpte_updatepp = pSeries_lpar_hpte_updatepp;
+	ppc_md.hpte_updateboltedpp = pSeries_lpar_hpte_updateboltedpp;
+
+	/* invalidation */
+	ppc_md.hpte_invalidate = pSeries_lpar_hpte_invalidate;
+	ppc_md.flush_hash_range = pSeries_lpar_flush_hash_range;
+	ppc_md.hpte_clear_all = pSeries_lpar_hptab_clear;
+
+	htab_finish_init();
+}
diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
new file mode 100644
index 000000000000..18abfb1f4e24
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -0,0 +1,148 @@
+/*
+ * c 2001 PPC 64 Team, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * /dev/nvram driver for PPC64
+ *
+ * This perhaps should live in drivers/char
+ */
+
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <asm/uaccess.h>
+#include <asm/nvram.h>
+#include <asm/rtas.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+
+static unsigned int nvram_size;
+static int nvram_fetch, nvram_store;
+static char nvram_buf[NVRW_CNT]; /* assume this is in the first 4GB */
+static DEFINE_SPINLOCK(nvram_lock);
+
+
+/*
+ * Read up to @count bytes of NVRAM starting at *@index into @buf.
+ *
+ * Data is fetched through the "nvram-fetch" RTAS call in pieces of at most
+ * NVRW_CNT bytes via the nvram_buf bounce buffer, under nvram_lock.  The
+ * request is clipped to the end of NVRAM.
+ *
+ * Returns the number of bytes read (0 when *@index is at or past the end),
+ * -ENODEV when NVRAM or the RTAS service is unavailable, or -EIO when a
+ * fetch fails or comes back short.  *@index is advanced only on success.
+ */
+static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index)
+{
+	unsigned long irqflags;
+	unsigned long chunk;
+	unsigned int pos;
+	char *dst = buf;
+	int fetched;
+	int failed = 0;
+
+	if (nvram_size == 0 || nvram_fetch == RTAS_UNKNOWN_SERVICE)
+		return -ENODEV;
+
+	if (*index >= nvram_size)
+		return 0;
+
+	pos = *index;
+	if (pos + count > nvram_size)
+		count = nvram_size - pos;
+
+	spin_lock_irqsave(&nvram_lock, irqflags);
+
+	while (count != 0) {
+		chunk = (count > NVRW_CNT) ? NVRW_CNT : count;
+
+		if (rtas_call(nvram_fetch, 3, 2, &fetched, pos,
+			      __pa(nvram_buf), chunk) != 0 ||
+		    chunk != fetched) {
+			failed = 1;
+			break;
+		}
+
+		memcpy(dst, nvram_buf, chunk);
+
+		dst += chunk;
+		pos += chunk;
+		count -= chunk;
+	}
+
+	spin_unlock_irqrestore(&nvram_lock, irqflags);
+
+	if (failed)
+		return -EIO;
+
+	*index = pos;
+	return dst - buf;
+}
+
+/*
+ * Write up to @count bytes from @buf to NVRAM starting at *@index.
+ *
+ * Data is stored through the "nvram-store" RTAS call in pieces of at most
+ * NVRW_CNT bytes via the nvram_buf bounce buffer, under nvram_lock.  The
+ * request is clipped to the end of NVRAM.
+ *
+ * Returns the number of bytes written (0 when *@index is at or past the
+ * end), -ENODEV when NVRAM or the RTAS service is unavailable, or -EIO
+ * when a store fails or comes back short.  *@index is advanced only on
+ * success.
+ */
+static ssize_t pSeries_nvram_write(char *buf, size_t count, loff_t *index)
+{
+	unsigned long irqflags;
+	unsigned long chunk;
+	unsigned int pos;
+	const char *src = buf;
+	int stored;
+	int failed = 0;
+
+	if (nvram_size == 0 || nvram_store == RTAS_UNKNOWN_SERVICE)
+		return -ENODEV;
+
+	if (*index >= nvram_size)
+		return 0;
+
+	pos = *index;
+	if (pos + count > nvram_size)
+		count = nvram_size - pos;
+
+	spin_lock_irqsave(&nvram_lock, irqflags);
+
+	while (count != 0) {
+		chunk = (count > NVRW_CNT) ? NVRW_CNT : count;
+
+		memcpy(nvram_buf, src, chunk);
+
+		if (rtas_call(nvram_store, 3, 2, &stored, pos,
+			      __pa(nvram_buf), chunk) != 0 ||
+		    chunk != stored) {
+			failed = 1;
+			break;
+		}
+
+		src += chunk;
+		pos += chunk;
+		count -= chunk;
+	}
+
+	spin_unlock_irqrestore(&nvram_lock, irqflags);
+
+	if (failed)
+		return -EIO;
+
+	*index = pos;
+	return src - buf;
+}
+
+/* Return the NVRAM size in bytes, or -ENODEV while it is still zero. */
+static ssize_t pSeries_nvram_get_size(void)
+{
+	if (nvram_size)
+		return nvram_size;
+
+	return -ENODEV;
+}
+
+/*
+ * Discover the NVRAM device and install the pSeries NVRAM accessors.
+ *
+ * The size is taken from the "#bytes" property of the first node of type
+ * "nvram"; the RTAS tokens for "nvram-fetch" and "nvram-store" are looked
+ * up and the ppc_md nvram hooks are set.
+ *
+ * Returns 0 on success, -ENODEV when there is no nvram node, or -EIO when
+ * "#bytes" is missing or is not a single unsigned int.
+ */
+int __init pSeries_nvram_init(void)
+{
+	struct device_node *nvram;
+	unsigned int *nbytes_p, proplen;
+
+	nvram = of_find_node_by_type(NULL, "nvram");
+	if (nvram == NULL)
+		return -ENODEV;
+
+	nbytes_p = (unsigned int *)get_property(nvram, "#bytes", &proplen);
+	if (nbytes_p == NULL || proplen != sizeof(unsigned int)) {
+		/* Drop the reference on this path too, as the success
+		 * path below does.
+		 */
+		of_node_put(nvram);
+		return -EIO;
+	}
+
+	nvram_size = *nbytes_p;
+
+	nvram_fetch = rtas_token("nvram-fetch");
+	nvram_store = rtas_token("nvram-store");
+	printk(KERN_INFO "PPC64 nvram contains %d bytes\n", nvram_size);
+	of_node_put(nvram);
+
+	ppc_md.nvram_read = pSeries_nvram_read;
+	ppc_md.nvram_write = pSeries_nvram_write;
+	ppc_md.nvram_size = pSeries_nvram_get_size;
+
+	return 0;
+}
diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c
new file mode 100644
index 000000000000..999a9620b5ce
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/pci.c
@@ -0,0 +1,141 @@
+/*
+ * arch/powerpc/platforms/pseries/pci.c
+ *
+ * Copyright (C) 2001 Dave Engebretsen, IBM Corporation
+ * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * pSeries specific routines for PCI.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include <asm/pci-bridge.h>
+#include <asm/prom.h>
+#include <asm/ppc-pci.h>
+
+static int __devinitdata s7a_workaround = -1;
+
+/*
+ * Compiled out (#if 0).  When enabled, this fixup would prepend the node's
+ * "ibm,loc-code" string and a space to dev->dev.name, provided the
+ * location code is shorter than the name buffer.
+ */
+#if 0
+void pcibios_name_device(struct pci_dev *dev)
+{
+ struct device_node *dn;
+
+ /*
+ * Add IBM loc code (slot) as a prefix to the device names for service
+ */
+ dn = pci_device_to_OF_node(dev);
+ if (dn) {
+ char *loc_code = get_property(dn, "ibm,loc-code", 0);
+ if (loc_code) {
+ int loc_len = strlen(loc_code);
+ if (loc_len < sizeof(dev->dev.name)) {
+ /* shift the old name right to make room for "<loc> " */
+ memmove(dev->dev.name+loc_len+1, dev->dev.name,
+ sizeof(dev->dev.name)-loc_len-1);
+ memcpy(dev->dev.name, loc_code, loc_len);
+ dev->dev.name[loc_len] = ' ';
+ dev->dev.name[sizeof(dev->dev.name)-1] = '\0';
+ }
+ }
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pcibios_name_device);
+#endif
+
+/*
+ * Decide whether the S7A interrupt workaround applies: s7a_workaround
+ * becomes 1 when the root node's "model" property is exactly
+ * "IBM,7013-S7A", and 0 otherwise.
+ */
+static void __devinit check_s7a(void)
+{
+	struct device_node *root;
+	char *model;
+
+	s7a_workaround = 0;
+
+	root = of_find_node_by_path("/");
+	if (!root)
+		return;
+
+	model = get_property(root, "model", NULL);
+	if (model && strcmp(model, "IBM,7013-S7A") == 0)
+		s7a_workaround = 1;
+
+	of_node_put(root);
+}
+
+/*
+ * Read the interrupt line of every device on @bus.  With the S7A
+ * workaround active, interrupt numbers above 16 are lowered by 3 and the
+ * new value is written back to the device's PCI_INTERRUPT_LINE register.
+ */
+void __devinit pSeries_irq_bus_setup(struct pci_bus *bus)
+{
+	struct pci_dev *dev;
+
+	/* Negative means "not probed yet" */
+	if (s7a_workaround < 0)
+		check_s7a();
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		pci_read_irq_line(dev);
+
+		if (!s7a_workaround || dev->irq <= 16)
+			continue;
+
+		dev->irq -= 3;
+		pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq);
+	}
+}
+
+/*
+ * Reserve a fixed set of legacy I/O port ranges, but only when
+ * isa_io_base is set.  Failures to reserve are ignored.
+ */
+static void __init pSeries_request_regions(void)
+{
+	static const struct {
+		unsigned long start;
+		unsigned long len;
+		const char *name;
+	} legacy_ports[] = {
+		{ 0x20, 0x20, "pic1" },
+		{ 0xa0, 0x20, "pic2" },
+		{ 0x00, 0x20, "dma1" },
+		{ 0x40, 0x20, "timer" },
+		{ 0x80, 0x10, "dma page reg" },
+		{ 0xc0, 0x20, "dma2" },
+	};
+	unsigned int idx;
+
+	if (!isa_io_base)
+		return;
+
+	for (idx = 0; idx < ARRAY_SIZE(legacy_ports); idx++)
+		request_region(legacy_ports[idx].start,
+			       legacy_ports[idx].len,
+			       legacy_ports[idx].name);
+}
+
+/*
+ * Final PCI fixup for pSeries: reserve the legacy I/O port ranges, then
+ * call pci_addr_cache_build().
+ */
+void __init pSeries_final_fixup(void)
+{
+ pSeries_request_regions();
+
+ pci_addr_cache_build();
+}
+
+/*
+ * Header fixup for the Winbond 82c105 (pSeries only): set bit 11 of config
+ * dword 0x40, then, for the device at bus 0 / devfn 0x81, clear
+ * IORESOURCE_IO on every resource.
+ *
+ * Assume the winbond 82c105 is the IDE controller on a
+ * p610. We should probably be more careful in case
+ * someone tries to plug in a similar adapter.
+ */
+static void fixup_winbond_82c105(struct pci_dev* dev)
+{
+	unsigned int cfg;
+	int res;
+
+	if (!platform_is_pseries())
+		return;
+
+	printk("Using INTC for W82c105 IDE controller.\n");
+	pci_read_config_dword(dev, 0x40, &cfg);
+	/* Enable LEGIRQ to use INTC instead of ISA interrupts */
+	pci_write_config_dword(dev, 0x40, cfg | (1<<11));
+
+	/* Only the 2nd function of the winbond chip gets zapped */
+	if (dev->bus->number != 0 || dev->devfn != 0x81)
+		return;
+
+	for (res = 0; res < DEVICE_COUNT_RESOURCE; ++res) {
+		if (dev->resource[res].flags & IORESOURCE_IO)
+			dev->resource[res].flags &= ~IORESOURCE_IO;
+	}
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_WINBOND, PCI_DEVICE_ID_WINBOND_82C105,
+			 fixup_winbond_82c105);
diff --git a/arch/powerpc/platforms/pseries/plpar_wrappers.h b/arch/powerpc/platforms/pseries/plpar_wrappers.h
new file mode 100644
index 000000000000..3bd1b3e06003
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/plpar_wrappers.h
@@ -0,0 +1,110 @@
+#ifndef _PSERIES_PLPAR_WRAPPERS_H
+#define _PSERIES_PLPAR_WRAPPERS_H
+
+#include <asm/hvcall.h>
+
+/*
+ * Issue H_POLL_PENDING with all-zero arguments and return the value of
+ * plpar_hcall().  The three output words all land in one throw-away local.
+ */
+static inline long poll_pending(void)
+{
+ unsigned long dummy;
+ return plpar_hcall(H_POLL_PENDING, 0, 0, 0, 0, &dummy, &dummy, &dummy);
+}
+
+/* Issue H_PROD with no arguments; the hcall result is dropped and 0 is always returned. */
+static inline long prod_processor(void)
+{
+ plpar_hcall_norets(H_PROD);
+ return 0;
+}
+
+/* Issue H_CEDE with no arguments; the hcall result is dropped and 0 is always returned. */
+static inline long cede_processor(void)
+{
+ plpar_hcall_norets(H_CEDE);
+ return 0;
+}
+
+/*
+ * Common helper for the H_REGISTER_VPA hcall.
+ * @flags: small function code; shifted into position before the call
+ * @cpu, @vpa: passed through to the hypervisor unchanged
+ * Returns the value of plpar_hcall_norets().
+ */
+static inline long vpa_call(unsigned long flags, unsigned long cpu,
+ unsigned long vpa)
+{
+ /* flags are in bits 16-18 (counting from most significant bit) */
+ flags = flags << (63 - 18);
+
+ return plpar_hcall_norets(H_REGISTER_VPA, flags, cpu, vpa);
+}
+
+/* H_REGISTER_VPA with function code 0x5 for @cpu / @vpa; returns the hcall result. */
+static inline long unregister_vpa(unsigned long cpu, unsigned long vpa)
+{
+ return vpa_call(0x5, cpu, vpa);
+}
+
+/* H_REGISTER_VPA with function code 0x1 for @cpu / @vpa; returns the hcall result. */
+static inline long register_vpa(unsigned long cpu, unsigned long vpa)
+{
+ return vpa_call(0x1, cpu, vpa);
+}
+
+extern void vpa_init(int cpu);
+
+/*
+ * Issue H_REMOVE for (@flags, @ptex, @avpn).  The first two output words
+ * are stored through @old_pteh_ret and @old_ptel_ret; the third is
+ * discarded.  Returns the value of plpar_hcall().
+ */
+static inline long plpar_pte_remove(unsigned long flags, unsigned long ptex,
+ unsigned long avpn, unsigned long *old_pteh_ret,
+ unsigned long *old_ptel_ret)
+{
+ unsigned long dummy;
+ return plpar_hcall(H_REMOVE, flags, ptex, avpn, 0, old_pteh_ret,
+ old_ptel_ret, &dummy);
+}
+
+/*
+ * Issue H_READ for (@flags, @ptex).  The first two output words are
+ * stored through @old_pteh_ret and @old_ptel_ret; the third is discarded.
+ * Returns the value of plpar_hcall().
+ */
+static inline long plpar_pte_read(unsigned long flags, unsigned long ptex,
+ unsigned long *old_pteh_ret, unsigned long *old_ptel_ret)
+{
+ unsigned long dummy;
+ return plpar_hcall(H_READ, flags, ptex, 0, 0, old_pteh_ret,
+ old_ptel_ret, &dummy);
+}
+
+/* Issue H_PROTECT for (@flags, @ptex, @avpn); returns the value of plpar_hcall_norets(). */
+static inline long plpar_pte_protect(unsigned long flags, unsigned long ptex,
+ unsigned long avpn)
+{
+ return plpar_hcall_norets(H_PROTECT, flags, ptex, avpn);
+}
+
+/*
+ * Issue H_GET_TCE for (@liobn, @ioba).  The first output word is stored
+ * through @tce_ret; the other two are discarded.  Returns the value of
+ * plpar_hcall().
+ */
+static inline long plpar_tce_get(unsigned long liobn, unsigned long ioba,
+ unsigned long *tce_ret)
+{
+ unsigned long dummy;
+ return plpar_hcall(H_GET_TCE, liobn, ioba, 0, 0, tce_ret, &dummy,
+ &dummy);
+}
+
+/* Issue H_PUT_TCE for (@liobn, @ioba, @tceval); returns the value of plpar_hcall_norets(). */
+static inline long plpar_tce_put(unsigned long liobn, unsigned long ioba,
+ unsigned long tceval)
+{
+ return plpar_hcall_norets(H_PUT_TCE, liobn, ioba, tceval);
+}
+
+/* Issue H_PUT_TCE_INDIRECT for (@liobn, @ioba, @page, @count); returns the value of plpar_hcall_norets(). */
+static inline long plpar_tce_put_indirect(unsigned long liobn,
+ unsigned long ioba, unsigned long page, unsigned long count)
+{
+ return plpar_hcall_norets(H_PUT_TCE_INDIRECT, liobn, ioba, page, count);
+}
+
+/* Issue H_STUFF_TCE for (@liobn, @ioba, @tceval, @count); returns the value of plpar_hcall_norets(). */
+static inline long plpar_tce_stuff(unsigned long liobn, unsigned long ioba,
+ unsigned long tceval, unsigned long count)
+{
+ return plpar_hcall_norets(H_STUFF_TCE, liobn, ioba, tceval, count);
+}
+
+/*
+ * Issue H_GET_TERM_CHAR for terminal @termno.  The first output word is
+ * stored through @len_ret; the other two are written into @buf_ret as two
+ * consecutive unsigned longs.  @buf_ret must therefore provide at least
+ * 2 * sizeof(unsigned long) bytes, and it is accessed as unsigned long
+ * without any alignment check.  Returns the value of plpar_hcall().
+ */
+static inline long plpar_get_term_char(unsigned long termno,
+ unsigned long *len_ret, char *buf_ret)
+{
+ unsigned long *lbuf = (unsigned long *)buf_ret; /* TODO: alignment? */
+ return plpar_hcall(H_GET_TERM_CHAR, termno, 0, 0, 0, len_ret,
+ lbuf + 0, lbuf + 1);
+}
+
+/*
+ * Issue H_PUT_TERM_CHAR for terminal @termno with length @len.  The data
+ * is passed as the first two unsigned longs read from @buffer, so
+ * 2 * sizeof(unsigned long) bytes are always read regardless of @len, and
+ * the access is made without any alignment check.
+ * Returns the value of plpar_hcall_norets().
+ */
+static inline long plpar_put_term_char(unsigned long termno, unsigned long len,
+ const char *buffer)
+{
+ unsigned long *lbuf = (unsigned long *)buffer; /* TODO: alignment? */
+ return plpar_hcall_norets(H_PUT_TERM_CHAR, termno, len, lbuf[0],
+ lbuf[1]);
+}
+
+#endif /* _PSERIES_PLPAR_WRAPPERS_H */
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
new file mode 100644
index 000000000000..fbd214d68b07
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -0,0 +1,352 @@
+/*
+ * Copyright (C) 2001 Dave Engebretsen IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/* Change Activity:
+ * 2001/09/21 : engebret : Created with minimal EPOW and HW exception support.
+ * End Change Activity
+ */
+
+#include <linux/errno.h>
+#include <linux/threads.h>
+#include <linux/kernel_stat.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/timex.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/random.h>
+#include <linux/sysrq.h>
+#include <linux/bitops.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/irq.h>
+#include <asm/cache.h>
+#include <asm/prom.h>
+#include <asm/ptrace.h>
+#include <asm/machdep.h>
+#include <asm/rtas.h>
+#include <asm/udbg.h>
+
+static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];
+static DEFINE_SPINLOCK(ras_log_buf_lock);
+
+char mce_data_buf[RTAS_ERROR_LOG_MAX]
+;
+/* This is true if we are using the firmware NMI handler (typically LPAR) */
+extern int fwnmi_active;
+
+static int ras_get_sensor_state_token;
+static int ras_check_exception_token;
+
+#define EPOW_SENSOR_TOKEN 9
+#define EPOW_SENSOR_INDEX 0
+#define RAS_VECTOR_OFFSET 0x500
+
+static irqreturn_t ras_epow_interrupt(int irq, void *dev_id,
+ struct pt_regs * regs);
+static irqreturn_t ras_error_interrupt(int irq, void *dev_id,
+ struct pt_regs * regs);
+
+/* #define DEBUG */
+
+/*
+ * Request every interrupt listed in property @propname of node @np.
+ *
+ * @np:       device node holding the interrupt property
+ * @propname: name of the property to read
+ * @handler:  handler installed for each interrupt
+ * @name:     name passed to request_irq()
+ *
+ * Returns silently if the property is absent.  Processing stops at the
+ * first mapping or request failure; interrupts requested before the
+ * failure are left in place.
+ */
+static void request_ras_irqs(struct device_node *np, char *propname,
+ irqreturn_t (*handler)(int, void *, struct pt_regs *),
+ const char *name)
+{
+ unsigned int *ireg, len, i;
+ int virq, n_intr;
+
+ ireg = (unsigned int *)get_property(np, propname, &len);
+ if (ireg == NULL)
+ return;
+ n_intr = prom_n_intr_cells(np);
+ /*
+ * Turn the byte length into a count of interrupt specifiers of
+ * n_intr cells each.
+ * NOTE(review): assumes prom_n_intr_cells() never returns 0 - confirm.
+ */
+ len /= n_intr * sizeof(*ireg);
+
+ for (i = 0; i < len; i++) {
+ /* only the first cell of each specifier is used for the mapping */
+ virq = virt_irq_create_mapping(*ireg);
+ if (virq == NO_IRQ) {
+ printk(KERN_ERR "Unable to allocate interrupt "
+ "number for %s\n", np->full_name);
+ return;
+ }
+ if (request_irq(irq_offset_up(virq), handler, 0, name, NULL)) {
+ printk(KERN_ERR "Unable to request interrupt %d for "
+ "%s\n", irq_offset_up(virq), np->full_name);
+ return;
+ }
+ /* step to the next specifier */
+ ireg += n_intr;
+ }
+}
+
+/*
+ * Initialize handlers for the set of interrupts caused by hardware errors
+ * and power system events.
+ *
+ * The RTAS tokens used by the handlers are looked up first.  Then the
+ * interrupts listed under /event-sources/internal-errors and
+ * /event-sources/epow-events (in both the "open-pic-interrupt" and the
+ * "interrupts" property) are requested.  A missing node is skipped.
+ *
+ * Returns 0.  Initcalls report success as 0 and failure as a negative
+ * errno; nothing in here is treated as a failure.
+ */
+static int __init init_ras_IRQ(void)
+{
+	struct device_node *np;
+
+	ras_get_sensor_state_token = rtas_token("get-sensor-state");
+	ras_check_exception_token = rtas_token("check-exception");
+
+	/* Internal Errors */
+	np = of_find_node_by_path("/event-sources/internal-errors");
+	if (np != NULL) {
+		request_ras_irqs(np, "open-pic-interrupt", ras_error_interrupt,
+				 "RAS_ERROR");
+		request_ras_irqs(np, "interrupts", ras_error_interrupt,
+				 "RAS_ERROR");
+		of_node_put(np);
+	}
+
+	/* EPOW Events */
+	np = of_find_node_by_path("/event-sources/epow-events");
+	if (np != NULL) {
+		request_ras_irqs(np, "open-pic-interrupt", ras_epow_interrupt,
+				 "RAS_EPOW");
+		request_ras_irqs(np, "interrupts", ras_epow_interrupt,
+				 "RAS_EPOW");
+		of_node_put(np);
+	}
+
+	return 0;
+}
+__initcall(init_ras_IRQ);
+
+/*
+ * Handle power subsystem events (EPOW).
+ *
+ * For now the event is only logged.  Eventually this should look at the
+ * kind of power failure and act on it whenever there is enough time left
+ * to do something useful.
+ */
+static irqreturn_t
+ras_epow_interrupt(int irq, void *dev_id, struct pt_regs * regs)
+{
+	int sensor_state = 0;
+	int rc = 0xdeadbeef;
+	int time_critical;
+
+	rc = rtas_call(ras_get_sensor_state_token, 2, 2, &sensor_state,
+		       EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX);
+
+	/* Sensor states above 3 are reported as time critical */
+	time_critical = (sensor_state > 3) ? 1 : 0;
+
+	spin_lock(&ras_log_buf_lock);
+
+	rc = rtas_call(ras_check_exception_token, 6, 1, NULL,
+		       RAS_VECTOR_OFFSET,
+		       virt_irq_to_real(irq_offset_down(irq)),
+		       RTAS_EPOW_WARNING | RTAS_POWERMGM_EVENTS,
+		       time_critical, __pa(&ras_log_buf),
+		       rtas_get_error_log_max());
+
+	udbg_printf("EPOW <0x%lx 0x%x 0x%x>\n",
+		    *((unsigned long *)&ras_log_buf), rc, sensor_state);
+	printk(KERN_WARNING "EPOW <0x%lx 0x%x 0x%x>\n",
+	       *((unsigned long *)&ras_log_buf), rc, sensor_state);
+
+	/* format and print the extended information */
+	log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0);
+
+	spin_unlock(&ras_log_buf_lock);
+	return IRQ_HANDLED;
+}
+
+/*
+ * Handle hardware error interrupts.
+ *
+ * RTAS check-exception is called to collect data on the exception. If
+ * the error is deemed recoverable, we log a warning and return.
+ * For nonrecoverable errors, an error is logged and we stop all processing
+ * as quickly as possible in order to prevent propagation of the failure.
+ */
+static irqreturn_t
+ras_error_interrupt(int irq, void *dev_id, struct pt_regs * regs)
+{
+ struct rtas_error_log *rtas_elog;
+ int status = 0xdeadbeef;
+ int fatal;
+
+ /* ras_log_buf is shared with the EPOW handler */
+ spin_lock(&ras_log_buf_lock);
+
+ status = rtas_call(ras_check_exception_token, 6, 1, NULL,
+ RAS_VECTOR_OFFSET,
+ virt_irq_to_real(irq_offset_down(irq)),
+ RTAS_INTERNAL_ERROR, 1 /*Time Critical */,
+ __pa(&ras_log_buf),
+ rtas_get_error_log_max());
+
+ rtas_elog = (struct rtas_error_log *)ras_log_buf;
+
+ /*
+ * Fatal only when the RTAS call succeeded and the reported severity
+ * is RTAS_SEVERITY_ERROR_SYNC or higher.
+ */
+ if ((status == 0) && (rtas_elog->severity >= RTAS_SEVERITY_ERROR_SYNC))
+ fatal = 1;
+ else
+ fatal = 0;
+
+ /* format and print the extended information */
+ log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal);
+
+ if (fatal) {
+ udbg_printf("Fatal HW Error <0x%lx 0x%x>\n",
+ *((unsigned long *)&ras_log_buf), status);
+ printk(KERN_EMERG "Error: Fatal hardware error <0x%lx 0x%x>\n",
+ *((unsigned long *)&ras_log_buf), status);
+
+#ifndef DEBUG
+ /* Don't actually power off when debugging so we can test
+ * without actually failing while injecting errors.
+ * Error data will not be logged to syslog.
+ */
+ ppc_md.power_off();
+#endif
+ } else {
+ udbg_printf("Recoverable HW Error <0x%lx 0x%x>\n",
+ *((unsigned long *)&ras_log_buf), status);
+ printk(KERN_WARNING
+ "Warning: Recoverable hardware error <0x%lx 0x%x>\n",
+ *((unsigned long *)&ras_log_buf), status);
+ }
+
+ spin_unlock(&ras_log_buf_lock);
+ return IRQ_HANDLED;
+}
+
+/* Get the error information for errors coming through the
+ * FWNMI vectors. The pt_regs' r3 will be updated to reflect
+ * the actual r3 if possible, and a ptr to the error log entry
+ * will be returned if found.
+ *
+ * The mce_data_buf does not have any locks or protection around it,
+ * if a second machine check comes in, or a system reset is done
+ * before we have logged the error, then we will get corruption in the
+ * error log. This is preferable over holding off on calling
+ * ibm,nmi-interlock which would result in us checkstopping if a
+ * second machine check did come in.
+ */
+static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
+{
+ unsigned long errdata = regs->gpr[3];
+ struct rtas_error_log *errhdr = NULL;
+ unsigned long *savep;
+
+ /*
+ * r3 is only trusted if it lies in [0x7000, 0x7fff0) or inside the
+ * RTAS region, leaving 16 bytes of room before its end.
+ */
+ if ((errdata >= 0x7000 && errdata < 0x7fff0) ||
+ (errdata >= rtas.base && errdata < rtas.base + rtas.size - 16)) {
+ savep = __va(errdata);
+ regs->gpr[3] = savep[0]; /* restore original r3 */
+ /* the error log follows the saved r3 word; copy it to mce_data_buf */
+ memset(mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
+ memcpy(mce_data_buf, (char *)(savep + 1), RTAS_ERROR_LOG_MAX);
+ errhdr = (struct rtas_error_log *)mce_data_buf;
+ } else {
+ printk("FWNMI: corrupt r3\n");
+ }
+ /* NULL when r3 was rejected, otherwise a pointer into mce_data_buf */
+ return errhdr;
+}
+
+/* Call this when done with the data returned by fwnmi_get_errinfo().
+ * It will release the saved data area for other CPUs in the
+ * partition to receive FWNMI errors.
+ *
+ * This makes the RTAS "ibm,nmi-interlock" call; a non-zero result is
+ * only reported with printk.
+ */
+static void fwnmi_release_errinfo(void)
+{
+ int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL);
+ if (ret != 0)
+ printk("FWNMI: nmi-interlock failed: %d\n", ret);
+}
+
+/*
+ * System reset hook.  When the firmware NMI handler is active, fetch the
+ * FWNMI error info (which also restores r3 in @regs) and then release
+ * the firmware save area.  The error info itself is not examined yet.
+ */
+void pSeries_system_reset_exception(struct pt_regs *regs)
+{
+ if (fwnmi_active) {
+ struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs);
+ if (errhdr) {
+ /* XXX Should look at FWNMI information */
+ }
+ fwnmi_release_errinfo();
+ }
+}
+
+/*
+ * See if we can recover from a machine check exception.
+ * This is only called on power4 (or above) and only via
+ * the Firmware Non-Maskable Interrupts (fwnmi) handler
+ * which provides the error analysis for us.
+ *
+ * Return 1 if corrected (or delivered a signal).
+ * Return 0 if there is nothing we can do.
+ */
+static int recover_mce(struct pt_regs *regs, struct rtas_error_log * err)
+{
+ int nonfatal = 0;
+
+ if (err->disposition == RTAS_DISP_FULLY_RECOVERED) {
+ /* Platform corrected itself */
+ nonfatal = 1;
+ } else if ((regs->msr & MSR_RI) &&
+ user_mode(regs) &&
+ err->severity == RTAS_SEVERITY_ERROR_SYNC &&
+ err->disposition == RTAS_DISP_NOT_RECOVERED &&
+ err->target == RTAS_TARGET_MEMORY &&
+ err->type == RTAS_TYPE_ECC_UNCORR &&
+ !(current->pid == 0 || current->pid == 1)) {
+ /*
+ * All of the above must hold: MSR_RI set, taken in user mode,
+ * a synchronous unrecovered uncorrectable memory ECC error,
+ * and the current task is neither pid 0 nor pid 1.
+ */
+ /* Kill off a user process with an ECC error */
+ printk(KERN_ERR "MCE: uncorrectable ecc error for pid %d\n",
+ current->pid);
+ /* XXX something better for ECC error? */
+ _exception(SIGBUS, regs, BUS_ADRERR, regs->nip);
+ nonfatal = 1;
+ }
+
+ /* the log is written in every case; it is flagged fatal unless handled above */
+ log_error((char *)err, ERR_TYPE_RTAS_LOG, !nonfatal);
+
+ return nonfatal;
+}
+
+/*
+ * Handle a machine check.
+ *
+ * Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi)
+ * should be present. If so the handler which called us tells us if the
+ * error was recovered (never true if RI=0).
+ *
+ * On hardware prior to Power 4 these exceptions were asynchronous which
+ * means we can't tell exactly where it occurred and so we can't recover.
+ */
+int pSeries_machine_check_exception(struct pt_regs *regs)
+{
+	struct rtas_error_log *errlog;
+
+	/* Without the firmware NMI handler there is no error analysis to use */
+	if (!fwnmi_active)
+		return 0;
+
+	/* Grab the log, then immediately hand the save area back to firmware */
+	errlog = fwnmi_get_errinfo(regs);
+	fwnmi_release_errinfo();
+
+	if (errlog == NULL)
+		return 0;
+
+	return recover_mce(regs, errlog) ? 1 : 0;
+}
diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c
new file mode 100644
index 000000000000..d8864164dbe8
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/reconfig.c
@@ -0,0 +1,424 @@
+/*
+ * pSeries_reconfig.c - support for dynamic reconfiguration (including PCI
+ * Hotplug and Dynamic Logical Partitioning on RPA platforms).
+ *
+ * Copyright (C) 2005 Nathan Lynch
+ * Copyright (C) 2005 IBM Corporation
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kref.h>
+#include <linux/notifier.h>
+#include <linux/proc_fs.h>
+
+#include <asm/prom.h>
+#include <asm/pSeries_reconfig.h>
+#include <asm/uaccess.h>
+
+
+
+/*
+ * Routines for "runtime" addition and removal of device tree nodes.
+ */
+#ifdef CONFIG_PROC_DEVICETREE
+/*
+ * Add a node to /proc/device-tree.
+ *
+ * A directory named after the last path component of np->full_name is
+ * created under the parent's proc entry; if that fails, nothing else is
+ * added.  np->full_name must contain a '/' and np->parent must be set,
+ * as neither is checked here.
+ */
+static void add_node_proc_entries(struct device_node *np)
+{
+ struct proc_dir_entry *ent;
+
+ ent = proc_mkdir(strrchr(np->full_name, '/') + 1, np->parent->pde);
+ if (ent)
+ proc_device_tree_add_node(np, ent);
+}
+
+/*
+ * Remove a node from /proc/device-tree: first one proc entry per
+ * property of @np, then the node's own directory (only if np->pde is set).
+ */
+static void remove_node_proc_entries(struct device_node *np)
+{
+ struct property *pp = np->properties;
+ struct device_node *parent = np->parent;
+
+ while (pp) {
+ remove_proc_entry(pp->name, np->pde);
+ pp = pp->next;
+ }
+ if (np->pde)
+ remove_proc_entry(np->pde->name, parent->pde);
+}
+#else /* !CONFIG_PROC_DEVICETREE */
+static void add_node_proc_entries(struct device_node *np)
+{
+	/* No /proc/device-tree support configured: nothing to add */
+}
+
+static void remove_node_proc_entries(struct device_node *np)
+{
+	/* No /proc/device-tree support configured: nothing to remove */
+}
+#endif /* CONFIG_PROC_DEVICETREE */
+
+/**
+ * derive_parent - basically like dirname(1)
+ * @path: the full_name of a node to be added to the tree
+ *
+ * Returns the node which should be the parent of the node
+ * described by path.  E.g., for path = "/foo/bar", returns
+ * the node with full_name = "/foo".
+ *
+ * The returned node carries the reference taken by
+ * of_find_node_by_path(); the caller must drop it with of_node_put().
+ * On failure an ERR_PTR() is returned: -EINVAL if @path is "/", has no
+ * '/' at all, or the parent node does not exist; -ENOMEM if the
+ * temporary parent path cannot be allocated.
+ */
+static struct device_node *derive_parent(const char *path)
+{
+	struct device_node *parent;
+	const char *last_slash;
+	char *parent_path;
+	size_t parent_path_len;
+
+	/* reject if path is "/" */
+	if (!strcmp(path, "/"))
+		return ERR_PTR(-EINVAL);
+
+	/* a name without any '/' has no parent we could derive */
+	last_slash = strrchr(path, '/');
+	if (!last_slash)
+		return ERR_PTR(-EINVAL);
+
+	/* direct child of the root: no temporary copy needed */
+	if (last_slash == path) {
+		parent = of_find_node_by_path("/");
+		return parent ? parent : ERR_PTR(-EINVAL);
+	}
+
+	/* copy everything in front of the last '/', plus the terminator */
+	parent_path_len = last_slash - path + 1;
+	parent_path = kmalloc(parent_path_len, GFP_KERNEL);
+	if (!parent_path)
+		return ERR_PTR(-ENOMEM);
+	strlcpy(parent_path, path, parent_path_len);
+
+	parent = of_find_node_by_path(parent_path);
+
+	/* the copy is ours alone, so free it whether or not the lookup worked */
+	kfree(parent_path);
+
+	return parent ? parent : ERR_PTR(-EINVAL);
+}
+
+static struct notifier_block *pSeries_reconfig_chain;
+
+/* Add @nb to the pSeries reconfig notifier chain; returns the result of notifier_chain_register(). */
+int pSeries_reconfig_notifier_register(struct notifier_block *nb)
+{
+ return notifier_chain_register(&pSeries_reconfig_chain, nb);
+}
+
+/* Remove @nb from the pSeries reconfig notifier chain; the result of the removal is not reported. */
+void pSeries_reconfig_notifier_unregister(struct notifier_block *nb)
+{
+ notifier_chain_unregister(&pSeries_reconfig_chain, nb);
+}
+
+/*
+ * Create a new dynamic device node named @path with properties
+ * @proplist and attach it to the tree.
+ *
+ * The reconfig notifier chain is called with PSERIES_RECONFIG_ADD before
+ * the node is attached; a NOTIFY_BAD answer aborts the add.
+ *
+ * Returns 0 on success or a negative errno.  On failure the node and its
+ * name are freed, but @proplist is left alone and stays owned by the
+ * caller.
+ */
+static int pSeries_reconfig_add_node(const char *path, struct property *proplist)
+{
+	struct device_node *np;
+	int err = -ENOMEM;
+
+	np = kzalloc(sizeof(*np), GFP_KERNEL);
+	if (!np)
+		goto out_err;
+
+	np->full_name = kmalloc(strlen(path) + 1, GFP_KERNEL);
+	if (!np->full_name)
+		goto out_err;
+
+	strcpy(np->full_name, path);
+
+	np->properties = proplist;
+	OF_MARK_DYNAMIC(np);
+	kref_init(&np->kref);
+
+	np->parent = derive_parent(path);
+	if (IS_ERR(np->parent)) {
+		err = PTR_ERR(np->parent);
+		/*
+		 * Never hand an ERR_PTR to of_node_put(): clear it so the
+		 * shared error path below sees "no parent".
+		 */
+		np->parent = NULL;
+		goto out_err;
+	}
+
+	err = notifier_call_chain(&pSeries_reconfig_chain,
+				  PSERIES_RECONFIG_ADD, np);
+	if (err == NOTIFY_BAD) {
+		printk(KERN_ERR "Failed to add device node %s\n", path);
+		err = -ENOMEM; /* For now, safe to assume kmalloc failure */
+		goto out_err;
+	}
+
+	of_attach_node(np);
+
+	add_node_proc_entries(np);
+
+	/* drop the reference taken by derive_parent() */
+	of_node_put(np->parent);
+
+	return 0;
+
+out_err:
+	if (np) {
+		/* np->parent is either a held reference or NULL here */
+		of_node_put(np->parent);
+		kfree(np->full_name);
+		kfree(np);
+	}
+	return err;
+}
+
+/*
+ * Detach @np from the device tree.
+ *
+ * Returns -EINVAL if @np has no parent and -EBUSY if it still has
+ * children; in both cases the tree is left untouched.  On success the
+ * proc entries are removed, the notifier chain is called with
+ * PSERIES_RECONFIG_REMOVE, the node is detached and one reference on
+ * @np is dropped.
+ */
+static int pSeries_reconfig_remove_node(struct device_node *np)
+{
+	struct device_node *parent, *child;
+
+	parent = of_get_parent(np);
+	if (!parent)
+		return -EINVAL;
+
+	if ((child = of_get_next_child(np, NULL))) {
+		of_node_put(child);
+		/* also give back the parent reference taken above */
+		of_node_put(parent);
+		return -EBUSY;
+	}
+
+	remove_node_proc_entries(np);
+
+	notifier_call_chain(&pSeries_reconfig_chain,
+			    PSERIES_RECONFIG_REMOVE, np);
+	of_detach_node(np);
+
+	of_node_put(parent);
+	of_node_put(np); /* Must decrement the refcount */
+	return 0;
+}
+
+/*
+ * /proc/ppc64/ofdt - yucky binary interface for adding and removing
+ * OF device nodes. Should be deprecated as soon as we get an
+ * in-kernel wrapper for the RTAS ibm,configure-connector call.
+ */
+
+/* Free every property on the list starting at @prop: its name, its value and the property itself. */
+static void release_prop_list(const struct property *prop)
+{
+	while (prop) {
+		/* remember the successor before this element goes away */
+		struct property *following = prop->next;
+
+		kfree(prop->name);
+		kfree(prop->value);
+		kfree(prop);
+
+		prop = following;
+	}
+}
+
+/**
+ * parse_next_property - process the next property from raw input buffer
+ * @buf: input buffer, must be nul-terminated
+ * @end: end of the input buffer + 1, for validation
+ * @name: return value; set to property name in buf
+ * @length: return value; set to length of value
+ * @value: return value; set to the property value in buf
+ *
+ * The expected layout is "<name> <decimal length> <length bytes of value>",
+ * optionally followed by a space and the next property.  The space after
+ * the name is overwritten with a nul so that @name is a proper string.
+ *
+ * Note that the caller must make copies of the name and value returned,
+ * this function does no allocation or copying of the data.  Return value
+ * is set to the next name in buf, or NULL on error.
+ */
+static char * parse_next_property(char *buf, char *end, char **name, int *length,
+				  unsigned char **value)
+{
+	char *tmp;
+
+	*name = buf;
+
+	tmp = strchr(buf, ' ');
+	if (!tmp) {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	*tmp = '\0';
+
+	if (++tmp >= end) {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+
+	/* now we're on the length */
+	*length = simple_strtoul(tmp, &tmp, 10);
+	/*
+	 * Anything that does not fit a non-negative int is bogus: a
+	 * negative length would move the value pointer backwards below
+	 * and later be used as an allocation and copy size.
+	 */
+	if (*length < 0) {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	if (*tmp != ' ' || ++tmp >= end) {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+
+	/* now we're on the value */
+	*value = (unsigned char *)tmp;
+	/* check the room left before moving the pointer, not afterwards */
+	if (*length > end - tmp) {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	tmp += *length;
+	if (tmp < end && *tmp != ' ' && *tmp != '\0') {
+		printk(KERN_ERR "property parse failed in %s at line %d\n",
+		       __FUNCTION__, __LINE__);
+		return NULL;
+	}
+	tmp++;
+
+	/* and now we should be on the next name, or the end */
+	return tmp;
+}
+
+/*
+ * Allocate a property holding private copies of @name and of the first
+ * @length bytes of @value (one extra nul byte is appended to the value
+ * copy), and chain it in front of @last.
+ *
+ * Returns the new property, or NULL if any allocation fails.
+ */
+static struct property *new_property(const char *name, const int length,
+				     const unsigned char *value, struct property *last)
+{
+	struct property *prop;
+
+	/* zeroed, so the cleanup path can kfree() unset members safely */
+	prop = kzalloc(sizeof(*prop), GFP_KERNEL);
+	if (!prop)
+		return NULL;
+
+	prop->name = kmalloc(strlen(name) + 1, GFP_KERNEL);
+	if (!prop->name)
+		goto cleanup;
+
+	prop->value = kmalloc(length + 1, GFP_KERNEL);
+	if (!prop->value)
+		goto cleanup;
+
+	strcpy(prop->name, name);
+	memcpy(prop->value, value, length);
+	((char *)prop->value)[length] = 0;
+	prop->length = length;
+	prop->next = last;
+	return prop;
+
+cleanup:
+	kfree(prop->name);
+	kfree(prop->value);
+	kfree(prop);
+	return NULL;
+}
+
+/*
+ * Handle the "add_node" command.
+ * @buf:     "<path> <property> <property> ...", modified in place
+ * @bufsize: number of bytes available at @buf
+ *
+ * Returns 0 on success; -EINVAL if the input is malformed or a node with
+ * that path already exists; -ENOMEM on allocation failure; otherwise the
+ * error from pSeries_reconfig_add_node().  The property list built here
+ * is freed again on every failure.
+ */
+static int do_add_node(char *buf, size_t bufsize)
+{
+ char *path, *end, *name;
+ struct device_node *np;
+ struct property *prop = NULL;
+ unsigned char* value;
+ int length, rv = 0;
+
+ end = buf + bufsize;
+ path = buf;
+ /* split off the path: terminate it and step past the separator */
+ buf = strchr(buf, ' ');
+ if (!buf)
+ return -EINVAL;
+ *buf = '\0';
+ buf++;
+
+ /* refuse to add a node that is already in the tree */
+ if ((np = of_find_node_by_path(path))) {
+ of_node_put(np);
+ return -EINVAL;
+ }
+
+ /* rv = build_prop_list(tmp, bufsize - (tmp - buf), &proplist); */
+ while (buf < end &&
+ (buf = parse_next_property(buf, end, &name, &length, &value))) {
+ struct property *last = prop;
+
+ /* each new property is chained in front of the previous ones */
+ prop = new_property(name, length, value, last);
+ if (!prop) {
+ rv = -ENOMEM;
+ /* keep the list head so the earlier entries get freed */
+ prop = last;
+ goto out;
+ }
+ }
+ /* a NULL buf means parse_next_property() rejected the input */
+ if (!buf) {
+ rv = -EINVAL;
+ goto out;
+ }
+
+ rv = pSeries_reconfig_add_node(path, prop);
+
+out:
+ if (rv)
+ release_prop_list(prop);
+ return rv;
+}
+
+/*
+ * Handle the "remove_node" command: @buf holds the path of the node.
+ * Returns -ENODEV if there is no such node, otherwise the result of
+ * pSeries_reconfig_remove_node().
+ */
+static int do_remove_node(char *buf)
+{
+	struct device_node *target;
+	int rv = -ENODEV;
+
+	target = of_find_node_by_path(buf);
+	if (target)
+		rv = pSeries_reconfig_remove_node(target);
+
+	/* drop the lookup reference (target may be NULL here) */
+	of_node_put(target);
+	return rv;
+}
+
+/**
+ * ofdt_write - perform operations on the Open Firmware device tree
+ *
+ * @file: not used
+ * @buf: command and arguments
+ * @count: size of the command buffer
+ * @off: not used
+ *
+ * Operations supported at this time are addition and removal of
+ * whole nodes along with their properties. Operations on individual
+ * properties are not implemented (yet).
+ */
+static ssize_t ofdt_write(struct file *file, const char __user *buf, size_t count,
+ loff_t *off)
+{
+ int rv = 0;
+ char *kbuf;
+ char *tmp;
+
+ /* one extra byte so the copied command can be nul-terminated */
+ if (!(kbuf = kmalloc(count + 1, GFP_KERNEL))) {
+ rv = -ENOMEM;
+ goto out;
+ }
+ if (copy_from_user(kbuf, buf, count)) {
+ rv = -EFAULT;
+ goto out;
+ }
+
+ kbuf[count] = '\0';
+
+ /* the command word ends at the first space; the rest is its argument */
+ tmp = strchr(kbuf, ' ');
+ if (!tmp) {
+ rv = -EINVAL;
+ goto out;
+ }
+ *tmp = '\0';
+ tmp++;
+
+ if (!strcmp(kbuf, "add_node"))
+ rv = do_add_node(tmp, count - (tmp - kbuf));
+ else if (!strcmp(kbuf, "remove_node"))
+ rv = do_remove_node(tmp);
+ else
+ rv = -EINVAL;
+out:
+ kfree(kbuf);
+ /* on success the whole write is reported as consumed */
+ return rv ? rv : count;
+}
+
+/* File operations for the ofdt proc entry: only write is implemented. */
+static struct file_operations ofdt_fops = {
+ .write = ofdt_write
+};
+
+/*
+ * Create /proc/ppc64/ofdt, writable by root only.
+ * Does nothing on non-pSeries platforms or if the entry cannot be
+ * created; always returns 0.
+ */
+static int proc_ppc64_create_ofdt(void)
+{
+	struct proc_dir_entry *entry;
+
+	if (!platform_is_pseries())
+		return 0;
+
+	entry = create_proc_entry("ppc64/ofdt", S_IWUSR, NULL);
+	if (!entry)
+		return 0;
+
+	entry->nlink = 1;
+	entry->data = NULL;
+	entry->size = 0;
+	entry->proc_fops = &ofdt_fops;
+
+	return 0;
+}
+__initcall(proc_ppc64_create_ofdt);
diff --git a/arch/powerpc/platforms/pseries/rtasd.c b/arch/powerpc/platforms/pseries/rtasd.c
new file mode 100644
index 000000000000..a6f628d4c9dc
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/rtasd.c
@@ -0,0 +1,528 @@
+/*
+ * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Communication to userspace based on kernel/printk.c
+ */
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/poll.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+#include <linux/spinlock.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/rtas.h>
+#include <asm/prom.h>
+#include <asm/nvram.h>
+#include <asm/atomic.h>
+
+#if 0
+#define DEBUG(A...) printk(KERN_ERR A)
+#else
+#define DEBUG(A...)
+#endif
+
+static DEFINE_SPINLOCK(rtasd_log_lock);
+
+DECLARE_WAIT_QUEUE_HEAD(rtas_log_wait);
+
+static char *rtas_log_buf;
+static unsigned long rtas_log_start;
+static unsigned long rtas_log_size;
+
+static int surveillance_timeout = -1;
+static unsigned int rtas_event_scan_rate;
+static unsigned int rtas_error_log_max;
+static unsigned int rtas_error_log_buffer_max;
+
+static int full_rtas_msgs = 0;
+
+extern int no_logging;
+
+volatile int error_log_cnt = 0;
+
+/*
+ * Since we use 32 bit RTAS, the physical address of this must be below
+ * 4G or else bad things happen. Allocate this in the kernel data and
+ * make it big enough.
+ */
+static unsigned char logdata[RTAS_ERROR_LOG_MAX];
+
+static int get_eventscan_parms(void);
+
+static char *rtas_type[] = {
+ "Unknown", "Retry", "TCE Error", "Internal Device Failure",
+ "Timeout", "Data Parity", "Address Parity", "Cache Parity",
+ "Address Invalid", "ECC Uncorrected", "ECC Corrupted",
+};
+
+/*
+ * Map an RTAS event type number to a printable name.
+ * Types 1 to 10 come from the rtas_type[] table, a few further types
+ * have names of their own, and everything else yields rtas_type[0].
+ */
+static char *rtas_event_type(int type)
+{
+	if (type >= 1 && type <= 10)
+		return rtas_type[type];
+
+	switch (type) {
+	case RTAS_TYPE_EPOW:
+		return "EPOW";
+	case RTAS_TYPE_PLATFORM:
+		return "Platform Error";
+	case RTAS_TYPE_IO:
+		return "I/O Event";
+	case RTAS_TYPE_INFO:
+		return "Platform Information Event";
+	case RTAS_TYPE_DEALLOC:
+		return "Resource Deallocation Event";
+	case RTAS_TYPE_DUMP:
+		return "Dump Notification Event";
+	default:
+		break;
+	}
+
+	return rtas_type[0];
+}
+
+/* To see this info, grep RTAS /var/log/messages and each entry
+ * will be collected together with obvious begin/end.
+ * There will be a unique identifier on the begin and end lines.
+ * This will persist across reboots.
+ *
+ * format of error logs returned from RTAS:
+ * bytes (size) : contents
+ * --------------------------------------------------------
+ * 0-7 (8) : rtas_error_log
+ * 8-47 (40) : extended info
+ * 48-51 (4) : vendor id
+ * 52-1023 (vendor specific) : location code and debug data
+ */
+static void printk_log_rtas(char *buf, int len)
+{
+
+ int i,j,n = 0;
+ int perline = 16;
+ char buffer[64];
+ char * str = "RTAS event";
+
+ /* full_rtas_msgs selects a complete hex dump over the one-line summary */
+ if (full_rtas_msgs) {
+ printk(RTAS_DEBUG "%d -------- %s begin --------\n",
+ error_log_cnt, str);
+
+ /*
+ * Print perline bytes on each line, each line will start
+ * with RTAS and a changing number, so syslogd will
+ * print lines that are otherwise the same. Separate every
+ * 4 bytes with a space.
+ */
+ for (i = 0; i < len; i++) {
+ j = i % perline;
+ /* start of an output line: reset the buffer and write the prefix */
+ if (j == 0) {
+ memset(buffer, 0, sizeof(buffer));
+ n = sprintf(buffer, "RTAS %d:", i/perline);
+ }
+
+ if ((i % 4) == 0)
+ n += sprintf(buffer+n, " ");
+
+ n += sprintf(buffer+n, "%02x", (unsigned char)buf[i]);
+
+ /* line complete: flush it */
+ if (j == (perline-1))
+ printk(KERN_DEBUG "%s\n", buffer);
+ }
+ /* flush a trailing partial line, if any */
+ if ((i % perline) != 0)
+ printk(KERN_DEBUG "%s\n", buffer);
+
+ printk(RTAS_DEBUG "%d -------- %s end ----------\n",
+ error_log_cnt, str);
+ } else {
+ struct rtas_error_log *errlog = (struct rtas_error_log *)buf;
+
+ printk(RTAS_DEBUG "event: %d, Type: %s, Severity: %d\n",
+ error_log_cnt, rtas_event_type(errlog->type),
+ errlog->severity);
+ }
+}
+
+/*
+ * Work out how many bytes of the RTAS error log at @buf are in use:
+ * the 8 byte fixed header plus the extended log length it announces,
+ * capped at rtas_error_log_max.
+ */
+static int log_rtas_len(char * buf)
+{
+	struct rtas_error_log *hdr = (struct rtas_error_log *)buf;
+	int len = 8;	/* rtas fixed header */
+
+	/* extended header, when present */
+	if (hdr->extended_log_length)
+		len += hdr->extended_log_length;
+
+	/* the limit may not have been read from the device tree yet */
+	if (rtas_error_log_max == 0)
+		get_eventscan_parms();
+
+	if (len > rtas_error_log_max)
+		len = rtas_error_log_max;
+
+	return len;
+}
+
+/*
+ * First write to nvram, if fatal error, that is the only
+ * place we log the info. The error will be picked up
+ * on the next reboot by rtasd. If not fatal, run the
+ * method for the type of error. Currently, only RTAS
+ * errors have methods implemented, but in the future
+ * there might be a need to store data in nvram before a
+ * call to panic().
+ *
+ * XXX We write to nvram periodically, to indicate error has
+ * been written and sync'd, but there is a possibility
+ * that if we don't shutdown correctly, a duplicate error
+ * record will be created on next reboot.
+ */
+void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
+{
+ unsigned long offset;
+ unsigned long s;
+ int len = 0;
+
+ DEBUG("logging event\n");
+ if (buf == NULL)
+ return;
+
+ spin_lock_irqsave(&rtasd_log_lock, s);
+
+ /* get length and increase count */
+ switch (err_type & ERR_TYPE_MASK) {
+ case ERR_TYPE_RTAS_LOG:
+ len = log_rtas_len(buf);
+ /* events flagged ERR_FLAG_BOOT do not bump the sequence counter */
+ if (!(err_type & ERR_FLAG_BOOT))
+ error_log_cnt++;
+ break;
+ case ERR_TYPE_KERNEL_PANIC:
+ default:
+ /* only RTAS logs are handled; everything else is dropped */
+ spin_unlock_irqrestore(&rtasd_log_lock, s);
+ return;
+ }
+
+ /* Write error to NVRAM */
+ if (!no_logging && !(err_type & ERR_FLAG_BOOT))
+ nvram_write_error_log(buf, len, err_type);
+
+ /*
+ * rtas errors can occur during boot, and we do want to capture
+ * those somewhere, even if nvram isn't ready (why not?), and even
+ * if rtasd isn't ready. Put them into the boot log, at least.
+ */
+ if ((err_type & ERR_TYPE_MASK) == ERR_TYPE_RTAS_LOG)
+ printk_log_rtas(buf, len);
+
+ /* Check to see if we need to or have stopped logging */
+ if (fatal || no_logging) {
+ /* a fatal event switches all further logging off */
+ no_logging = 1;
+ spin_unlock_irqrestore(&rtasd_log_lock, s);
+ return;
+ }
+
+ /* call type specific method for error */
+ switch (err_type & ERR_TYPE_MASK) {
+ case ERR_TYPE_RTAS_LOG:
+ /* slot after the newest entry in the ring of fixed-size records */
+ offset = rtas_error_log_buffer_max *
+ ((rtas_log_start+rtas_log_size) & LOG_NUMBER_MASK);
+
+ /* First copy over sequence number */
+ memcpy(&rtas_log_buf[offset], (void *) &error_log_cnt, sizeof(int));
+
+ /* Second copy over error log data */
+ offset += sizeof(int);
+ memcpy(&rtas_log_buf[offset], buf, len);
+
+ /* grow the ring until it is full, then overwrite the oldest entry */
+ if (rtas_log_size < LOG_NUMBER)
+ rtas_log_size += 1;
+ else
+ rtas_log_start += 1;
+
+ spin_unlock_irqrestore(&rtasd_log_lock, s);
+ /* wake up readers waiting on rtas_log_wait */
+ wake_up_interruptible(&rtas_log_wait);
+ break;
+ case ERR_TYPE_KERNEL_PANIC:
+ default:
+ spin_unlock_irqrestore(&rtasd_log_lock, s);
+ return;
+ }
+
+}
+
+
+/* open() handler for the RTAS log file: nothing to set up, always succeeds. */
+static int rtas_log_open(struct inode * inode, struct file * file)
+{
+ return 0;
+}
+
+/* release() handler for the RTAS log file: nothing to tear down, always succeeds. */
+static int rtas_log_release(struct inode * inode, struct file * file)
+{
+ return 0;
+}
+
+/*
+ * Hand the oldest queued RTAS event to user space.
+ *
+ * If every event has already been read (and logging has not been
+ * stopped) the copy kept in NVRAM is cleared first, since it is then
+ * known to have been delivered.  After that we sleep until something
+ * else gets logged.
+ *
+ * @count must be at least rtas_error_log_buffer_max; exactly that many
+ * bytes (sequence number followed by the log data) are copied to @buf.
+ *
+ * Returns the number of bytes copied, -EINVAL for a missing or too small
+ * buffer, -EFAULT if @buf is not writable, -ENOMEM if the bounce buffer
+ * cannot be allocated, or the error from an interrupted wait.
+ */
+static ssize_t rtas_log_read(struct file * file, char __user * buf,
+			 size_t count, loff_t *ppos)
+{
+	int error;
+	char *tmp;
+	unsigned long s;
+	unsigned long offset;
+
+	if (!buf || count < rtas_error_log_buffer_max)
+		return -EINVAL;
+
+	count = rtas_error_log_buffer_max;
+
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+
+	tmp = kmalloc(count, GFP_KERNEL);
+	if (!tmp)
+		return -ENOMEM;
+
+	spin_lock_irqsave(&rtasd_log_lock, s);
+	/* if it's 0, then we know we got the last one (the one in NVRAM) */
+	if (rtas_log_size == 0 && !no_logging)
+		nvram_clear_error_log();
+	spin_unlock_irqrestore(&rtasd_log_lock, s);
+
+	for (;;) {
+		error = wait_event_interruptible(rtas_log_wait, rtas_log_size);
+		if (error)
+			goto out;
+
+		spin_lock_irqsave(&rtasd_log_lock, s);
+		/*
+		 * Another reader may have taken the event between our
+		 * wakeup and getting the lock.  Only consume an entry that
+		 * is really there, otherwise rtas_log_size would wrap
+		 * around below zero.
+		 */
+		if (rtas_log_size != 0)
+			break;
+		spin_unlock_irqrestore(&rtasd_log_lock, s);
+	}
+
+	/* lock is held here: copy out the oldest record and retire it */
+	offset = rtas_error_log_buffer_max * (rtas_log_start & LOG_NUMBER_MASK);
+	memcpy(tmp, &rtas_log_buf[offset], count);
+
+	rtas_log_start += 1;
+	rtas_log_size -= 1;
+	spin_unlock_irqrestore(&rtasd_log_lock, s);
+
+	error = copy_to_user(buf, tmp, count) ? -EFAULT : count;
+out:
+	kfree(tmp);
+	return error;
+}
+
+/*
+ * Poll handler for the RTAS error log: registers on rtas_log_wait and
+ * reports the file readable whenever the ring holds at least one record.
+ */
+static unsigned int rtas_log_poll(struct file *file, poll_table * wait)
+{
+	unsigned int mask = 0;
+
+	poll_wait(file, &rtas_log_wait, wait);
+	if (rtas_log_size)
+		mask = POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+/* File operations for the RTAS error-log proc file (blocking read plus poll). */
+struct file_operations proc_rtas_log_operations = {
+ .read = rtas_log_read,
+ .poll = rtas_log_poll,
+ .open = rtas_log_open,
+ .release = rtas_log_release,
+};
+
+/*
+ * Program the surveillance indicator with the given timeout.
+ *
+ * Returns 0 when the indicator was set or when the platform reports the
+ * indicator as unsupported (-EINVAL); returns -1 on any other failure.
+ */
+static int enable_surveillance(int timeout)
+{
+	int rc = rtas_set_indicator(SURVEILLANCE_TOKEN, 0, timeout);
+
+	switch (rc) {
+	case 0:
+		return 0;
+	case -EINVAL:
+		/* Not an error: the platform simply has no surveillance */
+		printk(KERN_INFO "rtasd: surveillance not supported\n");
+		return 0;
+	default:
+		printk(KERN_ERR "rtasd: could not update surveillance\n");
+		return -1;
+	}
+}
+
+/*
+ * Read the event-scan parameters from the /rtas device-tree node.
+ *
+ * Sets rtas_event_scan_rate from the "rtas-event-scan-rate" property and
+ * sizes the log buffers: rtas_error_log_max comes from
+ * rtas_get_error_log_max(), and rtas_error_log_buffer_max adds room for
+ * the int sequence number stored in front of each record.
+ *
+ * Returns 0 on success, -1 if the node or the property is missing or if
+ * the advertised scan rate is not a positive number.
+ */
+static int get_eventscan_parms(void)
+{
+	struct device_node *node;
+	int *ip;
+	int rate;
+
+	node = of_find_node_by_path("/rtas");
+	if (node == NULL) {
+		printk(KERN_ERR "rtasd: no /rtas node\n");
+		return -1;
+	}
+
+	ip = (int *)get_property(node, "rtas-event-scan-rate", NULL);
+	if (ip == NULL) {
+		printk(KERN_ERR "rtasd: no rtas-event-scan-rate\n");
+		of_node_put(node);
+		return -1;
+	}
+	rate = *ip;
+	of_node_put(node);
+
+	/* rtasd() divides 30000 by the rate, so a zero/negative rate is unusable */
+	if (rate <= 0) {
+		printk(KERN_ERR "rtasd: invalid rtas-event-scan-rate %d\n", rate);
+		return -1;
+	}
+	rtas_event_scan_rate = rate;
+	DEBUG("rtas-event-scan-rate %d\n", rtas_event_scan_rate);
+
+	/* Make room for the sequence number */
+	rtas_error_log_max = rtas_get_error_log_max();
+	rtas_error_log_buffer_max = rtas_error_log_max + sizeof(int);
+
+	return 0;
+}
+
+/*
+ * Drain pending RTAS events on the current CPU.
+ *
+ * Calls the event-scan RTAS service (token passed in event_scan) repeatedly,
+ * logging each event returned with status 0, and stops at the first
+ * non-zero status.  A status of -1 is additionally reported as a failure.
+ */
+static void do_event_scan(int event_scan)
+{
+	int rc;
+
+	for (;;) {
+		memset(logdata, 0, rtas_error_log_max);
+		rc = rtas_call(event_scan, 4, 1, NULL,
+			       RTAS_EVENT_SCAN_ALL_EVENTS, 0,
+			       __pa(logdata), rtas_error_log_max);
+		if (rc == -1) {
+			printk(KERN_ERR "event-scan failed\n");
+			return;
+		}
+		if (rc != 0)
+			return;
+
+		pSeries_log_error(logdata, ERR_TYPE_RTAS_LOG, 0);
+	}
+}
+
+/*
+ * Run one event-scan pass over the CPUs in cpu_online_map.
+ *
+ * For each CPU the current task is bound to that CPU, do_event_scan() is
+ * run, and the affinity is widened back to CPU_MASK_ALL.  The task then
+ * sleeps for 'delay' milliseconds before moving on to the next online CPU.
+ * The CPU hotplug lock is held while choosing and scanning a CPU and is
+ * dropped for the duration of the sleep.
+ */
+static void do_event_scan_all_cpus(long delay)
+{
+ int cpu;
+
+ lock_cpu_hotplug();
+ cpu = first_cpu(cpu_online_map);
+ for (;;) {
+ set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ do_event_scan(rtas_token("event-scan"));
+ set_cpus_allowed(current, CPU_MASK_ALL);
+
+ /* Drop hotplug lock, and sleep for the specified delay */
+ unlock_cpu_hotplug();
+ msleep_interruptible(delay);
+ lock_cpu_hotplug();
+
+ /* Re-read the online map only after the lock has been re-taken */
+ cpu = next_cpu(cpu, cpu_online_map);
+ if (cpu == NR_CPUS)
+ break;
+ }
+ unlock_cpu_hotplug();
+}
+
+/*
+ * Body of the "rtasd" kernel thread.
+ *
+ * Reads the event-scan parameters, allocates the log ring
+ * (LOG_NUMBER slots of rtas_error_log_buffer_max bytes), replays any error
+ * left in NVRAM that is not flagged as already logged, performs a first
+ * scan of all CPUs with a one-second delay, optionally enables
+ * surveillance, and then scans all CPUs forever.
+ *
+ * Only returns (-EINVAL) if event-scan is unavailable, the parameters
+ * cannot be read, or the ring cannot be allocated.
+ */
+static int rtasd(void *unused)
+{
+ unsigned int err_type;
+ int event_scan = rtas_token("event-scan");
+ int rc;
+
+ daemonize("rtasd");
+
+ if (event_scan == RTAS_UNKNOWN_SERVICE || get_eventscan_parms() == -1)
+ goto error;
+
+ rtas_log_buf = vmalloc(rtas_error_log_buffer_max*LOG_NUMBER);
+ if (!rtas_log_buf) {
+ printk(KERN_ERR "rtasd: no memory\n");
+ goto error;
+ }
+
+ printk(KERN_INFO "RTAS daemon started\n");
+
+ DEBUG("will sleep for %d milliseconds\n", (30000/rtas_event_scan_rate));
+
+ /* See if we have any error stored in NVRAM */
+ memset(logdata, 0, rtas_error_log_max);
+
+ rc = nvram_read_error_log(logdata, rtas_error_log_max, &err_type);
+
+ /* We can use rtas_log_buf now */
+ no_logging = 0;
+
+ /* rc == 0 means the NVRAM read succeeded; replay it unless already logged */
+ if (!rc) {
+ if (err_type != ERR_FLAG_ALREADY_LOGGED) {
+ pSeries_log_error(logdata, err_type | ERR_FLAG_BOOT, 0);
+ }
+ }
+
+ /* First pass. */
+ do_event_scan_all_cpus(1000);
+
+ /* -1 is treated as "surveillance not requested" */
+ if (surveillance_timeout != -1) {
+ DEBUG("enabling surveillance\n");
+ enable_surveillance(surveillance_timeout);
+ DEBUG("surveillance enabled\n");
+ }
+
+ /* Delay should be at least one second since some
+ * machines have problems if we call event-scan too
+ * quickly. */
+ for (;;)
+ do_event_scan_all_cpus(30000/rtas_event_scan_rate);
+
+error:
+ /* Should delete proc entries */
+ return -EINVAL;
+}
+
+/*
+ * Initcall: on pSeries platforms that provide the event-scan RTAS service,
+ * create the ppc64/rtas/error_log proc file and start the rtasd thread.
+ *
+ * Returns 0 on non-pSeries platforms and after attempting setup (failures
+ * to create the proc entry or the thread are only logged); returns 1 when
+ * event-scan is not available.
+ * NOTE(review): 1 is not a negative errno; confirm whether -ENODEV was
+ * intended for the "no event-scan" case.
+ */
+static int __init rtas_init(void)
+{
+ struct proc_dir_entry *entry;
+
+ if (!platform_is_pseries())
+ return 0;
+
+ /* No RTAS */
+ if (rtas_token("event-scan") == RTAS_UNKNOWN_SERVICE) {
+ printk(KERN_INFO "rtasd: no event-scan on system\n");
+ return 1;
+ }
+
+ entry = create_proc_entry("ppc64/rtas/error_log", S_IRUSR, NULL);
+ if (entry)
+ entry->proc_fops = &proc_rtas_log_operations;
+ else
+ printk(KERN_ERR "Failed to create error_log proc entry\n");
+
+ if (kernel_thread(rtasd, NULL, CLONE_FS) < 0)
+ printk(KERN_ERR "Failed to start RTAS daemon\n");
+
+ return 0;
+}
+
+/*
+ * Parse the "surveillance=" boot option.  An integer in the range 0..255
+ * becomes the surveillance timeout; anything else is ignored.
+ * Always returns 1.
+ */
+static int __init surveillance_setup(char *str)
+{
+	int timeout;
+
+	if (get_option(&str, &timeout) && timeout >= 0 && timeout <= 255)
+		surveillance_timeout = timeout;
+
+	return 1;
+}
+
+/*
+ * Parse the "rtasmsgs=" boot option: "on" sets full_rtas_msgs, "off"
+ * clears it, any other value leaves it untouched.  Always returns 1.
+ */
+static int __init rtasmsgs_setup(char *str)
+{
+	if (!strcmp(str, "on")) {
+		full_rtas_msgs = 1;
+		return 1;
+	}
+
+	if (!strcmp(str, "off"))
+		full_rtas_msgs = 0;
+
+	return 1;
+}
+/* Register the init routine and the "surveillance=" / "rtasmsgs=" boot options. */
+__initcall(rtas_init);
+__setup("surveillance=", surveillance_setup);
+__setup("rtasmsgs=", rtasmsgs_setup);
diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c
new file mode 100644
index 000000000000..2edc947f7c44
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/scanlog.c
@@ -0,0 +1,235 @@
+/*
+ * c 2001 PPC 64 Team, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * scan-log-data driver for PPC64 Todd Inglett <tinglett@vnet.ibm.com>
+ *
+ * When ppc64 hardware fails the service processor dumps internal state
+ * of the system. After a reboot the operating system can access a dump
+ * of this data using this driver. A dump exists if the device-tree
+ * /chosen/ibm,scan-log-data property exists.
+ *
+ * This driver exports /proc/ppc64/scan-log-dump which can be read.
+ * The driver supports only sequential reads.
+ *
+ * The driver looks at a write to the driver for the single word "reset".
+ * If given, the driver will reset the scanlog so the platform can free it.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <asm/uaccess.h>
+#include <asm/rtas.h>
+#include <asm/prom.h>
+
+#define MODULE_VERS "1.0"
+#define MODULE_NAME "scanlog"
+
+/* Status returns from ibm,scan-log-dump */
+#define SCANLOG_COMPLETE 0
+#define SCANLOG_HWERROR -1
+#define SCANLOG_CONTINUE 1
+
+#define DEBUG(A...) do { if (scanlog_debug) printk(KERN_ERR "scanlog: " A); } while (0)
+
+static int scanlog_debug;
+static unsigned int ibm_scan_log_dump; /* RTAS token */
+static struct proc_dir_entry *proc_ppc64_scan_log_dump; /* The proc file */
+
+/*
+ * Read the next piece of the scan-log dump.
+ *
+ * The request is clamped to RTAS_DATA_BUF_SIZE and must be at least 1024
+ * bytes.  The per-file work area (dp->data) is copied into rtas_data_buf,
+ * ibm,scan-log-dump is called, and the work area is copied back, all
+ * under rtas_data_buf_lock.  On SCANLOG_CONTINUE, word 1 of the work area
+ * is the length and word 2 the offset of the returned data; if no data is
+ * ready yet the call sleeps and retries.
+ *
+ * Returns the number of bytes copied, 0 at end of dump, -EINVAL for a
+ * too-small request, -EFAULT for a bad user buffer, or -EIO when there is
+ * no work area, on hardware error, on an unknown status, or when firmware
+ * reports a length/offset outside the buffer.
+ */
+static ssize_t scanlog_read(struct file *file, char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	struct inode * inode = file->f_dentry->d_inode;
+	struct proc_dir_entry *dp;
+	unsigned int *data;
+	int status;
+	unsigned long len, off;
+	unsigned int wait_time;
+
+	dp = PDE(inode);
+	data = (unsigned int *)dp->data;
+
+	if (!data) {
+		printk(KERN_ERR "scanlog: read failed no data\n");
+		return -EIO;
+	}
+
+	if (count > RTAS_DATA_BUF_SIZE)
+		count = RTAS_DATA_BUF_SIZE;
+
+	if (count < 1024) {
+		/* This is the min supported by this RTAS call.  Rather
+		 * than do all the buffering we insist the user code handle
+		 * larger reads.  As long as cp works... :)
+		 */
+		printk(KERN_ERR "scanlog: cannot perform a small read (%zu)\n", count);
+		return -EINVAL;
+	}
+
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+
+	for (;;) {
+		wait_time = 500;	/* default wait if no data */
+		spin_lock(&rtas_data_buf_lock);
+		memcpy(rtas_data_buf, data, RTAS_DATA_BUF_SIZE);
+		status = rtas_call(ibm_scan_log_dump, 2, 1, NULL,
+				   (u32) __pa(rtas_data_buf), (u32) count);
+		memcpy(data, rtas_data_buf, RTAS_DATA_BUF_SIZE);
+		spin_unlock(&rtas_data_buf_lock);
+
+		DEBUG("status=%d, data[0]=%x, data[1]=%x, data[2]=%x\n",
+		      status, data[0], data[1], data[2]);
+		switch (status) {
+		case SCANLOG_COMPLETE:
+			DEBUG("hit eof\n");
+			return 0;
+		case SCANLOG_HWERROR:
+			DEBUG("hardware error reading scan log data\n");
+			return -EIO;
+		case SCANLOG_CONTINUE:
+			/* We may or may not have data yet */
+			len = data[1];
+			off = data[2];
+			if (len > 0) {
+				/*
+				 * len/off come from firmware: never copy
+				 * from outside the work area or past the
+				 * size the caller asked for.
+				 */
+				if (off > RTAS_DATA_BUF_SIZE ||
+				    len > RTAS_DATA_BUF_SIZE - off ||
+				    len > count) {
+					printk(KERN_ERR "scanlog: bad length/offset from rtas\n");
+					return -EIO;
+				}
+				if (copy_to_user(buf, ((char *)data)+off, len))
+					return -EFAULT;
+				return len;
+			}
+			/* Break to sleep default time */
+			break;
+		default:
+			/* Extended-busy statuses carry a suggested delay */
+			if (status > 9900 && status <= 9905) {
+				wait_time = rtas_extended_busy_delay_time(status);
+			} else {
+				printk(KERN_ERR "scanlog: unknown error from rtas: %d\n", status);
+				return -EIO;
+			}
+		}
+		/* Apparently no data yet.  Wait and try again. */
+		msleep_interruptible(wait_time);
+	}
+	/*NOTREACHED*/
+}
+
+/*
+ * Command interface of the scan-log file.  At most 19 bytes of the write
+ * are examined; recognised prefixes are "reset" (issue ibm,scan-log-dump
+ * with zero arguments), "debugon" and "debugoff" (toggle scanlog_debug).
+ * Unrecognised input is accepted and ignored.
+ *
+ * Returns the (possibly truncated) byte count, or -EFAULT if the user
+ * buffer cannot be read.
+ */
+static ssize_t scanlog_write(struct file * file, const char __user * buf,
+			     size_t count, loff_t *ppos)
+{
+	char cmd[20];
+	int rc;
+
+	if (count > 19)
+		count = 19;
+	if (copy_from_user(cmd, buf, count))
+		return -EFAULT;
+	cmd[count] = 0;
+
+	if (!buf)
+		return count;
+
+	if (strncmp(cmd, "reset", 5) == 0) {
+		DEBUG("reset scanlog\n");
+		rc = rtas_call(ibm_scan_log_dump, 2, 1, NULL, 0, 0);
+		DEBUG("rtas returns %d\n", rc);
+		return count;
+	}
+
+	if (strncmp(cmd, "debugon", 7) == 0) {
+		printk(KERN_ERR "scanlog: debug on\n");
+		scanlog_debug = 1;
+		return count;
+	}
+
+	if (strncmp(cmd, "debugoff", 8) == 0) {
+		printk(KERN_ERR "scanlog: debug off\n");
+		scanlog_debug = 0;
+	}
+
+	return count;
+}
+
+/*
+ * Open the scan-log file.  Fails with -EIO if the proc entry has no work
+ * area and with -EBUSY if word 0 of the work area is non-zero, which is
+ * taken to mean a dump is already in progress.
+ */
+static int scanlog_open(struct inode * inode, struct file * file)
+{
+ struct proc_dir_entry *dp = PDE(inode);
+ unsigned int *data = (unsigned int *)dp->data;
+
+ if (!data) {
+ printk(KERN_ERR "scanlog: open failed no data\n");
+ return -EIO;
+ }
+
+ if (data[0] != 0) {
+ /* This imperfect test stops a second copy of the
+ * data (or a reset while data is being copied)
+ */
+ return -EBUSY;
+ }
+
+ /* data[0] is already 0 here because of the test above */
+ data[0] = 0; /* re-init so we restart the scan */
+
+ return 0;
+}
+
+/*
+ * Release the scan-log file: clear word 0 of the work area so that a
+ * later open is not refused as busy.  Returns -EIO if there is no work
+ * area.
+ */
+static int scanlog_release(struct inode * inode, struct file * file)
+{
+	unsigned int *work = (unsigned int *)PDE(inode)->data;
+
+	if (work) {
+		work[0] = 0;
+		return 0;
+	}
+
+	printk(KERN_ERR "scanlog: release failed no data\n");
+	return -EIO;
+}
+
+/* File operations for the scan-log-dump proc file. */
+struct file_operations scanlog_fops = {
+ .owner = THIS_MODULE,
+ .read = scanlog_read,
+ .write = scanlog_write,
+ .open = scanlog_open,
+ .release = scanlog_release,
+};
+
+/*
+ * Module init: look up the ibm,scan-log-dump RTAS token, allocate the
+ * RTAS_DATA_BUF_SIZE work area and publish ppc64/rtas/scan-log-dump.
+ *
+ * The work area is allocated and initialised before the proc entry is
+ * created, so the file is never visible without its buffer.
+ *
+ * Returns 0 on success, -EIO if the RTAS call is not implemented or the
+ * proc entry cannot be created, -ENOMEM if the buffer cannot be allocated.
+ */
+int __init scanlog_init(void)
+{
+	struct proc_dir_entry *ent;
+	void *data;
+
+	ibm_scan_log_dump = rtas_token("ibm,scan-log-dump");
+	if (ibm_scan_log_dump == RTAS_UNKNOWN_SERVICE) {
+		printk(KERN_ERR "scan-log-dump not implemented on this system\n");
+		return -EIO;
+	}
+
+	/* Ideally we could allocate a buffer < 4G */
+	data = kmalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+	if (!data) {
+		printk(KERN_ERR "Failed to allocate a buffer\n");
+		return -ENOMEM;
+	}
+	/* Word 0 == 0 marks the work area as idle (see scanlog_open) */
+	((unsigned int *)data)[0] = 0;
+
+	ent = create_proc_entry("ppc64/rtas/scan-log-dump", S_IRUSR, NULL);
+	if (!ent) {
+		printk(KERN_ERR "Failed to create ppc64/rtas/scan-log-dump proc entry\n");
+		kfree(data);
+		return -EIO;
+	}
+	ent->data = data;
+	ent->proc_fops = &scanlog_fops;
+	proc_ppc64_scan_log_dump = ent;
+
+	return 0;
+}
+
+/*
+ * Module exit: remove the proc entry first, then free its work area, so
+ * the entry never points at freed memory while it is still reachable.
+ */
+void __exit scanlog_cleanup(void)
+{
+	struct proc_dir_entry *ent = proc_ppc64_scan_log_dump;
+	void *data;
+
+	if (!ent)
+		return;
+
+	/* Save the buffer pointer; ent must not be touched after removal */
+	data = ent->data;
+	proc_ppc64_scan_log_dump = NULL;
+	remove_proc_entry("scan-log-dump", ent->parent);
+	kfree(data);
+}
+
+module_init(scanlog_init);
+module_exit(scanlog_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
new file mode 100644
index 000000000000..4a465f067ede
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -0,0 +1,642 @@
+/*
+ * 64-bit pSeries and RS/6000 setup code.
+ *
+ * Copyright (C) 1995 Linus Torvalds
+ * Adapted from 'alpha' version by Gary Thomas
+ * Modified by Cort Dougan (cort@cs.nmt.edu)
+ * Modified by PPC64 Team, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * bootup setup stuff..
+ */
+
+#undef DEBUG
+
+#include <linux/config.h>
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/a.out.h>
+#include <linux/tty.h>
+#include <linux/major.h>
+#include <linux/interrupt.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/console.h>
+#include <linux/pci.h>
+#include <linux/utsname.h>
+#include <linux/adb.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/seq_file.h>
+#include <linux/root_dev.h>
+
+#include <asm/mmu.h>
+#include <asm/processor.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/pci-bridge.h>
+#include <asm/iommu.h>
+#include <asm/dma.h>
+#include <asm/machdep.h>
+#include <asm/irq.h>
+#include <asm/time.h>
+#include <asm/nvram.h>
+#include "xics.h"
+#include <asm/firmware.h>
+#include <asm/pmc.h>
+#include <asm/mpic.h>
+#include <asm/ppc-pci.h>
+#include <asm/i8259.h>
+#include <asm/udbg.h>
+#include <asm/smp.h>
+
+#include "plpar_wrappers.h"
+
+#ifdef DEBUG
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+extern void find_udbg_vterm(void);
+extern void system_reset_fwnmi(void); /* from head.S */
+extern void machine_check_fwnmi(void); /* from head.S */
+extern void generic_find_legacy_serial_ports(u64 *physport,
+ unsigned int *default_speed);
+
+int fwnmi_active; /* TRUE if an FWNMI handler is present */
+
+extern void pSeries_system_reset_exception(struct pt_regs *regs);
+extern int pSeries_machine_check_exception(struct pt_regs *regs);
+
+static void pseries_shared_idle(void);
+static void pseries_dedicated_idle(void);
+
+struct mpic *pSeries_mpic;
+
+/*
+ * Emit the "machine" line for /proc/cpuinfo-style output, using the
+ * "model" property of the device-tree root node.
+ * NOTE(review): if the root exists but has no "model" property, model
+ * becomes NULL before it is passed to seq_printf; confirm that is intended.
+ */
+void pSeries_show_cpuinfo(struct seq_file *m)
+{
+ struct device_node *root;
+ const char *model = "";
+
+ root = of_find_node_by_path("/");
+ if (root)
+ model = get_property(root, "model", NULL);
+ seq_printf(m, "machine\t\t: CHRP %s\n", model);
+ of_node_put(root);
+}
+
+/*
+ * Register the firmware-assisted NMI handlers, if the firmware offers
+ * the ibm,nmi-register service.  On success fwnmi_active is set so the
+ * rest of the kernel knows the FWNMI entry points are live.
+ */
+static void __init fwnmi_init(void)
+{
+	int token = rtas_token("ibm,nmi-register");
+
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return;
+
+	/* Hand firmware the real addresses of both handlers */
+	if (rtas_call(token, 2, 1, NULL,
+		      __pa((unsigned long)system_reset_fwnmi),
+		      __pa((unsigned long)machine_check_fwnmi)) == 0)
+		fwnmi_active = 1;
+}
+
+/*
+ * init_IRQ hook for OpenPIC systems: finish MPIC initialisation, set up
+ * the legacy i8259 and cascade it into the MPIC.  The 8259 interrupt
+ * acknowledge address is taken from the last address cell of the
+ * "8259-interrupt-acknowledge" property of the first node named "pci";
+ * if that cannot be found, 0 is passed to i8259_init().
+ */
+static void __init pSeries_init_mpic(void)
+{
+ unsigned int *addrp;
+ struct device_node *np;
+ unsigned long intack = 0;
+
+ /* All ISUs are setup, complete initialization */
+ mpic_init(pSeries_mpic);
+
+ /* Check what kind of cascade ACK we have */
+ if (!(np = of_find_node_by_name(NULL, "pci"))
+ || !(addrp = (unsigned int *)
+ get_property(np, "8259-interrupt-acknowledge", NULL)))
+ printk(KERN_ERR "Cannot find pci to get ack address\n");
+ else
+ intack = addrp[prom_n_addr_cells(np)-1];
+ of_node_put(np);
+
+ /* Setup the legacy interrupts & controller */
+ i8259_init(intack, 0);
+
+ /* Hook cascade to mpic */
+ mpic_setup_cascade(NUM_ISA_INTERRUPTS, i8259_irq_cascade, NULL);
+}
+
+/*
+ * Allocate the primary MPIC.  The controller address is assembled from
+ * the cells of the root node's "platform-open-pic" property; the kernel
+ * BUGs if no address is found.  The last four IRQ numbers are reserved
+ * for IPIs.
+ */
+static void __init pSeries_setup_mpic(void)
+{
+ unsigned int *opprop;
+ unsigned long openpic_addr = 0;
+ unsigned char senses[NR_IRQS - NUM_ISA_INTERRUPTS];
+ struct device_node *root;
+ int irq_count;
+
+ /* Find the Open PIC if present */
+ root = of_find_node_by_path("/");
+ opprop = (unsigned int *) get_property(root, "platform-open-pic", NULL);
+ if (opprop != 0) {
+ int n = prom_n_addr_cells(root);
+
+ /* Fold n 32-bit cells, most significant first, into one address */
+ for (openpic_addr = 0; n > 0; --n)
+ openpic_addr = (openpic_addr << 32) + *opprop++;
+ printk(KERN_DEBUG "OpenPIC addr: %lx\n", openpic_addr);
+ }
+ of_node_put(root);
+
+ BUG_ON(openpic_addr == 0);
+
+ /* Get the sense values from OF */
+ prom_get_irq_senses(senses, NUM_ISA_INTERRUPTS, NR_IRQS);
+
+ /* Setup the openpic driver */
+ irq_count = NR_IRQS - NUM_ISA_INTERRUPTS - 4; /* leave room for IPIs */
+ pSeries_mpic = mpic_alloc(openpic_addr, MPIC_PRIMARY,
+ 16, 16, irq_count, /* isu size, irq offset, irq count */
+ NR_IRQS - 4, /* ipi offset */
+ senses, irq_count, /* sense & sense size */
+ " MPIC ");
+}
+
+/*
+ * enable_pmcs hook used on LPAR: enable the PMCs as on POWER4, issue the
+ * H_PERFMON hcall with the top bit of the 'set' mask, and on
+ * shared-processor partitions flag the PMC registers as in use in this
+ * CPU's lppaca.
+ */
+static void pseries_lpar_enable_pmcs(void)
+{
+ unsigned long set, reset;
+
+ power4_enable_pmcs();
+
+ set = 1UL << 63;
+ reset = 0;
+ plpar_hcall_norets(H_PERFMON, set, reset);
+
+ /* instruct hypervisor to maintain PMCs */
+ if (firmware_has_feature(FW_FEATURE_SPLPAR))
+ get_paca()->lppaca.pmcregs_in_use = 1;
+}
+
+/*
+ * setup_arch hook: select the interrupt-controller callbacks (MPIC or
+ * XICS), initialise SMP, FWNMI, PCI host bridges, EEH and NVRAM, then
+ * choose the idle loop and the PMC-enable routine.
+ */
+static void __init pSeries_setup_arch(void)
+{
+ /* Fixup ppc_md depending on the type of interrupt controller */
+ if (ppc64_interrupt_controller == IC_OPEN_PIC) {
+ ppc_md.init_IRQ = pSeries_init_mpic;
+ ppc_md.get_irq = mpic_get_irq;
+ /* Allocate the mpic now, so that find_and_init_phbs() can
+ * fill the ISUs */
+ pSeries_setup_mpic();
+ } else {
+ ppc_md.init_IRQ = xics_init_IRQ;
+ ppc_md.get_irq = xics_get_irq;
+ }
+
+#ifdef CONFIG_SMP
+ smp_init_pSeries();
+#endif
+ /* openpic global configuration register (64-bit format). */
+ /* openpic Interrupt Source Unit pointer (64-bit format). */
+ /* python0 facility area (mmio) (64-bit format) REAL address. */
+
+ /* init to some ~sane value until calibrate_delay() runs */
+ loops_per_jiffy = 50000000;
+
+ if (ROOT_DEV == 0) {
+ printk("No ramdisk, default root is /dev/sda2\n");
+ ROOT_DEV = Root_SDA2;
+ }
+
+ fwnmi_init();
+
+ /* Find and initialize PCI host bridges */
+ init_pci_config_tokens();
+ find_and_init_phbs();
+ eeh_init();
+
+ pSeries_nvram_init();
+
+ /* Choose an idle loop */
+ if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
+ /* Register the boot CPU's VPA before reading lppaca.shared_proc */
+ vpa_init(boot_cpuid);
+ if (get_paca()->lppaca.shared_proc) {
+ printk(KERN_INFO "Using shared processor idle loop\n");
+ ppc_md.idle_loop = pseries_shared_idle;
+ } else {
+ printk(KERN_INFO "Using dedicated idle loop\n");
+ ppc_md.idle_loop = pseries_dedicated_idle;
+ }
+ } else {
+ printk(KERN_INFO "Using default idle loop\n");
+ ppc_md.idle_loop = default_idle;
+ }
+
+ if (platform_is_lpar())
+ ppc_md.enable_pmcs = pseries_lpar_enable_pmcs;
+ else
+ ppc_md.enable_pmcs = power4_enable_pmcs;
+}
+
+/*
+ * arch_initcall: write "Linux ppc64" and the kernel version string to
+ * the operator panel through ppc_md.progress.
+ * NOTE(review): ppc_md.progress is called without a NULL check; confirm
+ * this initcall can only run where the hook is set.
+ */
+static int __init pSeries_init_panel(void)
+{
+ /* Manually leave the kernel version on the panel. */
+ ppc_md.progress("Linux ppc64\n", 0);
+ ppc_md.progress(system_utsname.version, 0);
+
+ return 0;
+}
+arch_initcall(pSeries_init_panel);
+
+
+/* Build up the ppc64_firmware_features bitmask field
+ * using contents of device-tree/ibm,hypertas-functions.
+ * Ultimately this functionality may be moved into prom.c prom_init().
+ *
+ * The property is walked as a sequence of NUL-terminated strings; each
+ * string that matches a name in firmware_features_table ORs that entry's
+ * value into the mask.  If /rtas is missing the mask stays 0.
+ */
+static void __init fw_feature_init(void)
+{
+ struct device_node * dn;
+ char * hypertas;
+ unsigned int len;
+
+ DBG(" -> fw_feature_init()\n");
+
+ ppc64_firmware_features = 0;
+ dn = of_find_node_by_path("/rtas");
+ if (dn == NULL) {
+ printk(KERN_ERR "WARNING ! Cannot find RTAS in device-tree !\n");
+ goto no_rtas;
+ }
+
+ hypertas = get_property(dn, "ibm,hypertas-functions", &len);
+ if (hypertas) {
+ /* NOTE(review): len is unsigned; a property whose last string lacks
+ * its NUL would make the subtraction below wrap; confirm firmware
+ * always terminates the list. */
+ while (len > 0){
+ int i, hypertas_len;
+ /* check value against table of strings */
+ for(i=0; i < FIRMWARE_MAX_FEATURES ;i++) {
+ if ((firmware_features_table[i].name) &&
+ (strcmp(firmware_features_table[i].name,hypertas))==0) {
+ /* we have a match */
+ ppc64_firmware_features |=
+ (firmware_features_table[i].val);
+ break;
+ }
+ }
+ /* Step over this string and its terminating NUL */
+ hypertas_len = strlen(hypertas);
+ len -= hypertas_len +1;
+ hypertas+= hypertas_len +1;
+ }
+ }
+
+ of_node_put(dn);
+no_rtas:
+
+ DBG(" <- fw_feature_init()\n");
+}
+
+
+/*
+ * Determine the interrupt controller type from the "compatible" property
+ * of the first node named "interrupt-controller" and record it in
+ * ppc64_interrupt_controller (left as IC_INVALID when no such node exists
+ * or the type is not recognised).  Also sets __irq_offset_value.
+ */
+static void __init pSeries_discover_pic(void)
+{
+	struct device_node *np;
+	char *typep;
+
+	/*
+	 * Setup interrupt mapping options that are needed for finish_device_tree
+	 * to properly parse the OF interrupt tree & do the virtual irq mapping
+	 */
+	__irq_offset_value = NUM_ISA_INTERRUPTS;
+	ppc64_interrupt_controller = IC_INVALID;
+	for (np = NULL; (np = of_find_node_by_name(np, "interrupt-controller"));) {
+		typep = (char *)get_property(np, "compatible", NULL);
+		/* A node without "compatible" is treated as unrecognised */
+		if (typep && strstr(typep, "open-pic"))
+			ppc64_interrupt_controller = IC_OPEN_PIC;
+		else if (typep && strstr(typep, "ppc-xicp"))
+			ppc64_interrupt_controller = IC_PPC_XIC;
+		else
+			printk("pSeries_discover_pic: failed to recognize"
+			       " interrupt-controller\n");
+		/* Only the first node is examined; drop the reference we hold */
+		of_node_put(np);
+		break;
+	}
+}
+
+/*
+ * cpu_die hook: with interrupts disabled, leave the idle task context,
+ * clear the CPPR and ask RTAS to stop this CPU.  Not expected to return.
+ */
+static void pSeries_mach_cpu_die(void)
+{
+ local_irq_disable();
+ idle_task_exit();
+ /* Some hardware requires clearing the CPPR, while other hardware does not
+ * it is safe either way
+ */
+ pSeriesLP_cppr_info(0, 0);
+ rtas_stop_self();
+ /* Should never get here... */
+ BUG();
+ for(;;);
+}
+
+/* set_dabr hook: set the DABR through the H_SET_DABR hcall and return its status. */
+static int pseries_set_dabr(unsigned long dabr)
+{
+ return plpar_hcall_norets(H_SET_DABR, dabr);
+}
+
+/*
+ * set_dabr hook for firmware with the extended call: set the DABR through
+ * H_SET_XDABR, matching both kernel and user accesses, and return the
+ * hcall status.
+ */
+static int pseries_set_xdabr(unsigned long dabr)
+{
+ /* We want to catch accesses from kernel and userspace */
+ return plpar_hcall_norets(H_SET_XDABR, dabr,
+ H_DABRX_KERNEL | H_DABRX_USER);
+}
+
+/*
+ * Early initialization. Relocation is on but do not reference unbolted pages
+ *
+ * Discovers firmware features, installs the hash-table backend (LPAR or
+ * native), brings up an early debug console (virtual terminal on LPAR,
+ * legacy UART otherwise), selects the DABR hook, and initialises the
+ * IOMMU and interrupt-controller discovery.
+ */
+static void __init pSeries_init_early(void)
+{
+	void *comport;
+	unsigned int default_speed;
+	u64 physport;
+
+	DBG(" -> pSeries_init_early()\n");
+
+	fw_feature_init();
+
+	if (platform_is_lpar())
+		hpte_init_lpar();
+	else
+		hpte_init_native();
+
+	generic_find_legacy_serial_ports(&physport, &default_speed);
+
+	if (platform_is_lpar())
+		find_udbg_vterm();
+	else if (physport) {
+		/* Map the uart for udbg. */
+		comport = (void *)ioremap(physport, 16);
+		udbg_init_uart(comport, default_speed);
+
+		DBG("Hello World !\n");
+	}
+
+	/* Prefer the plain DABR hcall; fall back to the extended one */
+	if (firmware_has_feature(FW_FEATURE_DABR))
+		ppc_md.set_dabr = pseries_set_dabr;
+	else if (firmware_has_feature(FW_FEATURE_XDABR))
+		ppc_md.set_dabr = pseries_set_xdabr;
+
+	iommu_init_early_pSeries();
+
+	pSeries_discover_pic();
+
+	DBG(" <- pSeries_init_early()\n");
+}
+
+
+/*
+ * Report whether a legacy ISA device may exist at baseport.  For the
+ * i8042 data port and the floppy controller base, a device-tree node of
+ * the matching type must exist, otherwise -ENODEV is returned.  Every
+ * other port is reported as present (0).
+ */
+static int pSeries_check_legacy_ioport(unsigned int baseport)
+{
+	struct device_node *np;
+	const char *type;
+
+#define I8042_DATA_REG	0x60
+#define FDC_BASE	0x3f0
+
+	if (baseport == I8042_DATA_REG)
+		type = "8042";
+	else if (baseport == FDC_BASE)
+		type = "fdc";
+	else
+		return 0;
+
+	np = of_find_node_by_type(NULL, type);
+	if (np == NULL)
+		return -ENODEV;
+
+	of_node_put(np);
+	return 0;
+}
+
+/*
+ * Called very early, MMU is off, device-tree isn't unflattened
+ */
+extern struct machdep_calls pSeries_md;
+
+/*
+ * Platform probe: returns 1 when the platform number is pSeries or
+ * pSeries LPAR, 0 otherwise.
+ */
+static int __init pSeries_probe(int platform)
+{
+	/* if we have some ppc_md fixups for LPAR to do, do
+	 * it here ...
+	 */
+
+	return platform == PLATFORM_PSERIES ||
+	       platform == PLATFORM_PSERIES_LPAR;
+}
+
+DECLARE_PER_CPU(unsigned long, smt_snooze_delay);
+
+/*
+ * Idle step for dedicated-processor partitions once the snooze period is
+ * over.  If the partner thread is busy, cede this thread to the
+ * hypervisor; if the partner is idle too, only poll for pending work.
+ * The partner's paca is found at index cpu ^ 1; presumably the two
+ * hardware threads of a core are numbered as an adjacent pair (confirm).
+ */
+static inline void dedicated_idle_sleep(unsigned int cpu)
+{
+ struct paca_struct *ppaca = &paca[cpu ^ 1];
+
+ /* Only sleep if the other thread is not idle */
+ if (!(ppaca->lppaca.idle)) {
+ local_irq_disable();
+
+ /*
+ * We are about to sleep the thread and so wont be polling any
+ * more.
+ */
+ clear_thread_flag(TIF_POLLING_NRFLAG);
+ smp_mb__after_clear_bit();
+
+ /*
+ * SMT dynamic mode. Cede will result in this thread going
+ * dormant, if the partner thread is still doing work. Thread
+ * wakes up if partner goes idle, an interrupt is presented, or
+ * a prod occurs. Returning from the cede enables external
+ * interrupts.
+ */
+ /* need_resched() is re-checked with interrupts off, after the flag is cleared */
+ if (!need_resched())
+ cede_processor();
+ else
+ local_irq_enable();
+ set_thread_flag(TIF_POLLING_NRFLAG);
+ } else {
+ /*
+ * Give the HV an opportunity at the processor, since we are
+ * not doing any work.
+ */
+ poll_pending();
+ }
+}
+
+/*
+ * Idle loop for dedicated-processor partitions.  While there is nothing
+ * to run, the thread spins at low priority; once the per-cpu
+ * smt_snooze_delay (scaled by tb_ticks_per_usec) has elapsed and the
+ * delay is non-zero, it calls dedicated_idle_sleep() on each iteration.
+ * After scheduling, an offlined CPU is taken down via cpu_die() when the
+ * system is running.
+ */
+static void pseries_dedicated_idle(void)
+{
+ struct paca_struct *lpaca = get_paca();
+ unsigned int cpu = smp_processor_id();
+ unsigned long start_snooze;
+ unsigned long *smt_snooze_delay = &__get_cpu_var(smt_snooze_delay);
+ set_thread_flag(TIF_POLLING_NRFLAG);
+
+ while (1) {
+ /*
+ * Indicate to the HV that we are idle. Now would be
+ * a good time to find other work to dispatch.
+ */
+ lpaca->lppaca.idle = 1;
+
+ if (!need_resched()) {
+ /* Timebase value after which we stop merely snoozing */
+ start_snooze = get_tb() +
+ *smt_snooze_delay * tb_ticks_per_usec;
+
+ while (!need_resched() && !cpu_is_offline(cpu)) {
+ ppc64_runlatch_off();
+
+ /*
+ * Go into low thread priority and possibly
+ * low power mode.
+ */
+ HMT_low();
+ HMT_very_low();
+
+ if (*smt_snooze_delay != 0 &&
+ get_tb() > start_snooze) {
+ HMT_medium();
+ dedicated_idle_sleep(cpu);
+ }
+
+ }
+
+ HMT_medium();
+ }
+
+ lpaca->lppaca.idle = 0;
+ ppc64_runlatch_on();
+
+ preempt_enable_no_resched();
+ schedule();
+ preempt_disable();
+
+ if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
+ cpu_die();
+ }
+}
+
+/*
+ * Idle loop for shared-processor partitions: while there is nothing to
+ * run and the CPU is online, cede the processor to the hypervisor.
+ * After scheduling, an offlined CPU is taken down via cpu_die() when the
+ * system is running.
+ */
+static void pseries_shared_idle(void)
+{
+ struct paca_struct *lpaca = get_paca();
+ unsigned int cpu = smp_processor_id();
+
+ while (1) {
+ /*
+ * Indicate to the HV that we are idle. Now would be
+ * a good time to find other work to dispatch.
+ */
+ lpaca->lppaca.idle = 1;
+
+ while (!need_resched() && !cpu_is_offline(cpu)) {
+ local_irq_disable();
+ ppc64_runlatch_off();
+
+ /*
+ * Yield the processor to the hypervisor. We return if
+ * an external interrupt occurs (which are driven prior
+ * to returning here) or if a prod occurs from another
+ * processor. When returning here, external interrupts
+ * are enabled.
+ *
+ * Check need_resched() again with interrupts disabled
+ * to avoid a race.
+ */
+ if (!need_resched())
+ cede_processor();
+ else
+ local_irq_enable();
+
+ HMT_medium();
+ }
+
+ lpaca->lppaca.idle = 0;
+ ppc64_runlatch_on();
+
+ preempt_enable_no_resched();
+ schedule();
+ preempt_disable();
+
+ if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
+ cpu_die();
+ }
+}
+
+/*
+ * PCI probe policy: on LPAR, buses are populated from the device tree;
+ * otherwise the normal probe is used.
+ */
+static int pSeries_pci_probe_mode(struct pci_bus *bus)
+{
+	return platform_is_lpar() ? PCI_PROBE_DEVTREE : PCI_PROBE_NORMAL;
+}
+
+#ifdef CONFIG_KEXEC
+/*
+ * kexec hook run on each CPU going down: unless this is a crash
+ * shutdown, deregister the CPU's VPA (failure is only logged), then tear
+ * down the CPU's interrupt-controller state (MPIC or XICS).
+ */
+static void pseries_kexec_cpu_down(int crash_shutdown, int secondary)
+{
+ /* Don't risk a hypervisor call if we're crashing */
+ if (!crash_shutdown) {
+ unsigned long vpa = __pa(&get_paca()->lppaca);
+
+ if (unregister_vpa(hard_smp_processor_id(), vpa)) {
+ printk("VPA deregistration of cpu %u (hw_cpu_id %d) "
+ "failed\n", smp_processor_id(),
+ hard_smp_processor_id());
+ }
+ }
+
+ if (ppc64_interrupt_controller == IC_OPEN_PIC)
+ mpic_teardown_this_cpu(secondary);
+ else
+ xics_teardown_cpu(secondary);
+}
+#endif
+
+/*
+ * Machine-dependent call table for pSeries.  Interrupt-controller, idle
+ * loop, PMC and DABR hooks are not listed here; they are filled into
+ * ppc_md at boot by pSeries_setup_arch() and pSeries_init_early().
+ */
+struct machdep_calls __initdata pSeries_md = {
+ .probe = pSeries_probe,
+ .setup_arch = pSeries_setup_arch,
+ .init_early = pSeries_init_early,
+ .show_cpuinfo = pSeries_show_cpuinfo,
+ .log_error = pSeries_log_error,
+ .pcibios_fixup = pSeries_final_fixup,
+ .pci_probe_mode = pSeries_pci_probe_mode,
+ .irq_bus_setup = pSeries_irq_bus_setup,
+ .restart = rtas_restart,
+ .power_off = rtas_power_off,
+ .halt = rtas_halt,
+ .panic = rtas_os_term,
+ .cpu_die = pSeries_mach_cpu_die,
+ .get_boot_time = rtas_get_boot_time,
+ .get_rtc_time = rtas_get_rtc_time,
+ .set_rtc_time = rtas_set_rtc_time,
+ .calibrate_decr = generic_calibrate_decr,
+ .progress = rtas_progress,
+ .check_legacy_ioport = pSeries_check_legacy_ioport,
+ .system_reset_exception = pSeries_system_reset_exception,
+ .machine_check_exception = pSeries_machine_check_exception,
+#ifdef CONFIG_KEXEC
+ .kexec_cpu_down = pseries_kexec_cpu_down,
+#endif
+};
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
new file mode 100644
index 000000000000..25181c594d73
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -0,0 +1,474 @@
+/*
+ * SMP support for pSeries machines.
+ *
+ * Dave Engebretsen, Peter Bergner, and
+ * Mike Corrigan {engebret|bergner|mikec}@us.ibm.com
+ *
+ * Plus various changes from other IBM teams...
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#undef DEBUG
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/cache.h>
+#include <linux/err.h>
+#include <linux/sysdev.h>
+#include <linux/cpu.h>
+
+#include <asm/ptrace.h>
+#include <asm/atomic.h>
+#include <asm/irq.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/smp.h>
+#include <asm/paca.h>
+#include <asm/time.h>
+#include <asm/machdep.h>
+#include "xics.h"
+#include <asm/cputable.h>
+#include <asm/firmware.h>
+#include <asm/system.h>
+#include <asm/rtas.h>
+#include <asm/pSeries_reconfig.h>
+#include <asm/mpic.h>
+#include <asm/vdso_datapage.h>
+
+#include "plpar_wrappers.h"
+
+#ifdef DEBUG
+#include <asm/udbg.h>
+#define DBG(fmt...) udbg_printf(fmt)
+#else
+#define DBG(fmt...)
+#endif
+
+/*
+ * The primary thread of each non-boot processor is recorded here before
+ * smp init.
+ */
+static cpumask_t of_spin_map;
+
+extern void pSeries_secondary_smp_init(unsigned long);
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Ask RTAS ("query-cpu-stopped-state") about physical cpu @pcpu.
+ *
+ * Return codes:
+ *	 0 - The processor is in the RTAS stopped state
+ *	 1 - stop-self is in progress
+ *	 2 - The processor is not in the RTAS stopped state
+ *	-1 - Hardware Error (also used when the RTAS service is unknown)
+ *	-2 - Hardware Busy, Try again later.
+ *
+ * When the RTAS call itself reports a non-zero status, that status is
+ * logged and returned instead of a cpu state.
+ */
+static int query_cpu_stopped(unsigned int pcpu)
+{
+	int token = rtas_token("query-cpu-stopped-state");
+	int state;
+	int rc;
+
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return -1;
+
+	rc = rtas_call(token, 1, 2, &state, pcpu);
+	if (rc == 0)
+		return state;
+
+	printk(KERN_ERR
+	       "RTAS query-cpu-stopped-state failed: %i\n", rc);
+	return rc;
+}
+
+/*
+ * Take the calling cpu out of service: clear it from cpu_online_map,
+ * decrement the processor count in the vdso data, pass the boot-cpu role
+ * on if this was the boot cpu, and migrate interrupts away from it.
+ * Always returns 0.
+ */
+int pSeries_cpu_disable(void)
+{
+	int this_cpu = smp_processor_id();
+
+	cpu_clear(this_cpu, cpu_online_map);
+	vdso_data->processorCount -= 1;
+
+	/* The boot cpu is leaving: pick one of the cpus still online. */
+	if (boot_cpuid == this_cpu)
+		boot_cpuid = any_online_cpu(cpu_online_map);
+
+	/* FIXME: abstract this to not be platform specific later on */
+	xics_migrate_irqs_away();
+
+	return 0;
+}
+
+void pSeries_cpu_die(unsigned int cpu)
+{
+ int tries;
+ int cpu_status;
+ unsigned int pcpu = get_hard_smp_processor_id(cpu);
+
+ for (tries = 0; tries < 25; tries++) { /* poll at most 25 times, sleeping 200 ms between polls */
+ cpu_status = query_cpu_stopped(pcpu);
+ if (cpu_status == 0 || cpu_status == -1) /* stopped, or a -1 failure: no point polling further */
+ break;
+ msleep(200);
+ }
+ if (cpu_status != 0) { /* never observed the stopped state: report it and carry on */
+ printk("Querying DEAD? cpu %i (%i) shows %i\n",
+ cpu, pcpu, cpu_status);
+ }
+
+ /* Isolation and deallocation are definitely done by
+ * drslot_chrp_cpu. If they were not they would be
+ * done here. Change isolate state to Isolate and
+ * change allocation-state to Unusable.
+ */
+ paca[cpu].cpu_start = 0; /* clear the flag that smp_pSeries_kick_cpu() sets to 1 */
+}
+
+/*
+ * Update cpu_present_map and paca(s) for a new cpu node. The wrinkle
+ * here is that a cpu device node may represent up to two logical cpus
+ * in the SMT case. We must honor the assumption in other code that
+ * the logical ids for sibling SMT threads x and y are adjacent, such
+ * that x^1 == y and y^1 == x.
+ *
+ * The thread count is derived from the length of the node's
+ * "ibm,ppc-interrupt-server#s" property. Slots are tried in groups of
+ * that many consecutive logical ids, starting at id 0 and advancing one
+ * whole group at a time, until a group is found that is possible but not
+ * yet present. The maps are only touched between lock_cpu_hotplug() and
+ * unlock_cpu_hotplug().
+ *
+ * Returns 0 on success, and also when the node has no
+ * "ibm,ppc-interrupt-server#s" property (nothing is changed in that
+ * case); returns -ENOSPC when no suitable group of free slots exists.
+ */
+static int pSeries_add_processor(struct device_node *np)
+{
+ unsigned int cpu;
+ cpumask_t candidate_map, tmp = CPU_MASK_NONE;
+ int err = -ENOSPC, len, nthreads, i;
+ u32 *intserv;
+
+ intserv = (u32 *)get_property(np, "ibm,ppc-interrupt-server#s", &len);
+ if (!intserv)
+ return 0;
+
+ nthreads = len / sizeof(u32);
+ for (i = 0; i < nthreads; i++) /* tmp starts out as logical ids 0 .. nthreads-1 */
+ cpu_set(i, tmp);
+
+ lock_cpu_hotplug();
+
+ BUG_ON(!cpus_subset(cpu_present_map, cpu_possible_map));
+
+ /* Get a bitmap of unoccupied slots: possible cpus that are not present. */
+ cpus_xor(candidate_map, cpu_possible_map, cpu_present_map);
+ if (cpus_empty(candidate_map)) {
+ /* If we get here, it most likely means that NR_CPUS is
+ * less than the partition's max processors setting.
+ */
+ printk(KERN_ERR "Cannot add cpu %s; this system configuration"
+ " supports %d logical cpus.\n", np->full_name,
+ cpus_weight(cpu_possible_map));
+ goto out_unlock;
+ }
+
+ while (!cpus_empty(tmp)) /* tmp ends up empty once it has been shifted past the last bit */
+ if (cpus_subset(tmp, candidate_map))
+ /* Found a range where we can insert the new cpu(s) */
+ break;
+ else
+ cpus_shift_left(tmp, tmp, nthreads);
+
+ if (cpus_empty(tmp)) {
+ printk(KERN_ERR "Unable to find space in cpu_present_map for"
+ " processor %s with %d thread(s)\n", np->name,
+ nthreads);
+ goto out_unlock;
+ }
+
+ for_each_cpu_mask(cpu, tmp) { /* one property entry is consumed per logical cpu in the chosen group */
+ BUG_ON(cpu_isset(cpu, cpu_present_map));
+ cpu_set(cpu, cpu_present_map);
+ set_hard_smp_processor_id(cpu, *intserv++);
+ }
+ err = 0;
+out_unlock:
+ unlock_cpu_hotplug();
+ return err;
+}
+
+/*
+ * Update the present map for a cpu node which is going away, and set
+ * the hard id in the paca(s) to -1 to be consistent with boot time
+ * convention for non-present cpus.
+ *
+ * Each entry of the node's "ibm,ppc-interrupt-server#s" property is
+ * matched against the hard ids of the present cpus; a matching cpu must
+ * already be offline (BUG otherwise). An entry with no matching present
+ * cpu only produces a warning. Does nothing if the property is missing.
+ */
+static void pSeries_remove_processor(struct device_node *np)
+{
+ unsigned int cpu;
+ int len, nthreads, i;
+ u32 *intserv;
+
+ intserv = (u32 *)get_property(np, "ibm,ppc-interrupt-server#s", &len);
+ if (!intserv)
+ return;
+
+ nthreads = len / sizeof(u32);
+
+ lock_cpu_hotplug();
+ for (i = 0; i < nthreads; i++) {
+ for_each_present_cpu(cpu) {
+ if (get_hard_smp_processor_id(cpu) != intserv[i])
+ continue;
+ BUG_ON(cpu_online(cpu));
+ cpu_clear(cpu, cpu_present_map);
+ set_hard_smp_processor_id(cpu, -1);
+ break;
+ }
+ if (cpu == NR_CPUS) /* NOTE(review): presumably for_each_present_cpu leaves cpu == NR_CPUS when no match broke out early - confirm */
+ printk(KERN_WARNING "Could not find cpu to remove "
+ "with physical id 0x%x\n", intserv[i]);
+ }
+ unlock_cpu_hotplug();
+}
+
+/*
+ * Reconfiguration notifier callback.  An ADD event tries to add the
+ * processor described by @node (NOTIFY_BAD if that fails), a REMOVE event
+ * removes it; both otherwise yield NOTIFY_OK.  Any other action is
+ * answered with NOTIFY_DONE.
+ */
+static int pSeries_smp_notifier(struct notifier_block *nb, unsigned long action, void *node)
+{
+	if (action == PSERIES_RECONFIG_ADD)
+		return pSeries_add_processor(node) ? NOTIFY_BAD : NOTIFY_OK;
+
+	if (action == PSERIES_RECONFIG_REMOVE) {
+		pSeries_remove_processor(node);
+		return NOTIFY_OK;
+	}
+
+	/* Not an event handled here. */
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block pSeries_smp_nb = { /* registered by smp_init_pSeries() on LPAR platforms */
+ .notifier_call = pSeries_smp_notifier,
+};
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/**
+ * smp_startup_cpu() - start the given cpu
+ * @lcpu: logical cpu number of the cpu to start
+ *
+ * At boot time, there is nothing to do for primary threads which were
+ * started from Open Firmware. For anything else, call RTAS with the
+ * appropriate start location.
+ *
+ * The start location handed to RTAS is computed by reading an unsigned
+ * long through the pSeries_secondary_smp_init symbol, truncating it to
+ * 32 bits and converting it with __pa().
+ * NOTE(review): this presumably fetches the entry address out of a
+ * function descriptor - confirm.
+ *
+ * Returns:
+ * 0 - failure (the RTAS start-cpu call reported a non-zero status)
+ * 1 - success (also when the cpu is in of_spin_map, or when RTAS has
+ *     no start-cpu service and the cpu is presumed to be spinning)
+ */
+static inline int __devinit smp_startup_cpu(unsigned int lcpu)
+{
+ int status;
+ unsigned long start_here = __pa((u32)*((unsigned long *)
+ pSeries_secondary_smp_init));
+ unsigned int pcpu;
+ int start_cpu;
+
+ if (cpu_isset(lcpu, of_spin_map))
+ /* Already started by OF and sitting in spin loop */
+ return 1;
+
+ pcpu = get_hard_smp_processor_id(lcpu);
+
+ /* Fixup atomic count: it exited inside IRQ handler. */
+ paca[lcpu].__current->thread_info->preempt_count = 0;
+
+ /*
+ * If the RTAS start-cpu token does not exist then presume the
+ * cpu is already spinning.
+ */
+ start_cpu = rtas_token("start-cpu");
+ if (start_cpu == RTAS_UNKNOWN_SERVICE)
+ return 1;
+
+ status = rtas_call(start_cpu, 3, 1, NULL, pcpu, start_here, lcpu);
+ if (status != 0) {
+ printk(KERN_ERR "start-cpu failed: %i\n", status);
+ return 0;
+ }
+
+ return 1;
+}
+
+#ifdef CONFIG_XICS
+static inline void smp_xics_do_message(int cpu, int msg)
+{
+ set_bit(msg, &xics_ipi_message[cpu].value); /* record which message is pending for the target cpu */
+ mb(); /* barrier between publishing the message bit and raising the IPI */
+ xics_cause_IPI(cpu);
+}
+
+/*
+ * Deliver @msg to @target.  A target below NR_CPUS names a single cpu;
+ * anything else is a broadcast to every online cpu, skipping the sender
+ * when the target is MSG_ALL_BUT_SELF.
+ */
+static void smp_xics_message_pass(int target, int msg)
+{
+	unsigned int cpu;
+
+	if (target < NR_CPUS) {
+		smp_xics_do_message(target, msg);
+		return;
+	}
+
+	for_each_online_cpu(cpu) {
+		if (target != MSG_ALL_BUT_SELF || cpu != smp_processor_id())
+			smp_xics_do_message(cpu, msg);
+	}
+}
+
+/* Request the XICS IPIs, then report how many cpus are possible. */
+static int __init smp_xics_probe(void)
+{
+	int ncpus;
+
+	xics_request_IPIs();
+	ncpus = cpus_weight(cpu_possible_map);
+
+	return ncpus;
+}
+
+static void __devinit smp_xics_setup_cpu(int cpu)
+{
+ if (cpu != boot_cpuid) /* xics_setup_cpu() is skipped for the boot cpu here */
+ xics_setup_cpu();
+
+ if (firmware_has_feature(FW_FEATURE_SPLPAR))
+ vpa_init(cpu);
+
+ cpu_clear(cpu, of_spin_map); /* this cpu is no longer sitting in the OF spin loop (see smp_startup_cpu()) */
+
+}
+#endif /* CONFIG_XICS */
+
+static DEFINE_SPINLOCK(timebase_lock);
+static unsigned long timebase = 0; /* non-zero while a value is waiting to be picked up by pSeries_take_timebase() */
+
+static void __devinit pSeries_give_timebase(void)
+{
+ spin_lock(&timebase_lock);
+ rtas_call(rtas_token("freeze-time-base"), 0, 1, NULL); /* return status is not checked */
+ timebase = get_tb(); /* publish the current timebase value for the taker */
+ spin_unlock(&timebase_lock);
+
+ while (timebase) /* wait until pSeries_take_timebase() has reset it to 0 */
+ barrier();
+ rtas_call(rtas_token("thaw-time-base"), 0, 1, NULL);
+}
+
+static void __devinit pSeries_take_timebase(void)
+{
+ while (!timebase) /* wait for pSeries_give_timebase() to publish a non-zero value */
+ barrier();
+ spin_lock(&timebase_lock);
+ set_tb(timebase >> 32, timebase & 0xffffffff); /* upper and lower 32 bits are passed separately */
+ timebase = 0; /* tells the giver that the value has been consumed */
+ spin_unlock(&timebase_lock);
+}
+
+static void __devinit smp_pSeries_kick_cpu(int nr)
+{
+ BUG_ON(nr < 0 || nr >= NR_CPUS);
+
+ if (!smp_startup_cpu(nr)) /* 0 means the cpu could not be started: leave cpu_start untouched */
+ return;
+
+ /*
+ * The processor is currently spinning, waiting for the
+ * cpu_start field to become non-zero. After we set cpu_start,
+ * the processor will continue on to secondary_start.
+ */
+ paca[nr].cpu_start = 1;
+}
+
+/*
+ * Decide whether logical cpu @nr may be brought up.
+ *
+ * Special case - we inhibit secondary thread startup during boot if the
+ * user requests it.  Odd-numbered cpus are assumed to be secondary
+ * threads.  Returns 0 for an inhibited cpu, 1 otherwise.
+ */
+static int smp_pSeries_cpu_bootable(unsigned int nr)
+{
+	if (system_state >= SYSTEM_RUNNING)
+		return 1;
+	if (!cpu_has_feature(CPU_FTR_SMT))
+		return 1;
+	if (smt_enabled_at_boot)
+		return 1;
+
+	return (nr % 2 != 0) ? 0 : 1;
+}
+#ifdef CONFIG_MPIC
+static struct smp_ops_t pSeries_mpic_smp_ops = { /* chosen by smp_init_pSeries() for IC_OPEN_PIC */
+ .message_pass = smp_mpic_message_pass,
+ .probe = smp_mpic_probe,
+ .kick_cpu = smp_pSeries_kick_cpu,
+ .setup_cpu = smp_mpic_setup_cpu, /* note: no .cpu_bootable hook here, unlike pSeries_xics_smp_ops */
+};
+#endif
+#ifdef CONFIG_XICS
+static struct smp_ops_t pSeries_xics_smp_ops = { /* chosen by smp_init_pSeries() for IC_PPC_XIC */
+ .message_pass = smp_xics_message_pass,
+ .probe = smp_xics_probe,
+ .kick_cpu = smp_pSeries_kick_cpu,
+ .setup_cpu = smp_xics_setup_cpu,
+ .cpu_bootable = smp_pSeries_cpu_bootable, /* can hold back odd-numbered (secondary) threads during boot */
+};
+#endif
+
+/*
+ * This is called very early.
+ *
+ * Selects the smp_ops table that matches ppc64_interrupt_controller
+ * (panics if no compiled-in controller matches), installs the cpu
+ * hotplug hooks when CONFIG_HOTPLUG_CPU is set, builds of_spin_map, and
+ * installs the timebase hand-over hooks when RTAS offers
+ * "freeze-time-base".
+ */
+void __init smp_init_pSeries(void)
+{
+ int i;
+
+ DBG(" -> smp_init_pSeries()\n");
+
+ switch (ppc64_interrupt_controller) {
+#ifdef CONFIG_MPIC
+ case IC_OPEN_PIC:
+ smp_ops = &pSeries_mpic_smp_ops;
+ break;
+#endif
+#ifdef CONFIG_XICS
+ case IC_PPC_XIC:
+ smp_ops = &pSeries_xics_smp_ops;
+ break;
+#endif
+ default:
+ panic("Invalid interrupt controller");
+ }
+
+#ifdef CONFIG_HOTPLUG_CPU
+ smp_ops->cpu_disable = pSeries_cpu_disable;
+ smp_ops->cpu_die = pSeries_cpu_die;
+
+ /* Processors can be added/removed only on LPAR */
+ if (platform_is_lpar())
+ pSeries_reconfig_notifier_register(&pSeries_smp_nb);
+#endif
+
+ /* Mark threads which are still spinning in hold loops. */
+ if (cpu_has_feature(CPU_FTR_SMT)) {
+ for_each_present_cpu(i) {
+ if (i % 2 == 0)
+ /*
+ * Even-numbered logical cpus correspond to
+ * primary threads.
+ */
+ cpu_set(i, of_spin_map);
+ }
+ } else {
+ of_spin_map = cpu_present_map; /* without SMT every present cpu is treated as spinning */
+ }
+
+ cpu_clear(boot_cpuid, of_spin_map); /* the boot cpu is running this code, so it is not spinning */
+
+ /* Non-lpar has additional take/give timebase */
+ if (rtas_token("freeze-time-base") != RTAS_UNKNOWN_SERVICE) {
+ smp_ops->give_timebase = pSeries_give_timebase;
+ smp_ops->take_timebase = pSeries_take_timebase;
+ }
+
+ DBG(" <- smp_init_pSeries()\n");
+}
+
diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c
new file mode 100644
index 000000000000..866379b80c09
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -0,0 +1,274 @@
+/*
+ * IBM PowerPC pSeries Virtual I/O Infrastructure Support.
+ *
+ * Copyright (c) 2003-2005 IBM Corp.
+ * Dave Engebretsen engebret@us.ibm.com
+ * Santiago Leon santil@us.ibm.com
+ * Hollis Blanchard <hollisb@us.ibm.com>
+ * Stephen Rothwell
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/kobject.h>
+#include <asm/iommu.h>
+#include <asm/dma.h>
+#include <asm/prom.h>
+#include <asm/vio.h>
+#include <asm/hvcall.h>
+#include <asm/tce.h>
+
+extern struct subsystem devices_subsys; /* needed for vio_find_name() */
+
+/*
+ * Register a vio device for every child of the "vdevice" device-tree
+ * node.  Quietly does nothing when that node is absent or childless.
+ */
+static void probe_bus_pseries(void)
+{
+	struct device_node *vroot = find_devices("vdevice");
+	struct device_node *child;
+
+	/* this machine doesn't do virtual IO, and that's ok */
+	if (!vroot || !vroot->child)
+		return;
+
+	/*
+	 * Create struct vio_devices for each virtual device in the device
+	 * tree.  Drivers will associate with them later.
+	 */
+	child = vroot->child;
+	while (child) {
+		printk(KERN_DEBUG "%s: processing %p\n", __FUNCTION__, child);
+		vio_register_device_node(child);
+		child = child->sibling;
+	}
+}
+
+/**
+ * vio_match_device_pseries: - Tell if a pSeries VIO device matches a
+ *	vio_device_id
+ *
+ * The device matches when its type starts with @id->type and its OF node
+ * is compatible with @id->compat.  Returns 1 on a match, 0 otherwise.
+ */
+static int vio_match_device_pseries(const struct vio_device_id *id,
+		const struct vio_dev *dev)
+{
+	size_t type_len = strlen(id->type);
+
+	if (strncmp(dev->type, id->type, type_len) != 0)
+		return 0;
+
+	return device_is_compatible(dev->dev.platform_data, id->compat) ? 1 : 0;
+}
+
+static void vio_release_device_pseries(struct device *dev)
+{
+ /* XXX free TCE table (the table built by vio_build_iommu_table() is not released here) */
+ of_node_put(dev->platform_data); /* drops the reference taken by of_node_get() in vio_register_device_node() */
+}
+
+/*
+ * Show handler for the "devspec" attribute: writes the full name of the
+ * device's OF node (kept in dev->platform_data), followed by a newline.
+ */
+static ssize_t viodev_show_devspec(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	const char *path;
+
+	path = ((struct device_node *)dev->platform_data)->full_name;
+	return sprintf(buf, "%s\n", path);
+}
+DEVICE_ATTR(devspec, S_IRUSR | S_IRGRP | S_IROTH, viodev_show_devspec, NULL);
+
+static void vio_unregister_device_pseries(struct vio_dev *viodev)
+{
+ device_remove_file(&viodev->dev, &dev_attr_devspec); /* undoes the device_create_file() in vio_register_device_node() */
+}
+
+static struct vio_bus_ops vio_bus_ops_pseries = { /* handed to vio_bus_init() by vio_bus_init_pseries() */
+ .match = vio_match_device_pseries,
+ .unregister_device = vio_unregister_device_pseries,
+ .release_device = vio_release_device_pseries,
+};
+
+/**
+ * vio_bus_init_pseries: - Initialize the pSeries virtual IO bus
+ *
+ * Initializes the bus with the pSeries bus ops and, if that worked,
+ * probes the device tree for virtual devices.  Returns the status of
+ * vio_bus_init().
+ */
+static int __init vio_bus_init_pseries(void)
+{
+	int rc = vio_bus_init(&vio_bus_ops_pseries);
+
+	if (rc)
+		return rc;
+
+	probe_bus_pseries();
+	return 0;
+}
+
+__initcall(vio_bus_init_pseries);
+
+/**
+ * vio_build_iommu_table: - gets the dma information from OF and
+ *	builds the TCE tree.
+ * @dev: the virtual device.
+ *
+ * Reads the "ibm,my-dma-window" property of the device's OF node and
+ * fills in a freshly allocated iommu_table from it: cell 0 is used as
+ * the table index, cell 1 (shifted by PAGE_SHIFT) as the offset and
+ * cell 4 (shifted by PAGE_SHIFT) as the size in TCE entries.
+ *
+ * Returns the result of iommu_init_table() on the new table, or NULL if
+ * the property is missing or the table cannot be allocated.
+ */
+static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
+{
+	unsigned int *dma_window;
+	struct iommu_table *newTceTable;
+	unsigned long offset;
+	int dma_window_property_size;
+
+	dma_window = (unsigned int *) get_property(dev->dev.platform_data,
+			"ibm,my-dma-window", &dma_window_property_size);
+	if (!dma_window)
+		return NULL;
+
+	newTceTable = kmalloc(sizeof(*newTceTable), GFP_KERNEL);
+	/* Allocation can fail; the fields below must not be written then. */
+	if (!newTceTable)
+		return NULL;
+
+	/* There should be some code to extract the phys-encoded offset
+	   using prom_n_addr_cells(). However, according to a comment
+	   on earlier versions, it's always zero, so we don't bother */
+	offset = dma_window[1] >> PAGE_SHIFT;
+
+	/* TCE table size - measured in tce entries */
+	newTceTable->it_size = dma_window[4] >> PAGE_SHIFT;
+	/* offset for VIO should always be 0 */
+	newTceTable->it_offset = offset;
+	newTceTable->it_busno = 0;
+	newTceTable->it_index = (unsigned long)dma_window[0];
+	newTceTable->it_type = TCE_VB;
+
+	return iommu_init_table(newTceTable);
+}
+
+/**
+ * vio_register_device_node: - Register a new vio device.
+ * @of_node: The OF node for this device.
+ *
+ * Creates and initializes a vio_dev structure from the data in
+ * of_node (dev.platform_data) and adds it to the list of virtual devices.
+ * A reference on @of_node is taken for the new device (it is dropped by
+ * vio_release_device_pseries()); if registration fails the reference is
+ * dropped again here before the vio_dev is freed.
+ *
+ * Returns a pointer to the created vio_dev, or NULL if the node has no
+ * device_type or no "reg" property, if the vio_dev cannot be allocated,
+ * or if vio_register_device() fails.
+ */
+struct vio_dev * __devinit vio_register_device_node(struct device_node *of_node)
+{
+	struct vio_dev *viodev;
+	unsigned int *unit_address;
+	unsigned int *irq_p;
+
+	/* we need the 'device_type' property, in order to match with drivers */
+	if (of_node->type == NULL) {
+		printk(KERN_WARNING
+			"%s: node %s missing 'device_type'\n", __FUNCTION__,
+			of_node->name ? of_node->name : "<unknown>");
+		return NULL;
+	}
+
+	unit_address = (unsigned int *)get_property(of_node, "reg", NULL);
+	if (!unit_address) {
+		printk(KERN_WARNING "%s: node %s missing 'reg'\n", __FUNCTION__,
+			of_node->name ? of_node->name : "<unknown>");
+		return NULL;
+	}
+
+	/* allocate a vio_dev for this node */
+	viodev = kmalloc(sizeof(struct vio_dev), GFP_KERNEL);
+	if (!viodev)
+		return NULL;
+	memset(viodev, 0, sizeof(struct vio_dev));
+
+	viodev->dev.platform_data = of_node_get(of_node);
+
+	/* The irq stays NO_IRQ unless the node has a mappable interrupt. */
+	viodev->irq = NO_IRQ;
+	irq_p = (unsigned int *)get_property(of_node, "interrupts", NULL);
+	if (irq_p) {
+		int virq = virt_irq_create_mapping(*irq_p);
+		if (virq == NO_IRQ) {
+			printk(KERN_ERR "Unable to allocate interrupt "
+			       "number for %s\n", of_node->full_name);
+		} else
+			viodev->irq = irq_offset_up(virq);
+	}
+
+	/* vio_find_node() rebuilds this same "%x" name to look devices up */
+	snprintf(viodev->dev.bus_id, BUS_ID_SIZE, "%x", *unit_address);
+	viodev->name = of_node->name;
+	viodev->type = of_node->type;
+	viodev->unit_address = *unit_address;
+	viodev->iommu_table = vio_build_iommu_table(viodev);
+
+	/* register with generic device framework */
+	if (vio_register_device(viodev) == NULL) {
+		/* XXX free TCE table */
+		/*
+		 * The release hook will never run for this device, so the
+		 * node reference taken above has to be dropped by hand.
+		 */
+		of_node_put(of_node);
+		kfree(viodev);
+		return NULL;
+	}
+	device_create_file(&viodev->dev, &dev_attr_devspec);
+
+	return viodev;
+}
+EXPORT_SYMBOL(vio_register_device_node);
+
+/**
+ * vio_get_attribute: - get attribute for virtual device
+ * @vdev: The vio device to get property.
+ * @which: The property/attribute to be extracted; it is cast to char *
+ *	and passed to get_property() as the property name.
+ * @length: Pointer to length of returned data size (unused if NULL).
+ *
+ * Calls prom.c's get_property() on the device's OF node
+ * (dev.platform_data) and returns its result unchanged.
+*/
+const void * vio_get_attribute(struct vio_dev *vdev, void* which, int* length)
+{
+ return get_property(vdev->dev.platform_data, (char*)which, length);
+}
+EXPORT_SYMBOL(vio_get_attribute);
+
+/* vio_find_name() - internal because only vio.c knows how we formatted the
+ * kobject name
+ * XXX once vio_bus_type.devices is actually used as a kset in
+ * drivers/base/bus.c, this function should be removed in favor of
+ * "device_find(kobj_name, &vio_bus_type)"
+ *
+ * Looks @kobj_name up in the devices subsystem kset and returns the
+ * vio_dev that embeds the matching kobject, or NULL if there is none.
+ */
+static struct vio_dev *vio_find_name(const char *kobj_name)
+{
+	struct kobject *obj = kset_find_obj(&devices_subsys.kset, kobj_name);
+	struct device *dev;
+
+	if (obj == NULL)
+		return NULL;
+
+	dev = container_of(obj, struct device, kobj);
+	return to_vio_dev(dev);
+}
+
+/**
+ * vio_find_node - find an already-registered vio_dev
+ * @vnode: device_node of the virtual device we're looking for
+ *
+ * Returns the matching vio_dev, or NULL when @vnode has no "reg"
+ * property or no device is registered under the derived name.
+ */
+struct vio_dev *vio_find_node(struct device_node *vnode)
+{
+	char name[BUS_ID_SIZE];
+	uint32_t *reg = (uint32_t *)get_property(vnode, "reg", NULL);
+
+	if (reg == NULL)
+		return NULL;
+
+	/* construct the kobject name from the device node */
+	snprintf(name, BUS_ID_SIZE, "%x", *reg);
+
+	return vio_find_name(name);
+}
+EXPORT_SYMBOL(vio_find_node);
+
+/*
+ * Enable interrupts for @dev through h_vio_signal().  The call status is
+ * returned; anything other than H_Success is also logged.
+ */
+int vio_enable_interrupts(struct vio_dev *dev)
+{
+	int rc;
+
+	rc = h_vio_signal(dev->unit_address, VIO_IRQ_ENABLE);
+	if (rc == H_Success)
+		return rc;
+
+	printk(KERN_ERR "vio: Error 0x%x enabling interrupts\n", rc);
+	return rc;
+}
+EXPORT_SYMBOL(vio_enable_interrupts);
+
+/*
+ * Disable interrupts for @dev through h_vio_signal().  The call status is
+ * returned; anything other than H_Success is also logged.
+ */
+int vio_disable_interrupts(struct vio_dev *dev)
+{
+	int rc;
+
+	rc = h_vio_signal(dev->unit_address, VIO_IRQ_DISABLE);
+	if (rc == H_Success)
+		return rc;
+
+	printk(KERN_ERR "vio: Error 0x%x disabling interrupts\n", rc);
+	return rc;
+}
+EXPORT_SYMBOL(vio_disable_interrupts);
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c
new file mode 100644
index 000000000000..72ac18067ece
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/xics.c
@@ -0,0 +1,748 @@
+/*
+ * arch/powerpc/platforms/pseries/xics.c
+ *
+ * Copyright 2000 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/threads.h>
+#include <linux/kernel.h>
+#include <linux/irq.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/signal.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <linux/radix-tree.h>
+#include <linux/cpu.h>
+#include <asm/prom.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/smp.h>
+#include <asm/rtas.h>
+#include <asm/hvcall.h>
+#include <asm/machdep.h>
+#include <asm/i8259.h>
+
+#include "xics.h"
+
+static unsigned int xics_startup(unsigned int irq);
+static void xics_enable_irq(unsigned int irq);
+static void xics_disable_irq(unsigned int irq);
+static void xics_mask_and_ack_irq(unsigned int irq);
+static void xics_end_irq(unsigned int irq);
+static void xics_set_affinity(unsigned int irq_nr, cpumask_t cpumask);
+
+static struct hw_interrupt_type xics_pic = { /* installed by xics_init_IRQ() on irq descriptors 16 and above */
+ .typename = " XICS ",
+ .startup = xics_startup,
+ .enable = xics_enable_irq,
+ .disable = xics_disable_irq,
+ .ack = xics_mask_and_ack_irq,
+ .end = xics_end_irq,
+ .set_affinity = xics_set_affinity
+};
+
+static struct hw_interrupt_type xics_8259_pic = { /* installed on irq descriptors 0-15; enable/disable/end are copied from i8259_pic in xics_init_IRQ() */
+ .typename = " XICS/8259",
+ .ack = xics_mask_and_ack_irq,
+};
+
+/* This is used to map real irq numbers to virtual */
+static struct radix_tree_root irq_map = RADIX_TREE_INIT(GFP_ATOMIC);
+
+#define XICS_IPI 2
+#define XICS_IRQ_SPURIOUS 0
+
+/* Want a priority other than 0. Various HW issues require this. */
+#define DEFAULT_PRIORITY 5
+
+/*
+ * Mark IPIs as higher priority so we can take them inside interrupts that
+ * aren't marked SA_INTERRUPT
+ */
+#define IPI_PRIORITY 4
+
+struct xics_ipl { /* layout of the per-cpu area that xics_init_IRQ() maps with ioremap() into xics_per_cpu[] */
+ union {
+ u32 word;
+ u8 bytes[4];
+ } xirr_poll;
+ union {
+ u32 word; /* read by pSeries_xirr_info_get(), written by pSeries_xirr_info_set() */
+ u8 bytes[4]; /* bytes[0] is written by pSeries_cppr_info() */
+ } xirr;
+ u32 dummy;
+ union {
+ u32 word;
+ u8 bytes[4]; /* bytes[0] is written by pSeries_qirr_info() */
+ } qirr;
+};
+
+static struct xics_ipl __iomem *xics_per_cpu[NR_CPUS];
+
+static int xics_irq_8259_cascade = 0;
+static int xics_irq_8259_cascade_real = 0;
+static unsigned int default_server = 0xFF;
+static unsigned int default_distrib_server = 0;
+static unsigned int interrupt_server_size = 8;
+
+/*
+ * XICS only has a single IPI, so encode the messages per CPU
+ */
+struct xics_ipi_struct xics_ipi_message[NR_CPUS] __cacheline_aligned;
+
+/* RTAS service tokens */
+static int ibm_get_xive;
+static int ibm_set_xive;
+static int ibm_int_on;
+static int ibm_int_off;
+
+typedef struct { /* access methods; "ops" points at pSeries_ops by default and at pSeriesLP_ops on LPAR */
+ int (*xirr_info_get)(int cpu); /* used by xics_get_irq() */
+ void (*xirr_info_set)(int cpu, int val); /* used by xics_end_irq() and xics_mask_and_ack_irq() */
+ void (*cppr_info)(int cpu, u8 val); /* used by xics_setup_cpu() */
+ void (*qirr_info)(int cpu, u8 val); /* used by xics_cause_IPI() and xics_ipi_action() */
+} xics_ops;
+
+
+/* SMP */
+
+/* Read the 32-bit xirr word of cpu @n_cpu from its mapped register area. */
+static int pSeries_xirr_info_get(int n_cpu)
+{
+	struct xics_ipl __iomem *regs = xics_per_cpu[n_cpu];
+
+	return in_be32(&regs->xirr.word);
+}
+
+/* Write @value to the 32-bit xirr word of cpu @n_cpu. */
+static void pSeries_xirr_info_set(int n_cpu, int value)
+{
+	struct xics_ipl __iomem *regs = xics_per_cpu[n_cpu];
+
+	out_be32(&regs->xirr.word, value);
+}
+
+/* Write @value to the first byte of the xirr register of cpu @n_cpu. */
+static void pSeries_cppr_info(int n_cpu, u8 value)
+{
+	struct xics_ipl __iomem *regs = xics_per_cpu[n_cpu];
+
+	out_8(&regs->xirr.bytes[0], value);
+}
+
+/* Write @value to the first byte of the qirr register of cpu @n_cpu. */
+static void pSeries_qirr_info(int n_cpu, u8 value)
+{
+	struct xics_ipl __iomem *regs = xics_per_cpu[n_cpu];
+
+	out_8(&regs->qirr.bytes[0], value);
+}
+
+/* Memory-mapped access methods (the default "ops"). */
+static xics_ops pSeries_ops = {
+	.xirr_info_get	= pSeries_xirr_info_get,
+	.xirr_info_set	= pSeries_xirr_info_set,
+	.cppr_info	= pSeries_cppr_info,
+	.qirr_info	= pSeries_qirr_info,
+};
+
+static xics_ops *ops = &pSeries_ops;
+
+
+/* LPAR */
+
+/* Issue the H_EOI hypervisor call for @xirr and return its status. */
+static inline long plpar_eoi(unsigned long xirr)
+{
+	long rc = plpar_hcall_norets(H_EOI, xirr);
+
+	return rc;
+}
+
+/* Issue the H_CPPR hypervisor call with @cppr and return its status. */
+static inline long plpar_cppr(unsigned long cppr)
+{
+	long rc = plpar_hcall_norets(H_CPPR, cppr);
+
+	return rc;
+}
+
+/* Issue the H_IPI hypervisor call for @servernum with @mfrr; returns its status. */
+static inline long plpar_ipi(unsigned long servernum, unsigned long mfrr)
+{
+	long rc = plpar_hcall_norets(H_IPI, servernum, mfrr);
+
+	return rc;
+}
+
+/*
+ * Issue the H_XIRR hypervisor call.  The first returned value is stored
+ * in *@xirr_ret; the other two are discarded.  Returns the call status.
+ */
+static inline long plpar_xirr(unsigned long *xirr_ret)
+{
+	unsigned long unused;
+
+	return plpar_hcall(H_XIRR, 0, 0, 0, 0, xirr_ret, &unused, &unused);
+}
+
+/*
+ * LPAR flavour of xirr_info_get: fetch the xirr value via plpar_xirr().
+ * @n_cpu is not used.  Panics if the call does not return H_Success.
+ */
+static int pSeriesLP_xirr_info_get(int n_cpu)
+{
+	unsigned long xirr;
+	unsigned long rc = plpar_xirr(&xirr);
+
+	if (rc != H_Success)
+		panic(" bad return code xirr - rc = %lx \n", rc);
+
+	return (int)xirr;
+}
+
+/*
+ * LPAR flavour of xirr_info_set: pass the low 32 bits of @value to
+ * plpar_eoi().  @n_cpu is not used.  Panics if the call does not return
+ * H_Success.
+ */
+static void pSeriesLP_xirr_info_set(int n_cpu, int value)
+{
+	unsigned long xirr = value & 0xffffffff;
+	unsigned long rc = plpar_eoi(xirr);
+
+	if (rc != H_Success)
+		panic("bad return code EOI - rc = %ld, value=%lx\n", rc,
+		      xirr);
+}
+
+/*
+ * LPAR flavour of cppr_info: pass @value to plpar_cppr().  @n_cpu is not
+ * used.  Panics if the call does not return H_Success.
+ */
+void pSeriesLP_cppr_info(int n_cpu, u8 value)
+{
+	unsigned long rc = plpar_cppr(value);
+
+	if (rc != H_Success)
+		panic("bad return code cppr - rc = %lx\n", rc);
+}
+
+/*
+ * LPAR flavour of qirr_info: pass @value to plpar_ipi() for the hard
+ * processor id of @n_cpu.  Panics if the call does not return H_Success.
+ */
+static void pSeriesLP_qirr_info(int n_cpu , u8 value)
+{
+	unsigned long rc;
+
+	rc = plpar_ipi(get_hard_smp_processor_id(n_cpu), value);
+	if (rc != H_Success)
+		panic("bad return code qirr - rc = %lx\n", rc);
+}
+
+/* Hypervisor-call access methods, selected by xics_init_IRQ() on LPAR. */
+xics_ops pSeriesLP_ops = {
+	.xirr_info_get	= pSeriesLP_xirr_info_get,
+	.xirr_info_set	= pSeriesLP_xirr_info_set,
+	.cppr_info	= pSeriesLP_cppr_info,
+	.qirr_info	= pSeriesLP_qirr_info,
+};
+
+/*
+ * Startup hook: record the real -> virtual mapping for @virq in irq_map
+ * (an out-of-memory failure is only logged), then enable the interrupt.
+ */
+static unsigned int xics_startup(unsigned int virq)
+{
+	unsigned int idx = irq_offset_down(virq);
+	int rc;
+
+	rc = radix_tree_insert(&irq_map, virt_irq_to_real(idx),
+			       &virt_irq_to_real_map[idx]);
+	if (rc == -ENOMEM)
+		printk(KERN_CRIT "Out of memory creating real -> virtual"
+		       " IRQ mapping for irq %u (real 0x%x)\n",
+		       virq, virt_irq_to_real(idx));
+
+	xics_enable_irq(virq);
+	return 0;	/* return value is ignored */
+}
+
+/*
+ * Translate a real irq number through irq_map.  The tree stores pointers
+ * into virt_irq_to_real_map[], so the result is the index of the entry
+ * found; NO_IRQ is returned when nothing is recorded for @real_irq.
+ */
+static unsigned int real_irq_to_virt(unsigned int real_irq)
+{
+	unsigned int *slot = radix_tree_lookup(&irq_map, real_irq);
+
+	if (slot != NULL)
+		return slot - virt_irq_to_real_map;
+
+	return NO_IRQ;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Pick the interrupt server for @irq (an index into irq_affinity[]).
+ * With distribute_irqs off this is always default_server.  Otherwise an
+ * affinity of all cpus, or one with no online cpu in it, selects
+ * default_distrib_server; any other affinity selects the hard id of its
+ * first online cpu.
+ */
+static int get_irq_server(unsigned int irq)
+{
+	/* For the moment only implement delivery to all cpus or one cpu */
+	cpumask_t affinity = irq_affinity[irq];
+	cpumask_t online_targets = CPU_MASK_NONE;
+
+	if (!distribute_irqs)
+		return default_server;
+
+	if (cpus_equal(affinity, CPU_MASK_ALL))
+		return default_distrib_server;
+
+	cpus_and(online_targets, cpu_online_map, affinity);
+	if (cpus_empty(online_targets))
+		return default_distrib_server;
+
+	return get_hard_smp_processor_id(first_cpu(online_targets));
+}
+#else
+/* Uniprocessor build: every interrupt goes to default_server. */
+static int get_irq_server(unsigned int irq)
+{
+	return default_server;
+}
+#endif
+
+/*
+ * Enable @virq: route its real interrupt to the chosen server at
+ * DEFAULT_PRIORITY via ibm,set-xive, then unmask it via ibm,int-on.
+ * The IPI is left alone.  RTAS failures are logged and end the attempt.
+ */
+static void xics_enable_irq(unsigned int virq)
+{
+	unsigned int real = virt_irq_to_real(irq_offset_down(virq));
+	unsigned int server;
+	int rc;
+
+	if (real == XICS_IPI)
+		return;
+
+	server = get_irq_server(virq);
+	rc = rtas_call(ibm_set_xive, 3, 1, NULL, real, server,
+		       DEFAULT_PRIORITY);
+	if (rc != 0) {
+		printk(KERN_ERR "xics_enable_irq: irq=%u: ibm_set_xive "
+		       "returned %d\n", real, rc);
+		printk("set_xive %x, server %x\n", ibm_set_xive, server);
+		return;
+	}
+
+	/* Now unmask the interrupt (often a no-op) */
+	rc = rtas_call(ibm_int_on, 1, 1, NULL, real);
+	if (rc != 0)
+		printk(KERN_ERR "xics_enable_irq: irq=%u: ibm_int_on "
+		       "returned %d\n", real, rc);
+}
+
+/*
+ * Disable the interrupt with real (hardware) number @irq: mask it via
+ * ibm,int-off, then set its priority to 0xff via ibm,set-xive.  The IPI
+ * is left alone.  RTAS failures are logged and end the attempt.
+ *
+ * The server passed to ibm,set-xive is default_server.  @irq is a real
+ * interrupt number (xics_get_irq() even passes vectors that have no
+ * virtual mapping), so it must not be handed to get_irq_server(), which
+ * uses its argument as an index into irq_affinity[] and is given a
+ * virtual irq by xics_enable_irq().
+ */
+static void xics_disable_real_irq(unsigned int irq)
+{
+	int call_status;
+
+	if (irq == XICS_IPI)
+		return;
+
+	call_status = rtas_call(ibm_int_off, 1, 1, NULL, irq);
+	if (call_status != 0) {
+		printk(KERN_ERR "xics_disable_real_irq: irq=%u: "
+		       "ibm_int_off returned %d\n", irq, call_status);
+		return;
+	}
+
+	/* Have to set XIVE to 0xff to be able to remove a slot */
+	call_status = rtas_call(ibm_set_xive, 3, 1, NULL, irq,
+				default_server, 0xff);
+	if (call_status != 0) {
+		printk(KERN_ERR "xics_disable_real_irq: irq=%u: "
+		       "ibm_set_xive(0xff) returned %d\n", irq, call_status);
+		return;
+	}
+}
+
+/* Disable @virq by translating it to its real number first. */
+static void xics_disable_irq(unsigned int virq)
+{
+	xics_disable_real_irq(virt_irq_to_real(irq_offset_down(virq)));
+}
+
+/*
+ * End-of-interrupt hook: after an iosync(), write 0xff in the top byte
+ * together with the real number of @irq through the xirr_info_set method.
+ */
+static void xics_end_irq(unsigned int irq)
+{
+	int cpu = smp_processor_id();
+	unsigned int real_irq;
+
+	iosync();
+	real_irq = virt_irq_to_real(irq_offset_down(irq));
+	ops->xirr_info_set(cpu, (0xff << 24) | real_irq);
+}
+
+/*
+ * Ack hook.  Only irqs below irq_offset_value() need work here: they are
+ * acked at the i8259 and then the cascade interrupt is ended at the XICS.
+ */
+static void xics_mask_and_ack_irq(unsigned int irq)
+{
+	int cpu = smp_processor_id();
+
+	if (irq >= irq_offset_value())
+		return;
+
+	i8259_pic.ack(irq);
+	iosync();
+	ops->xirr_info_set(cpu, ((0xff<<24) |
+				 xics_irq_8259_cascade_real));
+	iosync();
+}
+
+/*
+ * Fetch the pending interrupt for the calling cpu and translate it to a
+ * (offset) virtual irq number.  Returns -1 for a spurious interrupt.  A
+ * vector with no virtual mapping is logged and disabled, and NO_IRQ is
+ * returned for it.
+ */
+int xics_get_irq(struct pt_regs *regs)
+{
+	unsigned int cpu = smp_processor_id();
+	unsigned int vec;
+	int irq;
+
+	/* (value >> 24) == old priority; keep only the vector */
+	vec = ops->xirr_info_get(cpu) & 0x00ffffff;
+
+	/* for sanity, this had better be < NR_IRQS - 16 */
+	if (vec == xics_irq_8259_cascade_real) {
+		irq = i8259_irq(regs);
+		/* Spurious cascaded interrupt.  Still must ack xics */
+		if (irq == -1)
+			xics_end_irq(irq_offset_up(xics_irq_8259_cascade));
+		return irq;
+	}
+
+	if (vec == XICS_IRQ_SPURIOUS)
+		return -1;
+
+	irq = real_irq_to_virt(vec);
+	if (irq == NO_IRQ)
+		irq = real_irq_to_virt_slowpath(vec);
+	if (irq == NO_IRQ) {
+		printk(KERN_ERR "Interrupt %u (real) is invalid,"
+		       " disabling it.\n", vec);
+		xics_disable_real_irq(vec);
+		return irq;
+	}
+
+	return irq_offset_up(irq);
+}
+
+#ifdef CONFIG_SMP
+
+irqreturn_t xics_ipi_action(int irq, void *dev_id, struct pt_regs *regs)
+{
+ int cpu = smp_processor_id();
+
+ ops->qirr_info(cpu, 0xff); /* write 0xff to this cpu's qirr (xics_cause_IPI() writes IPI_PRIORITY there) */
+
+ WARN_ON(cpu_is_offline(cpu));
+
+ while (xics_ipi_message[cpu].value) { /* keep going until no message bit is left set for this cpu */
+ if (test_and_clear_bit(PPC_MSG_CALL_FUNCTION,
+ &xics_ipi_message[cpu].value)) {
+ mb();
+ smp_message_recv(PPC_MSG_CALL_FUNCTION, regs);
+ }
+ if (test_and_clear_bit(PPC_MSG_RESCHEDULE,
+ &xics_ipi_message[cpu].value)) {
+ mb();
+ smp_message_recv(PPC_MSG_RESCHEDULE, regs);
+ }
+#if 0
+ if (test_and_clear_bit(PPC_MSG_MIGRATE_TASK,
+ &xics_ipi_message[cpu].value)) {
+ mb();
+ smp_message_recv(PPC_MSG_MIGRATE_TASK, regs);
+ }
+#endif
+#ifdef CONFIG_DEBUGGER
+ if (test_and_clear_bit(PPC_MSG_DEBUGGER_BREAK,
+ &xics_ipi_message[cpu].value)) {
+ mb();
+ smp_message_recv(PPC_MSG_DEBUGGER_BREAK, regs);
+ }
+#endif
+ }
+ return IRQ_HANDLED; /* always reported as handled */
+}
+
+void xics_cause_IPI(int cpu)
+{
+ ops->qirr_info(cpu, IPI_PRIORITY); /* the message itself was already recorded in xics_ipi_message[] by smp_xics_do_message() */
+}
+#endif /* CONFIG_SMP */
+
+void xics_setup_cpu(void)
+{
+ int cpu = smp_processor_id();
+
+ ops->cppr_info(cpu, 0xff); /* set this cpu's cppr to 0xff through the active access method */
+ iosync();
+
+ /*
+ * Put the calling processor into the GIQ. This is really only
+ * necessary from a secondary thread as the OF start-cpu interface
+ * performs this function for us on primary threads.
+ *
+ * The indicator index passed to RTAS is computed from
+ * interrupt_server_size and default_distrib_server; the return
+ * value of rtas_set_indicator() is not checked.
+ *
+ * XXX: undo of teardown on kexec needs this too, as may hotplug
+ */
+ rtas_set_indicator(GLOBAL_INTERRUPT_QUEUE,
+ (1UL << interrupt_server_size) - 1 - default_distrib_server, 1);
+}
+
+/*
+ * Boot-time XICS initialisation.
+ *
+ * Looks up the RTAS tokens for get/set-xive and int-on/off, records the
+ * address and size of every interrupt presentation area described in the
+ * device tree, reads the boot cpu's interrupt server numbers, creates the
+ * 8259 cascade mapping when an "interrupt-controller" node exists, then
+ * either selects the LPAR ops or ioremaps the presentation areas, installs
+ * the irq handlers and finally calls xics_setup_cpu() for the boot cpu.
+ *
+ * Panics if no presentation node is found, if such a node has no "reg"
+ * property, or if the ISA controller node has no "interrupts" property.
+ */
+void xics_init_IRQ(void)
+{
+	int i;
+	unsigned long intr_size = 0;
+	struct device_node *np;
+	uint *ireg, ilen, indx = 0;
+	unsigned long intr_base = 0;
+	/*
+	 * Zero-filled so that slots the device tree never describes read back
+	 * as 0/0 instead of whatever happened to be on the stack.
+	 */
+	struct xics_interrupt_node {
+		unsigned long addr;
+		unsigned long size;
+	} intnodes[NR_CPUS] = { { 0, 0 } };
+
+	ppc64_boot_msg(0x20, "XICS Init");
+
+	ibm_get_xive = rtas_token("ibm,get-xive");
+	ibm_set_xive = rtas_token("ibm,set-xive");
+	ibm_int_on  = rtas_token("ibm,int-on");
+	ibm_int_off = rtas_token("ibm,int-off");
+
+	np = of_find_node_by_type(NULL, "PowerPC-External-Interrupt-Presentation");
+	if (!np)
+		panic("xics_init_IRQ: can't find interrupt presentation");
+
+nextnode:
+	ireg = (uint *)get_property(np, "ibm,interrupt-server-ranges", NULL);
+	if (ireg) {
+		/*
+		 * set node starting index for this node
+		 */
+		indx = *ireg;
+	}
+
+	ireg = (uint *)get_property(np, "reg", &ilen);
+	if (!ireg)
+		panic("xics_init_IRQ: can't find interrupt reg property");
+
+	/*
+	 * Each entry is four cells: a 64-bit address followed by a 64-bit
+	 * size.  Only consume complete entries (ilen is unsigned, so
+	 * subtracting past zero would wrap), and test indx before storing
+	 * because the starting index comes straight from the device tree and
+	 * may already be outside intnodes[].
+	 */
+	while (ilen >= 4 * sizeof(uint) && indx < NR_CPUS) {
+		intnodes[indx].addr = (unsigned long)*ireg++ << 32;
+		intnodes[indx].addr |= *ireg++;
+		intnodes[indx].size = (unsigned long)*ireg++ << 32;
+		intnodes[indx].size |= *ireg++;
+		ilen -= 4 * sizeof(uint);
+		indx++;
+	}
+
+	np = of_find_node_by_type(np, "PowerPC-External-Interrupt-Presentation");
+	if ((indx < NR_CPUS) && np) goto nextnode;
+
+	/* Find the server numbers for the boot cpu. */
+	for (np = of_find_node_by_type(NULL, "cpu");
+	     np;
+	     np = of_find_node_by_type(np, "cpu")) {
+		ireg = (uint *)get_property(np, "reg", &ilen);
+		if (ireg && ireg[0] == boot_cpuid_phys) {
+			ireg = (uint *)get_property(np, "ibm,ppc-interrupt-gserver#s",
+						    &ilen);
+			i = ilen / sizeof(int);
+			if (ireg && i > 0) {
+				default_server = ireg[0];
+				default_distrib_server = ireg[i-1]; /* take last element */
+			}
+			ireg = (uint *)get_property(np,
+					"ibm,interrupt-server#-size", NULL);
+			if (ireg)
+				interrupt_server_size = *ireg;
+			break;
+		}
+	}
+	of_node_put(np);
+
+	intr_base = intnodes[0].addr;
+	intr_size = intnodes[0].size;
+
+	np = of_find_node_by_type(NULL, "interrupt-controller");
+	if (!np) {
+		printk(KERN_WARNING "xics: no ISA interrupt controller\n");
+		xics_irq_8259_cascade_real = -1;
+		xics_irq_8259_cascade = -1;
+	} else {
+		ireg = (uint *) get_property(np, "interrupts", NULL);
+		if (!ireg)
+			panic("xics_init_IRQ: can't find ISA interrupts property");
+
+		xics_irq_8259_cascade_real = *ireg;
+		xics_irq_8259_cascade
+			= virt_irq_create_mapping(xics_irq_8259_cascade_real);
+		of_node_put(np);
+	}
+
+	if (platform_is_lpar())
+		ops = &pSeriesLP_ops;
+	else {
+#ifdef CONFIG_SMP
+		for_each_cpu(i) {
+			int hard_id;
+
+			/* FIXME: Do this dynamically! --RR */
+			if (!cpu_present(i))
+				continue;
+
+			/*
+			 * NOTE(review): hard_id is used as an index into
+			 * intnodes[] without a range check; confirm it is
+			 * always below NR_CPUS on supported machines.
+			 */
+			hard_id = get_hard_smp_processor_id(i);
+			xics_per_cpu[i] = ioremap(intnodes[hard_id].addr,
+						  intnodes[hard_id].size);
+		}
+#else
+		xics_per_cpu[0] = ioremap(intr_base, intr_size);
+#endif /* CONFIG_SMP */
+	}
+
+	/* The first 16 irqs go through the 8259 handlers, the rest are XICS. */
+	xics_8259_pic.enable = i8259_pic.enable;
+	xics_8259_pic.disable = i8259_pic.disable;
+	xics_8259_pic.end = i8259_pic.end;
+	for (i = 0; i < 16; ++i)
+		get_irq_desc(i)->handler = &xics_8259_pic;
+	for (; i < NR_IRQS; ++i)
+		get_irq_desc(i)->handler = &xics_pic;
+
+	xics_setup_cpu();
+
+	ppc64_boot_msg(0x21, "XICS Done");
+}
+
+/*
+ * Hook up the 8259 cascade.  This cannot be done from init_IRQ because
+ * request_irq() needs the memory subsystem to be up, so it runs as an
+ * arch initcall instead.  Always returns 0.
+ */
+static int __init xics_setup_i8259(void)
+{
+	/* Nothing to do unless XICS is the controller and a cascade exists. */
+	if (ppc64_interrupt_controller != IC_PPC_XIC)
+		return 0;
+	if (xics_irq_8259_cascade == -1)
+		return 0;
+
+	if (request_irq(irq_offset_up(xics_irq_8259_cascade), no_action, 0,
+			"8259 cascade", NULL))
+		printk(KERN_ERR
+		       "xics_setup_i8259: couldn't get 8259 cascade\n");
+
+	/* The 8259 is initialised even if the cascade irq was not obtained. */
+	i8259_init(0, 0);
+
+	return 0;
+}
+arch_initcall(xics_setup_i8259);
+
+#ifdef CONFIG_SMP
+/*
+ * Register xics_ipi_action() as the handler for the XICS IPI and mark the
+ * irq descriptor as per-cpu.
+ *
+ * XICS_IPI is mapped one-to-one in virt_irq_to_real_map[] first.  A
+ * request_irq() failure is reported rather than silently ignored; the
+ * function still returns normally because it has no way to signal failure
+ * to its caller.
+ */
+void xics_request_IPIs(void)
+{
+	unsigned int ipi_virq;
+
+	virt_irq_to_real_map[XICS_IPI] = XICS_IPI;
+	ipi_virq = irq_offset_up(XICS_IPI);
+
+	/* IPIs are marked SA_INTERRUPT as they must run with irqs disabled */
+	if (request_irq(ipi_virq, xics_ipi_action, SA_INTERRUPT,
+			"IPI", NULL))
+		printk(KERN_ERR "xics_request_IPIs: couldn't get IPI irq\n");
+
+	get_irq_desc(ipi_virq)->status |= IRQ_PER_CPU;
+}
+#endif
+
+/*
+ * Retarget an interrupt through RTAS.
+ *
+ * @virq:    virtual irq number (offset form); translated to the real irq
+ * @cpumask: requested cpus
+ *
+ * Only two targets are implemented: CPU_MASK_ALL selects
+ * default_distrib_server, anything else selects the first online cpu in
+ * the mask.  The IPI, unmapped irqs and masks containing no online cpu are
+ * left untouched.  RTAS failures are logged and otherwise ignored.
+ */
+static void xics_set_affinity(unsigned int virq, cpumask_t cpumask)
+{
+	/*
+	 * Output words of ibm,get-xive; word 1 (presumably the priority --
+	 * confirm against the RTAS definition) is handed back unchanged to
+	 * ibm,set-xive.
+	 */
+	int xive[2];
+	cpumask_t online_targets = CPU_MASK_NONE;
+	unsigned long server;
+	unsigned int hwirq;
+	int rc;
+
+	hwirq = virt_irq_to_real(irq_offset_down(virq));
+	if (hwirq == NO_IRQ || hwirq == XICS_IPI)
+		return;
+
+	rc = rtas_call(ibm_get_xive, 1, 3, xive, hwirq);
+	if (rc) {
+		printk(KERN_ERR
+		       "xics_set_affinity: irq=%u ibm,get-xive returns %d\n",
+		       hwirq, rc);
+		return;
+	}
+
+	/* For the moment only implement delivery to all cpus or one cpu */
+	if (!cpus_equal(cpumask, CPU_MASK_ALL)) {
+		cpus_and(online_targets, cpu_online_map, cpumask);
+		if (cpus_empty(online_targets))
+			return;
+		server = get_hard_smp_processor_id(first_cpu(online_targets));
+	} else {
+		server = default_distrib_server;
+	}
+
+	rc = rtas_call(ibm_set_xive, 3, 1, NULL, hwirq, server, xive[1]);
+	if (rc)
+		printk(KERN_ERR
+		       "xics_set_affinity: irq=%u ibm,set-xive returns %d\n",
+		       hwirq, rc);
+}
+
+/*
+ * Quiesce the calling cpu's XICS state.
+ *
+ * @secondary: non-zero when called on a secondary cpu; only then is the
+ *             IPI EOI'd and the cpu removed from the global interrupt
+ *             queue.
+ *
+ * In both cases 0x00 is first written through the cppr_info hook.
+ */
+void xics_teardown_cpu(int secondary)
+{
+	unsigned long giq_indicator;
+	int this_cpu = smp_processor_id();
+
+	ops->cppr_info(this_cpu, 0x00);
+	iosync();
+
+	/*
+	 * Some machines need to have at least one cpu in the GIQ,
+	 * so the master cpu stays in the group.
+	 */
+	if (!secondary)
+		return;
+
+	/*
+	 * we need to EOI the IPI if we got here from kexec down IPI
+	 *
+	 * probably need to check all the other interrupts too
+	 * should we be flagging idle loop instead?
+	 * or creating some task to be scheduled?
+	 */
+	ops->xirr_info_set(this_cpu, XICS_IPI);
+
+	giq_indicator = (1UL << interrupt_server_size) - 1 -
+			default_distrib_server;
+	rtas_set_indicator(GLOBAL_INTERRUPT_QUEUE, giq_indicator, 0);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Move interrupts off the calling cpu (built only with CONFIG_HOTPLUG_CPU).
+ * Interrupts are disabled.
+ *
+ * The cpu writes 0 through the cppr_info hook, leaves the global interrupt
+ * queue, then writes DEFAULT_PRIORITY so IPIs are accepted again.  After
+ * that, every virq at or above irq_offset_value() whose ibm,get-xive
+ * server word equals this cpu's hard id has its affinity reset to
+ * CPU_MASK_ALL.
+ */
+void xics_migrate_irqs_away(void)
+{
+ int status;
+ unsigned int irq, virq, cpu = smp_processor_id();
+
+ /* Reject any interrupt that was queued to us... */
+ ops->cppr_info(cpu, 0);
+ iosync();
+
+ /*
+ * Remove ourselves from the global interrupt queue; a negative
+ * status only triggers a warning, the migration carries on.
+ */
+ status = rtas_set_indicator(GLOBAL_INTERRUPT_QUEUE,
+ (1UL << interrupt_server_size) - 1 - default_distrib_server, 0);
+ WARN_ON(status < 0);
+
+ /* Allow IPIs again... */
+ ops->cppr_info(cpu, DEFAULT_PRIORITY);
+ iosync();
+
+ for_each_irq(virq) {
+ irq_desc_t *desc;
+ int xics_status[2]; /* output words of ibm,get-xive; [0] is compared
+ * with this cpu's hard id below */
+ unsigned long flags;
+
+ /* We can't set affinity on ISA interrupts */
+ if (virq < irq_offset_value())
+ continue;
+
+ desc = get_irq_desc(virq);
+ irq = virt_irq_to_real(irq_offset_down(virq));
+
+ /* We need to get IPIs still; unmapped irqs are skipped too. */
+ if (irq == XICS_IPI || irq == NO_IRQ)
+ continue;
+
+ /*
+ * We only need to migrate enabled IRQs: skip descriptors
+ * with no handler, no action, or no set_affinity hook.
+ */
+ if (desc == NULL || desc->handler == NULL
+ || desc->action == NULL
+ || desc->handler->set_affinity == NULL)
+ continue;
+
+ spin_lock_irqsave(&desc->lock, flags);
+
+ status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq);
+ if (status) {
+ printk(KERN_ERR "migrate_irqs_away: irq=%u "
+ "ibm,get-xive returns %d\n",
+ virq, status); /* note: logs virq under the "irq="
+ * label although the RTAS call
+ * above was made with the real irq */
+ goto unlock;
+ }
+
+ /*
+ * We only support delivery to all cpus or to one cpu.
+ * The irq has to be migrated only in the single cpu
+ * case, i.e. when it currently targets this cpu.
+ */
+ if (xics_status[0] != get_hard_smp_processor_id(cpu))
+ goto unlock;
+
+ printk(KERN_WARNING "IRQ %u affinity broken off cpu %u\n",
+ virq, cpu);
+
+ /*
+ * Reset affinity to all cpus; the set_affinity hook is called
+ * with desc->lock held.
+ */
+ desc->handler->set_affinity(virq, CPU_MASK_ALL);
+ irq_affinity[virq] = CPU_MASK_ALL;
+unlock:
+ spin_unlock_irqrestore(&desc->lock, flags);
+ }
+}
+#endif
diff --git a/arch/powerpc/platforms/pseries/xics.h b/arch/powerpc/platforms/pseries/xics.h
new file mode 100644
index 000000000000..e14c70868f1d
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/xics.h
@@ -0,0 +1,34 @@
+/*
+ * arch/powerpc/platforms/pseries/xics.h
+ *
+ * Copyright 2000 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _POWERPC_KERNEL_XICS_H
+#define _POWERPC_KERNEL_XICS_H
+
+#include <linux/cache.h>
+
+void xics_init_IRQ(void); /* boot-time XICS discovery and irq handler setup */
+int xics_get_irq(struct pt_regs *);
+void xics_setup_cpu(void); /* writes CPPR 0xff for the calling cpu and adds it
+ * to the global interrupt queue */
+void xics_teardown_cpu(int secondary); /* writes CPPR 0; when @secondary is
+ * non-zero also EOIs the IPI and
+ * leaves the global interrupt queue */
+void xics_cause_IPI(int cpu); /* defined under CONFIG_SMP only */
+void xics_request_IPIs(void); /* defined under CONFIG_SMP only; requests the
+ * XICS_IPI irq */
+void xics_migrate_irqs_away(void); /* defined under CONFIG_HOTPLUG_CPU only */
+
+/* first argument is ignored for now */
+void pSeriesLP_cppr_info(int n_cpu, u8 value);
+
+struct xics_ipi_struct { /* one pending-IPI word per cpu, cacheline aligned */
+ volatile unsigned long value; /* bit numbers are PPC_MSG_* values, consumed
+ * with test_and_clear_bit() in the IPI
+ * handler */
+} ____cacheline_aligned;
+
+extern struct xics_ipi_struct xics_ipi_message[NR_CPUS] __cacheline_aligned; /* indexed by cpu number */
+
+#endif /* _POWERPC_KERNEL_XICS_H */