From 674c6479b7bdc78528ea83dd43897e3161558b8b Mon Sep 17 00:00:00 2001
From: Colin Ngam <cngam@sgi.com>
Date: Wed, 3 Aug 2005 13:35:00 -0700
Subject: [IA64-SGI] Altix only: Add PCI Domain number support.

This patch enables PCI Domain numbering on Altix.

Signed-off-by: Colin Ngam <cngam@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/include/xtalk/hubdev.h     |  6 ++++--
 arch/ia64/sn/kernel/io_init.c           | 33 +++++++++++++++++++++++++++------
 arch/ia64/sn/pci/pcibr/pcibr_provider.c | 11 +++++++----
 arch/ia64/sn/pci/tioca_provider.c       |  5 +++--
 4 files changed, 41 insertions(+), 14 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/sn/include/xtalk/hubdev.h b/arch/ia64/sn/include/xtalk/hubdev.h
index 580a1c0403a7..ca4e250cdf27 100644
--- a/arch/ia64/sn/include/xtalk/hubdev.h
+++ b/arch/ia64/sn/include/xtalk/hubdev.h
@@ -34,7 +34,8 @@ struct sn_flush_device_list {
 	unsigned long sfdl_force_int_addr;
 	unsigned long sfdl_flush_value;
 	volatile unsigned long *sfdl_flush_addr;
-	uint64_t sfdl_persistent_busnum;
+	uint32_t sfdl_persistent_busnum;
+	uint32_t sfdl_persistent_segment;
 	struct pcibus_info *sfdl_pcibus_info;
 	spinlock_t sfdl_flush_lock;
 };
@@ -58,7 +59,8 @@ struct hubdev_info {
 
 	void				*hdi_nodepda;
 	void				*hdi_node_vertex;
-	void				*hdi_xtalk_vertex;
+	uint32_t			max_segment_number;
+	uint32_t			max_pcibus_number;
 };
 
 extern void hubdev_init_node(nodepda_t *, cnodeid_t);
diff --git a/arch/ia64/sn/kernel/io_init.c b/arch/ia64/sn/kernel/io_init.c
index a6649baf629a..829ea7985017 100644
--- a/arch/ia64/sn/kernel/io_init.c
+++ b/arch/ia64/sn/kernel/io_init.c
@@ -44,6 +44,9 @@ int sn_ioif_inited = 0;		/* SN I/O infrastructure initialized? */
 
 struct sn_pcibus_provider *sn_pci_provider[PCIIO_ASIC_MAX_TYPES];	/* indexed by asic type */
 
+static int max_segment_number = 0; /* Default highest segment number */
+static int max_pcibus_number = 255; /* Default highest pci bus number */
+
 /*
  * Hooks and struct for unsupported pci providers
  */
@@ -157,13 +160,28 @@ static void sn_fixup_ionodes(void)
 	uint64_t nasid;
 	int i, widget;
 
+	/*
+	 * Get SGI Specific HUB chipset information.
+	 * Inform Prom that this kernel can support domain bus numbering.
+	 */
 	for (i = 0; i < numionodes; i++) {
 		hubdev = (struct hubdev_info *)(NODEPDA(i)->pdinfo);
 		nasid = cnodeid_to_nasid(i);
+		hubdev->max_segment_number = 0xffffffff;
+		hubdev->max_pcibus_number = 0xff;
 		status = sal_get_hubdev_info(nasid, (uint64_t) __pa(hubdev));
 		if (status)
 			continue;
 
+		/* Save the largest Domain and pcibus numbers found. */
+		if (hubdev->max_segment_number) {
+			/*
+			 * Dealing with a Prom that supports segments.
+			 */
+			max_segment_number = hubdev->max_segment_number;
+			max_pcibus_number = hubdev->max_pcibus_number;
+		}
+
 		/* Attach the error interrupt handlers */
 		if (nasid & 1)
 			ice_error_init(hubdev);
@@ -229,7 +247,7 @@ void sn_pci_unfixup_slot(struct pci_dev *dev)
 void sn_pci_fixup_slot(struct pci_dev *dev)
 {
 	int idx;
-	int segment = 0;
+	int segment = pci_domain_nr(dev->bus);
 	int status = 0;
 	struct pcibus_bussoft *bs;
  	struct pci_bus *host_pci_bus;
@@ -282,9 +300,9 @@ void sn_pci_fixup_slot(struct pci_dev *dev)
  	 * PCI host_pci_dev struct and set up host bus linkages
  	 */
 
- 	bus_no = SN_PCIDEV_INFO(dev)->pdi_slot_host_handle >> 32;
+ 	bus_no = (SN_PCIDEV_INFO(dev)->pdi_slot_host_handle >> 32) & 0xff;
  	devfn = SN_PCIDEV_INFO(dev)->pdi_slot_host_handle & 0xffffffff;
- 	host_pci_bus = pci_find_bus(pci_domain_nr(dev->bus), bus_no);
+ 	host_pci_bus = pci_find_bus(segment, bus_no);
  	host_pci_dev = pci_get_slot(host_pci_bus, devfn);
 
 	SN_PCIDEV_INFO(dev)->host_pci_dev = host_pci_dev;
@@ -332,6 +350,7 @@ void sn_pci_controller_fixup(int segment, int busnum, struct pci_bus *bus)
 	prom_bussoft_ptr = __va(prom_bussoft_ptr);
 
  	controller = kcalloc(1,sizeof(struct pci_controller), GFP_KERNEL);
+	controller->segment = segment;
  	if (!controller)
  		BUG();
 
@@ -387,7 +406,7 @@ void sn_pci_controller_fixup(int segment, int busnum, struct pci_bus *bus)
 	if (controller->node >= num_online_nodes()) {
 		struct pcibus_bussoft *b = SN_PCIBUS_BUSSOFT(bus);
 
-		printk(KERN_WARNING "Device ASIC=%u XID=%u PBUSNUM=%lu"
+		printk(KERN_WARNING "Device ASIC=%u XID=%u PBUSNUM=%u"
 				    "L_IO=%lx L_MEM=%lx BASE=%lx\n",
 			b->bs_asic_type, b->bs_xid, b->bs_persist_busnum,
 			b->bs_legacy_io, b->bs_legacy_mem, b->bs_base);
@@ -442,6 +461,7 @@ sn_sysdata_free_start:
 static int __init sn_pci_init(void)
 {
 	int i = 0;
+	int j = 0;
 	struct pci_dev *pci_dev = NULL;
 	extern void sn_init_cpei_timer(void);
 #ifdef CONFIG_PROC_FS
@@ -476,8 +496,9 @@ static int __init sn_pci_init(void)
 #endif
 
 	/* busses are not known yet ... */
-	for (i = 0; i < PCI_BUSES_TO_SCAN; i++)
-		sn_pci_controller_fixup(0, i, NULL);
+	for (i = 0; i <= max_segment_number; i++)
+		for (j = 0; j <= max_pcibus_number; j++)
+			sn_pci_controller_fixup(i, j, NULL);
 
 	/*
 	 * Generic Linux PCI Layer has created the pci_bus and pci_dev 
diff --git a/arch/ia64/sn/pci/pcibr/pcibr_provider.c b/arch/ia64/sn/pci/pcibr/pcibr_provider.c
index b95e928636a1..dfaf01e5be80 100644
--- a/arch/ia64/sn/pci/pcibr/pcibr_provider.c
+++ b/arch/ia64/sn/pci/pcibr/pcibr_provider.c
@@ -60,7 +60,7 @@ static int sal_pcibr_error_interrupt(struct pcibus_info *soft)
 	ret_stuff.status = 0;
 	ret_stuff.v0 = 0;
 
-	segment = 0;
+	segment = soft->pbi_buscommon.bs_persist_segment;
 	busnum = soft->pbi_buscommon.bs_persist_busnum;
 	SAL_CALL_NOLOCK(ret_stuff,
 			(u64) SN_SAL_IOIF_ERROR_INTERRUPT,
@@ -142,9 +142,12 @@ pcibr_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *cont
 			     j++, sn_flush_device_list++) {
 				if (sn_flush_device_list->sfdl_slot == -1)
 					continue;
-				if (sn_flush_device_list->
-				    sfdl_persistent_busnum ==
-				    soft->pbi_buscommon.bs_persist_busnum)
+				if ((sn_flush_device_list->
+				     sfdl_persistent_segment ==
+				     soft->pbi_buscommon.bs_persist_segment) &&
+				     (sn_flush_device_list->
+				     sfdl_persistent_busnum ==
+				     soft->pbi_buscommon.bs_persist_busnum))
 					sn_flush_device_list->sfdl_pcibus_info =
 					    soft;
 			}
diff --git a/arch/ia64/sn/pci/tioca_provider.c b/arch/ia64/sn/pci/tioca_provider.c
index 5d76a7581465..2fef7c10d6e5 100644
--- a/arch/ia64/sn/pci/tioca_provider.c
+++ b/arch/ia64/sn/pci/tioca_provider.c
@@ -559,7 +559,7 @@ tioca_error_intr_handler(int irq, void *arg, struct pt_regs *pt)
 	ret_stuff.status = 0;
 	ret_stuff.v0 = 0;
 
-	segment = 0;
+	segment = soft->ca_common.bs_persist_segment;
 	busnum = soft->ca_common.bs_persist_busnum;
 
 	SAL_CALL_NOLOCK(ret_stuff,
@@ -622,7 +622,8 @@ tioca_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *cont
 	    nasid_to_cnodeid(tioca_common->ca_closest_nasid);
 	tioca_common->ca_kernel_private = (uint64_t) tioca_kern;
 
-	bus = pci_find_bus(0, tioca_common->ca_common.bs_persist_busnum);
+	bus = pci_find_bus(tioca_common->ca_common.bs_persist_segment,
+		tioca_common->ca_common.bs_persist_busnum);
 	BUG_ON(!bus);
 	tioca_kern->ca_devices = &bus->devices;
 
-- 
cgit v1.2.3


From 89963d16dc50a5d91ed09914a1232d59e6461fd6 Mon Sep 17 00:00:00 2001
From: Mark Maule <maule@sgi.com>
Date: Wed, 3 Aug 2005 14:06:00 -0700
Subject: [IA64-SGI] altix: cosmetic rename of SGI_PCIBR_ERROR

Cosmetic altix patch to rename SGI_PCIBR_ERROR to something more generic and
remove a duplicate #define.

Signed-off-by: Mark Maule <maule@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/pci/pcibr/pcibr_provider.c | 2 +-
 include/asm-ia64/sn/intr.h              | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/sn/pci/pcibr/pcibr_provider.c b/arch/ia64/sn/pci/pcibr/pcibr_provider.c
index dfaf01e5be80..f8cfe8acdc3c 100644
--- a/arch/ia64/sn/pci/pcibr/pcibr_provider.c
+++ b/arch/ia64/sn/pci/pcibr/pcibr_provider.c
@@ -115,7 +115,7 @@ pcibr_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *cont
 	/*
 	 * register the bridge's error interrupt handler
 	 */
-	if (request_irq(SGI_PCIBR_ERROR, (void *)pcibr_error_intr_handler,
+	if (request_irq(SGI_PCIASIC_ERROR, (void *)pcibr_error_intr_handler,
 			SA_SHIRQ, "PCIBR error", (void *)(soft))) {
 		printk(KERN_WARNING
 		       "pcibr cannot allocate interrupt for error handler\n");
diff --git a/include/asm-ia64/sn/intr.h b/include/asm-ia64/sn/intr.h
index e190dd4213d5..e35074f526d9 100644
--- a/include/asm-ia64/sn/intr.h
+++ b/include/asm-ia64/sn/intr.h
@@ -12,13 +12,12 @@
 #include <linux/rcupdate.h>
 
 #define SGI_UART_VECTOR		(0xe9)
-#define SGI_PCIBR_ERROR		(0x33)
 
 /* Reserved IRQs : Note, not to exceed IA64_SN2_FIRST_DEVICE_VECTOR */
 #define SGI_XPC_ACTIVATE                (0x30)
 #define SGI_II_ERROR                    (0x31)
 #define SGI_XBOW_ERROR                  (0x32)
-#define SGI_PCIBR_ERROR                 (0x33)
+#define SGI_PCIASIC_ERROR               (0x33)
 #define SGI_ACPI_SCI_INT                (0x34)
 #define SGI_TIOCA_ERROR                 (0x35)
 #define SGI_TIO_ERROR                   (0x36)
-- 
cgit v1.2.3


From 735e60f4c67823a3e01655c990296e2e56574885 Mon Sep 17 00:00:00 2001
From: Mark Maule <maule@sgi.com>
Date: Wed, 3 Aug 2005 14:06:00 -0700
Subject: [IA64-SGI] abstract force_interrupt() mechanism

Altix patch to abstract the force_interrupt() mechanism away from the
pcibr provider.

Signed-off-by: Mark Maule <maule@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/kernel/irq.c                  | 50 +++++++++++++++++-------------
 arch/ia64/sn/pci/pcibr/pcibr_provider.c    |  4 +++
 arch/ia64/sn/pci/tioca_provider.c          |  1 +
 include/asm-ia64/sn/pcibus_provider_defs.h |  1 +
 include/asm-ia64/sn/pda.h                  |  1 -
 5 files changed, 34 insertions(+), 23 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c
index 84d276a14ecb..af061dba246f 100644
--- a/arch/ia64/sn/kernel/irq.c
+++ b/arch/ia64/sn/kernel/irq.c
@@ -317,6 +317,16 @@ void sn_irq_unfixup(struct pci_dev *pci_dev)
 	pci_dev_put(pci_dev);
 }
 
+static inline void
+sn_call_force_intr_provider(struct sn_irq_info *sn_irq_info)
+{
+	struct sn_pcibus_provider *pci_provider;
+
+	pci_provider = sn_pci_provider[sn_irq_info->irq_bridge_type];
+	if (pci_provider && pci_provider->force_interrupt)
+		(*pci_provider->force_interrupt)(sn_irq_info);
+}
+
 static void force_interrupt(int irq)
 {
 	struct sn_irq_info *sn_irq_info;
@@ -325,11 +335,9 @@ static void force_interrupt(int irq)
 		return;
 
 	rcu_read_lock();
-	list_for_each_entry_rcu(sn_irq_info, sn_irq_lh[irq], list) {
-		if (IS_PCI_BRIDGE_ASIC(sn_irq_info->irq_bridge_type) &&
-		    (sn_irq_info->irq_bridge != NULL))
-			pcibr_force_interrupt(sn_irq_info);
-	}
+	list_for_each_entry_rcu(sn_irq_info, sn_irq_lh[irq], list)
+		sn_call_force_intr_provider(sn_irq_info);
+
 	rcu_read_unlock();
 }
 
@@ -351,6 +359,14 @@ static void sn_check_intr(int irq, struct sn_irq_info *sn_irq_info)
 	struct pcidev_info *pcidev_info;
 	struct pcibus_info *pcibus_info;
 
+	/*
+	 * Bridge types attached to TIO (anything but PIC) do not need this WAR
+	 * since they do not target Shub II interrupt registers.  If that
+	 * ever changes, this check needs to accomodate.
+	 */
+	if (sn_irq_info->irq_bridge_type != PCIIO_ASIC_TYPE_PIC)
+		return;
+
 	pcidev_info = (struct pcidev_info *)sn_irq_info->irq_pciioinfo;
 	if (!pcidev_info)
 		return;
@@ -377,16 +393,12 @@ static void sn_check_intr(int irq, struct sn_irq_info *sn_irq_info)
 		break;
 	}
 	if (!test_bit(irr_bit, &irr_reg)) {
-		if (!test_bit(irq, pda->sn_soft_irr)) {
-			if (!test_bit(irq, pda->sn_in_service_ivecs)) {
-				regval &= 0xff;
-				if (sn_irq_info->irq_int_bit & regval &
-				    sn_irq_info->irq_last_intr) {
-					regval &=
-					    ~(sn_irq_info->
-					      irq_int_bit & regval);
-					pcibr_force_interrupt(sn_irq_info);
-				}
+		if (!test_bit(irq, pda->sn_in_service_ivecs)) {
+			regval &= 0xff;
+			if (sn_irq_info->irq_int_bit & regval &
+			    sn_irq_info->irq_last_intr) {
+				regval &= ~(sn_irq_info->irq_int_bit & regval);
+				sn_call_force_intr_provider(sn_irq_info);
 			}
 		}
 	}
@@ -404,13 +416,7 @@ void sn_lb_int_war_check(void)
 	rcu_read_lock();
 	for (i = pda->sn_first_irq; i <= pda->sn_last_irq; i++) {
 		list_for_each_entry_rcu(sn_irq_info, sn_irq_lh[i], list) {
-			/*
-			 * Only call for PCI bridges that are fully
-			 * initialized.
-			 */
-			if (IS_PCI_BRIDGE_ASIC(sn_irq_info->irq_bridge_type) &&
-			    (sn_irq_info->irq_bridge != NULL))
-				sn_check_intr(i, sn_irq_info);
+			sn_check_intr(i, sn_irq_info);
 		}
 	}
 	rcu_read_unlock();
diff --git a/arch/ia64/sn/pci/pcibr/pcibr_provider.c b/arch/ia64/sn/pci/pcibr/pcibr_provider.c
index f8cfe8acdc3c..ff9c7de925d1 100644
--- a/arch/ia64/sn/pci/pcibr/pcibr_provider.c
+++ b/arch/ia64/sn/pci/pcibr/pcibr_provider.c
@@ -178,6 +178,9 @@ void pcibr_force_interrupt(struct sn_irq_info *sn_irq_info)
 	struct pcibus_info *pcibus_info;
 	int bit = sn_irq_info->irq_int_bit;
 
+	if (! sn_irq_info->irq_bridge)
+		return;
+
 	pcidev_info = (struct pcidev_info *)sn_irq_info->irq_pciioinfo;
 	if (pcidev_info) {
 		pcibus_info =
@@ -222,6 +225,7 @@ struct sn_pcibus_provider pcibr_provider = {
 	.dma_map_consistent = pcibr_dma_map_consistent,
 	.dma_unmap = pcibr_dma_unmap,
 	.bus_fixup = pcibr_bus_fixup,
+	.force_interrupt = pcibr_force_interrupt
 };
 
 int
diff --git a/arch/ia64/sn/pci/tioca_provider.c b/arch/ia64/sn/pci/tioca_provider.c
index 2fef7c10d6e5..4ea04cfa30ff 100644
--- a/arch/ia64/sn/pci/tioca_provider.c
+++ b/arch/ia64/sn/pci/tioca_provider.c
@@ -657,6 +657,7 @@ static struct sn_pcibus_provider tioca_pci_interfaces = {
 	.dma_map_consistent = tioca_dma_map,
 	.dma_unmap = tioca_dma_unmap,
 	.bus_fixup = tioca_bus_fixup,
+	.force_interrupt = NULL
 };
 
 /**
diff --git a/include/asm-ia64/sn/pcibus_provider_defs.h b/include/asm-ia64/sn/pcibus_provider_defs.h
index b01b21eec55a..5c3ba5708e82 100644
--- a/include/asm-ia64/sn/pcibus_provider_defs.h
+++ b/include/asm-ia64/sn/pcibus_provider_defs.h
@@ -48,6 +48,7 @@ struct sn_pcibus_provider {
 	dma_addr_t	(*dma_map_consistent)(struct pci_dev *, unsigned long, size_t);
 	void		(*dma_unmap)(struct pci_dev *, dma_addr_t, int);
 	void *		(*bus_fixup)(struct pcibus_bussoft *, struct pci_controller *);
+ 	void		(*force_interrupt)(struct sn_irq_info *);
 };
 
 extern struct sn_pcibus_provider *sn_pci_provider[];
diff --git a/include/asm-ia64/sn/pda.h b/include/asm-ia64/sn/pda.h
index ea5590c76ca4..1c5108d44d8b 100644
--- a/include/asm-ia64/sn/pda.h
+++ b/include/asm-ia64/sn/pda.h
@@ -39,7 +39,6 @@ typedef struct pda_s {
 	unsigned long pio_write_status_val;
 	volatile unsigned long *pio_shub_war_cam_addr;
 
-	unsigned long	sn_soft_irr[4];
 	unsigned long	sn_in_service_ivecs[4];
 	int		sn_lb_int_war_ticks;
 	int		sn_last_irq;
-- 
cgit v1.2.3


From 5b53ed1f2ed6c85e2b1c39d97cc112ea32004609 Mon Sep 17 00:00:00 2001
From: Mark Maule <maule@sgi.com>
Date: Wed, 3 Aug 2005 14:06:00 -0700
Subject: [IA64-SGI] add support for TIO huge-window

Altix patch to add TIO "huge-window" address support to sn_dma_flush().

Update copyright in affected files.

Signed-off-by: Mark Maule <maule@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/include/tio.h          |  6 +++-
 arch/ia64/sn/include/xtalk/hubdev.h |  5 +++-
 arch/ia64/sn/pci/pcibr/pcibr_dma.c  | 55 ++++++++++++++++++++++---------------
 include/asm-ia64/sn/addrs.h         |  5 ++--
 4 files changed, 45 insertions(+), 26 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/sn/include/tio.h b/arch/ia64/sn/include/tio.h
index 0139124dd54a..6b2e7b75eb19 100644
--- a/arch/ia64/sn/include/tio.h
+++ b/arch/ia64/sn/include/tio.h
@@ -3,7 +3,7 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  *
- * Copyright (C) 2000-2004 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2000-2005 Silicon Graphics, Inc. All rights reserved.
  */
 
 #ifndef _ASM_IA64_SN_TIO_H
@@ -26,6 +26,10 @@
 #define TIO_ITTE_VALID_MASK	0x1
 #define TIO_ITTE_VALID_SHIFT	16
 
+#define TIO_ITTE_WIDGET(itte) \
+	(((itte) >> TIO_ITTE_WIDGET_SHIFT) & TIO_ITTE_WIDGET_MASK)
+#define TIO_ITTE_VALID(itte) \
+	(((itte) >> TIO_ITTE_VALID_SHIFT) & TIO_ITTE_VALID_MASK)
 
 #define TIO_ITTE_PUT(nasid, bigwin, widget, addr, valid) \
         REMOTE_HUB_S((nasid), TIO_ITTE(bigwin), \
diff --git a/arch/ia64/sn/include/xtalk/hubdev.h b/arch/ia64/sn/include/xtalk/hubdev.h
index ca4e250cdf27..71c2b271b4c6 100644
--- a/arch/ia64/sn/include/xtalk/hubdev.h
+++ b/arch/ia64/sn/include/xtalk/hubdev.h
@@ -3,7 +3,7 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  *
- * Copyright (C) 1992 - 1997, 2000-2004 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 1992 - 1997, 2000-2005 Silicon Graphics, Inc. All rights reserved.
  */
 #ifndef _ASM_IA64_SN_XTALK_HUBDEV_H
 #define _ASM_IA64_SN_XTALK_HUBDEV_H
@@ -16,6 +16,9 @@
 #define IIO_ITTE_WIDGET_MASK    ((1<<IIO_ITTE_WIDGET_BITS)-1)
 #define IIO_ITTE_WIDGET_SHIFT   8
 
+#define IIO_ITTE_WIDGET(itte)	\
+	(((itte) >> IIO_ITTE_WIDGET_SHIFT) & IIO_ITTE_WIDGET_MASK)
+
 /*
  * Use the top big window as a surrogate for the first small window
  */
diff --git a/arch/ia64/sn/pci/pcibr/pcibr_dma.c b/arch/ia64/sn/pci/pcibr/pcibr_dma.c
index b058dc2a0b9d..ae455b6b1897 100644
--- a/arch/ia64/sn/pci/pcibr/pcibr_dma.c
+++ b/arch/ia64/sn/pci/pcibr/pcibr_dma.c
@@ -3,7 +3,7 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  *
- * Copyright (C) 2001-2004 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2001-2005 Silicon Graphics, Inc. All rights reserved.
  */
 
 #include <linux/types.h>
@@ -215,8 +215,8 @@ void sn_dma_flush(uint64_t addr)
 	int is_tio;
 	int wid_num;
 	int i, j;
-	int bwin;
 	uint64_t flags;
+	uint64_t itte;
 	struct hubdev_info *hubinfo;
 	volatile struct sn_flush_device_list *p;
 	struct sn_flush_nasid_entry *flush_nasid_list;
@@ -233,31 +233,36 @@ void sn_dma_flush(uint64_t addr)
 	if (!hubinfo) {
 		BUG();
 	}
-	is_tio = (nasid & 1);
-	if (is_tio) {
-		wid_num = TIO_SWIN_WIDGETNUM(addr);
-		bwin = TIO_BWIN_WINDOWNUM(addr);
-	} else {
-		wid_num = SWIN_WIDGETNUM(addr);
-		bwin = BWIN_WINDOWNUM(addr);
-	}
 
 	flush_nasid_list = &hubinfo->hdi_flush_nasid_list;
 	if (flush_nasid_list->widget_p == NULL)
 		return;
-	if (bwin > 0) {
-		uint64_t itte = flush_nasid_list->iio_itte[bwin];
 
-		if (is_tio) {
-			wid_num = (itte >> TIO_ITTE_WIDGET_SHIFT) &
-			    TIO_ITTE_WIDGET_MASK;
-		} else {
-			wid_num = (itte >> IIO_ITTE_WIDGET_SHIFT) &
-			    IIO_ITTE_WIDGET_MASK;
-		}
+	is_tio = (nasid & 1);
+	if (is_tio) {
+		int itte_index;
+
+		if (TIO_HWIN(addr))
+			itte_index = 0;
+		else if (TIO_BWIN_WINDOWNUM(addr))
+			itte_index = TIO_BWIN_WINDOWNUM(addr);
+		else
+			itte_index = -1;
+
+		if (itte_index >= 0) {
+			itte = flush_nasid_list->iio_itte[itte_index];
+			if (! TIO_ITTE_VALID(itte))
+				return;
+			wid_num = TIO_ITTE_WIDGET(itte);
+		} else
+			wid_num = TIO_SWIN_WIDGETNUM(addr);
+	} else {
+		if (BWIN_WINDOWNUM(addr)) {
+			itte = flush_nasid_list->iio_itte[BWIN_WINDOWNUM(addr)];
+			wid_num = IIO_ITTE_WIDGET(itte);
+		} else
+			wid_num = SWIN_WIDGETNUM(addr);
 	}
-	if (flush_nasid_list->widget_p == NULL)
-		return;
 	if (flush_nasid_list->widget_p[wid_num] == NULL)
 		return;
 	p = &flush_nasid_list->widget_p[wid_num][0];
@@ -283,9 +288,15 @@ void sn_dma_flush(uint64_t addr)
 	/*
 	 * For TIOCP use the Device(x) Write Request Buffer Flush Bridge
 	 * register since it ensures the data has entered the coherence
-	 * domain, unlike PIC
+	 * domain, unlike PIC.
 	 */
 	if (is_tio) {
+		/*
+	 	 * Note:  devices behind TIOCE should never be matched in the
+		 * above code, and so the following code is PIC/CP centric.
+		 * If CE ever needs the sn_dma_flush mechanism, we will have
+		 * to account for that here and in tioce_bus_fixup().
+	 	 */
 		uint32_t tio_id = REMOTE_HUB_L(nasid, TIO_NODE_ID);
 		uint32_t revnum = XWIDGET_PART_REV_NUM(tio_id);
 
diff --git a/include/asm-ia64/sn/addrs.h b/include/asm-ia64/sn/addrs.h
index 103d745dc5f2..b6e85e454456 100644
--- a/include/asm-ia64/sn/addrs.h
+++ b/include/asm-ia64/sn/addrs.h
@@ -3,7 +3,7 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  *
- * Copyright (c) 1992-1999,2001-2004 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (c) 1992-1999,2001-2005 Silicon Graphics, Inc. All rights reserved.
  */
 
 #ifndef _ASM_IA64_SN_ADDRS_H
@@ -191,7 +191,8 @@
 #define TIO_BWIN_WINDOW_SELECT_MASK	0x7
 #define TIO_BWIN_WINDOWNUM(x)		(((x) >> TIO_BWIN_SIZE_BITS) & TIO_BWIN_WINDOW_SELECT_MASK)
 
-
+#define TIO_HWIN_SHIFT_BITS		33
+#define TIO_HWIN(x)			(NODE_OFFSET(x) >> TIO_HWIN_SHIFT_BITS)
 
 /*
  * The following definitions pertain to the IO special address
-- 
cgit v1.2.3


From c9221da9f2796f082642c3498edb2c8783ad4774 Mon Sep 17 00:00:00 2001
From: Mark Maule <maule@sgi.com>
Date: Wed, 3 Aug 2005 14:07:00 -0700
Subject: [IA64-SGI] sn pci provider for TIOCE (pci

Altix patch to add an SN pci provider for TIOCE, which is SGI's
PCI Express implementation.

Signed-off-by: Mark Maule <maule@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/kernel/io_init.c              |   2 +
 arch/ia64/sn/pci/Makefile                  |   2 +-
 arch/ia64/sn/pci/tioce_provider.c          | 733 ++++++++++++++++++++++++++++
 include/asm-ia64/sn/pcibus_provider_defs.h |   3 +-
 include/asm-ia64/sn/tioce.h                | 740 +++++++++++++++++++++++++++++
 include/asm-ia64/sn/tioce_provider.h       |  66 +++
 6 files changed, 1544 insertions(+), 2 deletions(-)
 create mode 100644 arch/ia64/sn/pci/tioce_provider.c
 create mode 100644 include/asm-ia64/sn/tioce.h
 create mode 100644 include/asm-ia64/sn/tioce_provider.h

(limited to 'arch/ia64')

diff --git a/arch/ia64/sn/kernel/io_init.c b/arch/ia64/sn/kernel/io_init.c
index 829ea7985017..d1fc09b1d518 100644
--- a/arch/ia64/sn/kernel/io_init.c
+++ b/arch/ia64/sn/kernel/io_init.c
@@ -18,6 +18,7 @@
 #include <asm/sn/simulator.h>
 #include <asm/sn/sn_sal.h>
 #include <asm/sn/tioca_provider.h>
+#include <asm/sn/tioce_provider.h>
 #include "xtalk/hubdev.h"
 #include "xtalk/xwidgetdev.h"
 
@@ -481,6 +482,7 @@ static int __init sn_pci_init(void)
 
 	pcibr_init_provider();
 	tioca_init_provider();
+	tioce_init_provider();
 
 	/*
 	 * This is needed to avoid bounce limit checks in the blk layer
diff --git a/arch/ia64/sn/pci/Makefile b/arch/ia64/sn/pci/Makefile
index 2f915bce25f9..321576b1b425 100644
--- a/arch/ia64/sn/pci/Makefile
+++ b/arch/ia64/sn/pci/Makefile
@@ -7,4 +7,4 @@
 #
 # Makefile for the sn pci general routines.
 
-obj-y := pci_dma.o tioca_provider.o pcibr/ 
+obj-y := pci_dma.o tioca_provider.o tioce_provider.o pcibr/
diff --git a/arch/ia64/sn/pci/tioce_provider.c b/arch/ia64/sn/pci/tioce_provider.c
new file mode 100644
index 000000000000..d9081369cf96
--- /dev/null
+++ b/arch/ia64/sn/pci/tioce_provider.c
@@ -0,0 +1,733 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2003-2005 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <asm/sn/sn_sal.h>
+#include <asm/sn/addrs.h>
+#include <asm/sn/pcidev.h>
+#include <asm/sn/pcibus_provider_defs.h>
+#include <asm/sn/tioce_provider.h>
+
+/**
+ * Bus address ranges for the 5 flavors of TIOCE DMA
+ */
+
+#define TIOCE_D64_MIN	0x8000000000000000UL
+#define TIOCE_D64_MAX	0xffffffffffffffffUL
+#define TIOCE_D64_ADDR(a)	((a) >= TIOCE_D64_MIN)
+
+#define TIOCE_D32_MIN	0x0000000080000000UL
+#define TIOCE_D32_MAX	0x00000000ffffffffUL
+#define TIOCE_D32_ADDR(a)	((a) >= TIOCE_D32_MIN && (a) <= TIOCE_D32_MAX)
+
+#define TIOCE_M32_MIN	0x0000000000000000UL
+#define TIOCE_M32_MAX	0x000000007fffffffUL
+#define TIOCE_M32_ADDR(a)	((a) >= TIOCE_M32_MIN && (a) <= TIOCE_M32_MAX)
+
+#define TIOCE_M40_MIN	0x0000004000000000UL
+#define TIOCE_M40_MAX	0x0000007fffffffffUL
+#define TIOCE_M40_ADDR(a)	((a) >= TIOCE_M40_MIN && (a) <= TIOCE_M40_MAX)
+
+#define TIOCE_M40S_MIN	0x0000008000000000UL
+#define TIOCE_M40S_MAX	0x000000ffffffffffUL
+#define TIOCE_M40S_ADDR(a)	((a) >= TIOCE_M40S_MIN && (a) <= TIOCE_M40S_MAX)
+
+/*
+ * ATE manipulation macros.
+ */
+
+#define ATE_PAGESHIFT(ps)	(__ffs(ps))
+#define ATE_PAGEMASK(ps)	((ps)-1)
+
+#define ATE_PAGE(x, ps) ((x) >> ATE_PAGESHIFT(ps))
+#define ATE_NPAGES(start, len, pagesize) \
+	(ATE_PAGE((start)+(len)-1, pagesize) - ATE_PAGE(start, pagesize) + 1)
+
+#define ATE_VALID(ate)	((ate) & (1UL << 63))
+#define ATE_MAKE(addr, ps) (((addr) & ~ATE_PAGEMASK(ps)) | (1UL << 63))
+
+/*
+ * Flavors of ate-based mapping supported by tioce_alloc_map()
+ */
+
+#define TIOCE_ATE_M32	1
+#define TIOCE_ATE_M40	2
+#define TIOCE_ATE_M40S	3
+
+#define KB(x)	((x) << 10)
+#define MB(x)	((x) << 20)
+#define GB(x)	((x) << 30)
+
+/**
+ * tioce_dma_d64 - create a DMA mapping using 64-bit direct mode
+ * @ct_addr: system coretalk address
+ *
+ * Map @ct_addr into 64-bit CE bus space.  No device context is necessary
+ * and no CE mapping are consumed.
+ *
+ * Bits 53:0 come from the coretalk address.  The remaining bits are set as
+ * follows:
+ *
+ * 63    - must be 1 to indicate d64 mode to CE hardware
+ * 62    - barrier bit ... controlled with tioce_dma_barrier()
+ * 61    - 0 since this is not an MSI transaction
+ * 60:54 - reserved, MBZ
+ */
+static uint64_t
+tioce_dma_d64(unsigned long ct_addr)
+{
+	uint64_t bus_addr;
+
+	bus_addr = ct_addr | (1UL << 63);
+
+	return bus_addr;
+}
+
+/**
+ * pcidev_to_tioce - return misc ce related pointers given a pci_dev
+ * @pci_dev: pci device context
+ * @base: ptr to store struct tioce_mmr * for the CE holding this device
+ * @kernel: ptr to store struct tioce_kernel * for the CE holding this device
+ * @port: ptr to store the CE port number that this device is on
+ *
+ * Return pointers to various CE-related structures for the CE upstream of
+ * @pci_dev.
+ */
+static inline void
+pcidev_to_tioce(struct pci_dev *pdev, struct tioce **base,
+		struct tioce_kernel **kernel, int *port)
+{
+	struct pcidev_info *pcidev_info;
+	struct tioce_common *ce_common;
+	struct tioce_kernel *ce_kernel;
+
+	pcidev_info = SN_PCIDEV_INFO(pdev);
+	ce_common = (struct tioce_common *)pcidev_info->pdi_pcibus_info;
+	ce_kernel = (struct tioce_kernel *)ce_common->ce_kernel_private;
+
+	if (base)
+		*base = (struct tioce *)ce_common->ce_pcibus.bs_base;
+	if (kernel)
+		*kernel = ce_kernel;
+
+	/*
+	 * we use port as a zero-based value internally, even though the
+	 * documentation is 1-based.
+	 */
+	if (port)
+		*port =
+		    (pdev->bus->number < ce_kernel->ce_port1_secondary) ? 0 : 1;
+}
+
+/**
+ * tioce_alloc_map - Given a coretalk address, map it to pcie bus address
+ * space using one of the various ATE-based address modes.
+ * @ce_kern: tioce context
+ * @type: map mode to use
+ * @port: 0-based port that the requesting device is downstream of
+ * @ct_addr: the coretalk address to map
+ * @len: number of bytes to map
+ *
+ * Given the addressing type, set up various paramaters that define the
+ * ATE pool to use.  Search for a contiguous block of entries to cover the
+ * length, and if enough resources exist, fill in the ATE's and construct a
+ * tioce_dmamap struct to track the mapping.
+ */
+static uint64_t
+tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port,
+		uint64_t ct_addr, int len)
+{
+	int i;
+	int j;
+	int first;
+	int last;
+	int entries;
+	int nates;
+	int pagesize;
+	uint64_t *ate_shadow;
+	uint64_t *ate_reg;
+	uint64_t addr;
+	struct tioce *ce_mmr;
+	uint64_t bus_base;
+	struct tioce_dmamap *map;
+
+	ce_mmr = (struct tioce *)ce_kern->ce_common->ce_pcibus.bs_base;
+
+	switch (type) {
+	case TIOCE_ATE_M32:
+		/*
+		 * The first 64 entries of the ate3240 pool are dedicated to
+		 * super-page (TIOCE_ATE_M40S) mode.
+		 */
+		first = 64;
+		entries = TIOCE_NUM_M3240_ATES - 64;
+		ate_shadow = ce_kern->ce_ate3240_shadow;
+		ate_reg = ce_mmr->ce_ure_ate3240;
+		pagesize = ce_kern->ce_ate3240_pagesize;
+		bus_base = TIOCE_M32_MIN;
+		break;
+	case TIOCE_ATE_M40:
+		first = 0;
+		entries = TIOCE_NUM_M40_ATES;
+		ate_shadow = ce_kern->ce_ate40_shadow;
+		ate_reg = ce_mmr->ce_ure_ate40;
+		pagesize = MB(64);
+		bus_base = TIOCE_M40_MIN;
+		break;
+	case TIOCE_ATE_M40S:
+		/*
+		 * ate3240 entries 0-31 are dedicated to port1 super-page
+		 * mappings.  ate3240 entries 32-63 are dedicated to port2.
+		 */
+		first = port * 32;
+		entries = 32;
+		ate_shadow = ce_kern->ce_ate3240_shadow;
+		ate_reg = ce_mmr->ce_ure_ate3240;
+		pagesize = GB(16);
+		bus_base = TIOCE_M40S_MIN;
+		break;
+	default:
+		return 0;
+	}
+
+	nates = ATE_NPAGES(ct_addr, len, pagesize);
+	if (nates > entries)
+		return 0;
+
+	last = first + entries - nates;
+	for (i = first; i <= last; i++) {
+		if (ATE_VALID(ate_shadow[i]))
+			continue;
+
+		for (j = i; j < i + nates; j++)
+			if (ATE_VALID(ate_shadow[j]))
+				break;
+
+		if (j >= i + nates)
+			break;
+	}
+
+	if (i > last)
+		return 0;
+
+	map = kcalloc(1, sizeof(struct tioce_dmamap), GFP_ATOMIC);
+	if (!map)
+		return 0;
+
+	addr = ct_addr;
+	for (j = 0; j < nates; j++) {
+		uint64_t ate;
+
+		ate = ATE_MAKE(addr, pagesize);
+		ate_shadow[i + j] = ate;
+		ate_reg[i + j] = ate;
+		addr += pagesize;
+	}
+
+	map->refcnt = 1;
+	map->nbytes = nates * pagesize;
+	map->ct_start = ct_addr & ~ATE_PAGEMASK(pagesize);
+	map->pci_start = bus_base + (i * pagesize);
+	map->ate_hw = &ate_reg[i];
+	map->ate_shadow = &ate_shadow[i];
+	map->ate_count = nates;
+
+	list_add(&map->ce_dmamap_list, &ce_kern->ce_dmamap_list);
+
+	return (map->pci_start + (ct_addr - map->ct_start));
+}
+
+/**
+ * tioce_dma_d32 - create a DMA mapping using 32-bit direct mode
+ * @pdev: linux pci_dev representing the function
+ * @paddr: system physical address
+ *
+ * Map @paddr into 32-bit bus space of the CE associated with @pcidev_info.
+ */
+static uint64_t
+tioce_dma_d32(struct pci_dev *pdev, uint64_t ct_addr)
+{
+	int dma_ok;
+	int port;
+	struct tioce *ce_mmr;
+	struct tioce_kernel *ce_kern;
+	uint64_t ct_upper;
+	uint64_t ct_lower;
+	dma_addr_t bus_addr;
+
+	ct_upper = ct_addr & ~0x3fffffffUL;
+	ct_lower = ct_addr & 0x3fffffffUL;
+
+	pcidev_to_tioce(pdev, &ce_mmr, &ce_kern, &port);
+
+	if (ce_kern->ce_port[port].dirmap_refcnt == 0) {
+		volatile uint64_t tmp;
+
+		ce_kern->ce_port[port].dirmap_shadow = ct_upper;
+		ce_mmr->ce_ure_dir_map[port] = ct_upper;
+		tmp = ce_mmr->ce_ure_dir_map[port];
+		dma_ok = 1;
+	} else
+		dma_ok = (ce_kern->ce_port[port].dirmap_shadow == ct_upper);
+
+	if (dma_ok) {
+		ce_kern->ce_port[port].dirmap_refcnt++;
+		bus_addr = TIOCE_D32_MIN + ct_lower;
+	} else
+		bus_addr = 0;
+
+	return bus_addr;
+}
+
+/**
+ * tioce_dma_barrier - swizzle a TIOCE bus address to include or exclude
+ * the barrier bit.
+ * @bus_addr:  bus address to swizzle
+ *
+ * Given a TIOCE bus address, set the appropriate bit to indicate barrier
+ * attributes.
+ */
+static uint64_t
+tioce_dma_barrier(uint64_t bus_addr, int on)
+{
+	uint64_t barrier_bit;
+
+	/* barrier not supported in M40/M40S mode */
+	if (TIOCE_M40_ADDR(bus_addr) || TIOCE_M40S_ADDR(bus_addr))
+		return bus_addr;
+
+	if (TIOCE_D64_ADDR(bus_addr))
+		barrier_bit = (1UL << 62);
+	else			/* must be m32 or d32 */
+		barrier_bit = (1UL << 30);
+
+	return (on) ? (bus_addr | barrier_bit) : (bus_addr & ~barrier_bit);
+}
+
+/**
+ * tioce_dma_unmap - release CE mapping resources
+ * @pdev: linux pci_dev representing the function
+ * @bus_addr: bus address returned by an earlier tioce_dma_map
+ * @dir: mapping direction (unused)
+ *
+ * Locate mapping resources associated with @bus_addr and release them.
+ * For mappings created using the direct modes there are no resources
+ * to release.
+ */
+void
+tioce_dma_unmap(struct pci_dev *pdev, dma_addr_t bus_addr, int dir)
+{
+	int i;
+	int port;
+	struct tioce_kernel *ce_kern;
+	struct tioce *ce_mmr;
+	unsigned long flags;
+
+	bus_addr = tioce_dma_barrier(bus_addr, 0);
+	pcidev_to_tioce(pdev, &ce_mmr, &ce_kern, &port);
+
+	/* nothing to do for D64 */
+
+	if (TIOCE_D64_ADDR(bus_addr))
+		return;
+
+	spin_lock_irqsave(&ce_kern->ce_lock, flags);
+
+	if (TIOCE_D32_ADDR(bus_addr)) {
+		if (--ce_kern->ce_port[port].dirmap_refcnt == 0) {
+			ce_kern->ce_port[port].dirmap_shadow = 0;
+			ce_mmr->ce_ure_dir_map[port] = 0;
+		}
+	} else {
+		struct tioce_dmamap *map;
+
+		list_for_each_entry(map, &ce_kern->ce_dmamap_list,
+				    ce_dmamap_list) {
+			uint64_t last;
+
+			last = map->pci_start + map->nbytes - 1;
+			if (bus_addr >= map->pci_start && bus_addr <= last)
+				break;
+		}
+
+		if (&map->ce_dmamap_list == &ce_kern->ce_dmamap_list) {
+			printk(KERN_WARNING
+			       "%s:  %s - no map found for bus_addr 0x%lx\n",
+			       __FUNCTION__, pci_name(pdev), bus_addr);
+		} else if (--map->refcnt == 0) {
+			for (i = 0; i < map->ate_count; i++) {
+				map->ate_shadow[i] = 0;
+				map->ate_hw[i] = 0;
+			}
+
+			list_del(&map->ce_dmamap_list);
+			kfree(map);
+		}
+	}
+
+	spin_unlock_irqrestore(&ce_kern->ce_lock, flags);
+}
+
+/**
+ * tioce_do_dma_map - map pages for PCI DMA
+ * @pdev: linux pci_dev representing the function
+ * @paddr: host physical address to map
+ * @byte_count: bytes to map
+ *
+ * This is the main wrapper for mapping host physical pages to CE PCI space.
+ * The mapping mode used is based on the device's dma_mask.
+ */
+static uint64_t
+tioce_do_dma_map(struct pci_dev *pdev, uint64_t paddr, size_t byte_count,
+		 int barrier)
+{
+	unsigned long flags;
+	uint64_t ct_addr;
+	uint64_t mapaddr = 0;
+	struct tioce_kernel *ce_kern;
+	struct tioce_dmamap *map;
+	int port;
+	uint64_t dma_mask;
+
+	dma_mask = (barrier) ? pdev->dev.coherent_dma_mask : pdev->dma_mask;
+
+	/* cards must be able to address at least 31 bits */
+	if (dma_mask < 0x7fffffffUL)
+		return 0;
+
+	ct_addr = PHYS_TO_TIODMA(paddr);
+
+	/*
+	 * If the device can generate 64 bit addresses, create a D64 map.
+	 * Since this should never fail, bypass the rest of the checks.
+	 */
+	if (dma_mask == ~0UL) {
+		mapaddr = tioce_dma_d64(ct_addr);
+		goto dma_map_done;
+	}
+
+	pcidev_to_tioce(pdev, NULL, &ce_kern, &port);
+
+	spin_lock_irqsave(&ce_kern->ce_lock, flags);
+
+	/*
+	 * D64 didn't work ... See if we have an existing map that covers
+	 * this address range.  Must account for devices dma_mask here since
+	 * an existing map might have been done in a mode using more pci
+	 * address bits than this device can support.
+	 */
+	list_for_each_entry(map, &ce_kern->ce_dmamap_list, ce_dmamap_list) {
+		uint64_t last;
+
+		last = map->ct_start + map->nbytes - 1;
+		if (ct_addr >= map->ct_start &&
+		    ct_addr + byte_count - 1 <= last &&
+		    map->pci_start <= dma_mask) {
+			map->refcnt++;
+			mapaddr = map->pci_start + (ct_addr - map->ct_start);
+			break;
+		}
+	}
+
+	/*
+	 * If we don't have a map yet, and the card can generate 40
+	 * bit addresses, try the M40/M40S modes.  Note these modes do not
+	 * support a barrier bit, so if we need a consistent map these
+	 * won't work.
+	 */
+	if (!mapaddr && !barrier && dma_mask >= 0xffffffffffUL) {
+		/*
+		 * We have two options for 40-bit mappings:  16GB "super" ATE's
+		 * and 64MB "regular" ATE's.  We'll try both if needed for a
+		 * given mapping but which one we try first depends on the
+		 * size.  For requests >64MB, prefer to use a super page with
+		 * regular as the fallback. Otherwise, try in the reverse order.
+		 */
+
+		if (byte_count > MB(64)) {
+			mapaddr = tioce_alloc_map(ce_kern, TIOCE_ATE_M40S,
+						  port, ct_addr, byte_count);
+			if (!mapaddr)
+				mapaddr =
+				    tioce_alloc_map(ce_kern, TIOCE_ATE_M40, -1,
+						    ct_addr, byte_count);
+		} else {
+			mapaddr = tioce_alloc_map(ce_kern, TIOCE_ATE_M40, -1,
+						  ct_addr, byte_count);
+			if (!mapaddr)
+				mapaddr =
+				    tioce_alloc_map(ce_kern, TIOCE_ATE_M40S,
+						    port, ct_addr, byte_count);
+		}
+	}
+
+	/*
+	 * 32-bit direct is the next mode to try
+	 */
+	if (!mapaddr && dma_mask >= 0xffffffffUL)
+		mapaddr = tioce_dma_d32(pdev, ct_addr);
+
+	/*
+	 * Last resort, try 32-bit ATE-based map.
+	 */
+	if (!mapaddr)
+		mapaddr =
+		    tioce_alloc_map(ce_kern, TIOCE_ATE_M32, -1, ct_addr,
+				    byte_count);
+
+	spin_unlock_irqrestore(&ce_kern->ce_lock, flags);
+
+dma_map_done:
+	if (mapaddr & barrier)
+		mapaddr = tioce_dma_barrier(mapaddr, 1);
+
+	return mapaddr;
+}
+
+/**
+ * tioce_dma - standard pci dma map interface
+ * @pdev: pci device requesting the map
+ * @paddr: system physical address to map into pci space
+ * @byte_count: # bytes to map
+ *
+ * Simply call tioce_do_dma_map() to create a map with the barrier bit clear
+ * in the address.
+ */
+static uint64_t
+tioce_dma(struct pci_dev *pdev, uint64_t paddr, size_t byte_count)
+{
+	return tioce_do_dma_map(pdev, paddr, byte_count, 0);
+}
+
+/**
+ * tioce_dma_consistent - consistent pci dma map interface
+ * @pdev: pci device requesting the map
+ * @paddr: system physical address to map into pci space
+ * @byte_count: # bytes to map
+ *
+ * Simply call tioce_do_dma_map() to create a map with the barrier bit set
+ * in the address.
+ */ static uint64_t
+tioce_dma_consistent(struct pci_dev *pdev, uint64_t paddr, size_t byte_count)
+{
+	return tioce_do_dma_map(pdev, paddr, byte_count, 1);
+}
+
+/**
+ * tioce_error_intr_handler - SGI TIO CE error interrupt handler
+ * @irq: unused
+ * @arg: pointer to tioce_common struct for the given CE
+ * @pt: unused
+ *
+ * Handle a CE error interrupt.  Simply a wrapper around a SAL call which
+ * defers processing to the SGI prom.
+ */ static irqreturn_t
+tioce_error_intr_handler(int irq, void *arg, struct pt_regs *pt)
+{
+	struct tioce_common *soft = arg;
+	struct ia64_sal_retval ret_stuff;
+	ret_stuff.status = 0;
+	ret_stuff.v0 = 0;
+
+	SAL_CALL_NOLOCK(ret_stuff, (u64) SN_SAL_IOIF_ERROR_INTERRUPT,
+			soft->ce_pcibus.bs_persist_segment,
+			soft->ce_pcibus.bs_persist_busnum, 0, 0, 0, 0, 0);
+
+	return IRQ_HANDLED;
+}
+
+/**
+ * tioce_kern_init - init kernel structures related to a given TIOCE
+ * @tioce_common: ptr to a cached tioce_common struct that originated in prom
+ */ static struct tioce_kernel *
+tioce_kern_init(struct tioce_common *tioce_common)
+{
+	int i;
+	uint32_t tmp;
+	struct tioce *tioce_mmr;
+	struct tioce_kernel *tioce_kern;
+
+	tioce_kern = kcalloc(1, sizeof(struct tioce_kernel), GFP_KERNEL);
+	if (!tioce_kern) {
+		return NULL;
+	}
+
+	tioce_kern->ce_common = tioce_common;
+	spin_lock_init(&tioce_kern->ce_lock);
+	INIT_LIST_HEAD(&tioce_kern->ce_dmamap_list);
+	tioce_common->ce_kernel_private = (uint64_t) tioce_kern;
+
+	/*
+	 * Determine the secondary bus number of the port2 logical PPB.
+	 * This is used to decide whether a given pci device resides on
+	 * port1 or port2.  Note:  We don't have enough plumbing set up
+	 * here to use pci_read_config_xxx() so use the raw_pci_ops vector.
+	 */
+
+	raw_pci_ops->read(tioce_common->ce_pcibus.bs_persist_segment,
+			  tioce_common->ce_pcibus.bs_persist_busnum,
+			  PCI_DEVFN(2, 0), PCI_SECONDARY_BUS, 1, &tmp);
+	tioce_kern->ce_port1_secondary = (uint8_t) tmp;
+
+	/*
+	 * Set PMU pagesize to the largest size available, and zero out
+	 * the ate's.
+	 */
+
+	tioce_mmr = (struct tioce *)tioce_common->ce_pcibus.bs_base;
+	tioce_mmr->ce_ure_page_map &= ~CE_URE_PAGESIZE_MASK;
+	tioce_mmr->ce_ure_page_map |= CE_URE_256K_PAGESIZE;
+	tioce_kern->ce_ate3240_pagesize = KB(256);
+
+	for (i = 0; i < TIOCE_NUM_M40_ATES; i++) {
+		tioce_kern->ce_ate40_shadow[i] = 0;
+		tioce_mmr->ce_ure_ate40[i] = 0;
+	}
+
+	for (i = 0; i < TIOCE_NUM_M3240_ATES; i++) {
+		tioce_kern->ce_ate3240_shadow[i] = 0;
+		tioce_mmr->ce_ure_ate3240[i] = 0;
+	}
+
+	return tioce_kern;
+}
+
+/**
+ * tioce_force_interrupt - implement altix force_interrupt() backend for CE
+ * @sn_irq_info: sn asic irq that we need an interrupt generated for
+ *
+ * Given an sn_irq_info struct, set the proper bit in ce_adm_force_int to
+ * force a secondary interrupt to be generated.  This is to work around an
+ * asic issue where there is a small window of opportunity for a legacy device
+ * interrupt to be lost.
+ */
+static void
+tioce_force_interrupt(struct sn_irq_info *sn_irq_info)
+{
+	struct pcidev_info *pcidev_info;
+	struct tioce_common *ce_common;
+	struct tioce *ce_mmr;
+	uint64_t force_int_val;
+
+	if (!sn_irq_info->irq_bridge)
+		return;
+
+	if (sn_irq_info->irq_bridge_type != PCIIO_ASIC_TYPE_TIOCE)
+		return;
+
+	pcidev_info = (struct pcidev_info *)sn_irq_info->irq_pciioinfo;
+	if (!pcidev_info)
+		return;
+
+	ce_common = (struct tioce_common *)pcidev_info->pdi_pcibus_info;
+	ce_mmr = (struct tioce *)ce_common->ce_pcibus.bs_base;
+
+	/*
+	 * irq_int_bit is originally set up by prom, and holds the interrupt
+	 * bit shift (not mask) as defined by the bit definitions in the
+	 * ce_adm_int mmr.  These shifts are not the same for the
+	 * ce_adm_force_int register, so do an explicit mapping here to make
+	 * things clearer.
+	 */
+
+	switch (sn_irq_info->irq_int_bit) {
+	case CE_ADM_INT_PCIE_PORT1_DEV_A_SHFT:
+		force_int_val = 1UL << CE_ADM_FORCE_INT_PCIE_PORT1_DEV_A_SHFT;
+		break;
+	case CE_ADM_INT_PCIE_PORT1_DEV_B_SHFT:
+		force_int_val = 1UL << CE_ADM_FORCE_INT_PCIE_PORT1_DEV_B_SHFT;
+		break;
+	case CE_ADM_INT_PCIE_PORT1_DEV_C_SHFT:
+		force_int_val = 1UL << CE_ADM_FORCE_INT_PCIE_PORT1_DEV_C_SHFT;
+		break;
+	case CE_ADM_INT_PCIE_PORT1_DEV_D_SHFT:
+		force_int_val = 1UL << CE_ADM_FORCE_INT_PCIE_PORT1_DEV_D_SHFT;
+		break;
+	case CE_ADM_INT_PCIE_PORT2_DEV_A_SHFT:
+		force_int_val = 1UL << CE_ADM_FORCE_INT_PCIE_PORT2_DEV_A_SHFT;
+		break;
+	case CE_ADM_INT_PCIE_PORT2_DEV_B_SHFT:
+		force_int_val = 1UL << CE_ADM_FORCE_INT_PCIE_PORT2_DEV_B_SHFT;
+		break;
+	case CE_ADM_INT_PCIE_PORT2_DEV_C_SHFT:
+		force_int_val = 1UL << CE_ADM_FORCE_INT_PCIE_PORT2_DEV_C_SHFT;
+		break;
+	case CE_ADM_INT_PCIE_PORT2_DEV_D_SHFT:
+		force_int_val = 1UL << CE_ADM_FORCE_INT_PCIE_PORT2_DEV_D_SHFT;
+		break;
+	default:
+		return;
+	}
+	ce_mmr->ce_adm_force_int = force_int_val;
+}
+
+/**
+ * tioce_bus_fixup - perform final PCI fixup for a TIO CE bus
+ * @prom_bussoft: Common prom/kernel struct representing the bus
+ *
+ * Replicates the tioce_common pointed to by @prom_bussoft in kernel
+ * space.  Allocates and initializes a kernel-only area for a given CE,
+ * and sets up an irq for handling CE error interrupts.
+ *
+ * On successful setup, returns the kernel version of tioce_common back to
+ * the caller.
+ */
+static void *
+tioce_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *controller)
+{
+	struct tioce_common *tioce_common;
+
+	/*
+	 * Allocate kernel bus soft and copy from prom.
+	 */
+
+	tioce_common = kcalloc(1, sizeof(struct tioce_common), GFP_KERNEL);
+	if (!tioce_common)
+		return NULL;
+
+	memcpy(tioce_common, prom_bussoft, sizeof(struct tioce_common));
+	tioce_common->ce_pcibus.bs_base |= __IA64_UNCACHED_OFFSET;
+
+	if (tioce_kern_init(tioce_common) == NULL) {
+		kfree(tioce_common);
+		return NULL;
+	}
+
+	if (request_irq(SGI_PCIASIC_ERROR,
+			tioce_error_intr_handler,
+			SA_SHIRQ, "TIOCE error", (void *)tioce_common))
+		printk(KERN_WARNING
+		       "%s:  Unable to get irq %d.  "
+		       "Error interrupts won't be routed for "
+		       "TIOCE bus %04x:%02x\n",
+		       __FUNCTION__, SGI_PCIASIC_ERROR,
+		       tioce_common->ce_pcibus.bs_persist_segment,
+		       tioce_common->ce_pcibus.bs_persist_busnum);
+
+	return tioce_common;
+}
+
+static struct sn_pcibus_provider tioce_pci_interfaces = {
+	.dma_map = tioce_dma,
+	.dma_map_consistent = tioce_dma_consistent,
+	.dma_unmap = tioce_dma_unmap,
+	.bus_fixup = tioce_bus_fixup,
+	.force_interrupt = tioce_force_interrupt
+};
+
+/**
+ * tioce_init_provider - init SN PCI provider ops for TIO CE
+ */
+int
+tioce_init_provider(void)
+{
+	sn_pci_provider[PCIIO_ASIC_TYPE_TIOCE] = &tioce_pci_interfaces;
+	return 0;
+}
diff --git a/include/asm-ia64/sn/pcibus_provider_defs.h b/include/asm-ia64/sn/pcibus_provider_defs.h
index 5c3ba5708e82..5a92f5149a94 100644
--- a/include/asm-ia64/sn/pcibus_provider_defs.h
+++ b/include/asm-ia64/sn/pcibus_provider_defs.h
@@ -18,8 +18,9 @@
 #define PCIIO_ASIC_TYPE_PIC	2
 #define PCIIO_ASIC_TYPE_TIOCP	3
 #define PCIIO_ASIC_TYPE_TIOCA	4
+#define PCIIO_ASIC_TYPE_TIOCE	5
 
-#define PCIIO_ASIC_MAX_TYPES	5
+#define PCIIO_ASIC_MAX_TYPES	6
 
 /*
  * Common pciio bus provider data.  There should be one of these as the
diff --git a/include/asm-ia64/sn/tioce.h b/include/asm-ia64/sn/tioce.h
new file mode 100644
index 000000000000..22879853e46c
--- /dev/null
+++ b/include/asm-ia64/sn/tioce.h
@@ -0,0 +1,740 @@
+/**************************************************************************
+ *                                                                        *
+ *  Unpublished copyright (c) 2005, Silicon Graphics, Inc.                *
+ *  THIS IS UNPUBLISHED CONFIDENTIAL AND PROPRIETARY SOURCE CODE OF SGI.  *
+ *                                                                        *
+ *  The copyright notice above does  not evidence any actual or intended  *
+ *  publication  or  disclosure  of  this source  code,  which  includes  *
+ *  information that is confidential  and/or proprietary, and is a trade  *
+ *  secret, of  Silicon Graphics, Inc.   ANY REPRODUCTION, MODIFICATION,  *
+ *  DISTRIBUTION, PUBLIC  PERFORMANCE, OR  PUBLIC DISPLAY OF  OR THROUGH  *
+ *  USE  OF THIS  SOURCE CODE  WITHOUT  THE EXPRESS  WRITTEN CONSENT  OF  *
+ *  SILICON GRAPHICS, INC.  IS  STRICTLY PROHIBITED, AND IN VIOLATION OF  *
+ *  APPLICABLE  LAWS   AND  INTERNATIONAL  TREATIES.    THE  RECEIPT  OR  *
+ *  POSSESSION OF  THIS SOURCE CODE AND/OR RELATED  INFORMATION DOES NOT  *
+ *  CONVEY OR IMPLY ANY RIGHTS  TO REPRODUCE, DISCLOSE OR DISTRIBUTE ITS  *
+ *  CONTENTS,  OR TO  MANUFACTURE, USE,  OR  SELL ANYTHING  THAT IT  MAY  *
+ *  DESCRIBE, IN WHOLE OR IN PART.                                        *
+ *                                                                        *
+ **************************************************************************/
+
+#ifndef __ASM_IA64_SN_TIOCE_H__
+#define __ASM_IA64_SN_TIOCE_H__
+
+/* CE ASIC part & mfgr information  */
+#define TIOCE_PART_NUM			0xCE00
+#define TIOCE_MFGR_NUM			0x36
+#define TIOCE_REV_A			0x1
+
+/* CE Virtual PPB Vendor/Device IDs */
+#define CE_VIRT_PPB_VENDOR_ID		0x10a9
+#define CE_VIRT_PPB_DEVICE_ID		0x4002
+
+/* CE Host Bridge Vendor/Device IDs */
+#define CE_HOST_BRIDGE_VENDOR_ID	0x10a9
+#define CE_HOST_BRIDGE_DEVICE_ID	0x4003
+
+
+#define TIOCE_NUM_M40_ATES		4096
+#define TIOCE_NUM_M3240_ATES		2048
+#define TIOCE_NUM_PORTS			2
+
+/*
+ * Register layout for TIOCE.  MMR offsets are shown at the far right of the
+ * structure definition.
+ */
+typedef volatile struct tioce {
+	/*
+	 * ADMIN : Administration Registers
+	 */
+	uint64_t	ce_adm_id;				/* 0x000000 */
+	uint64_t	ce_pad_000008;				/* 0x000008 */
+	uint64_t	ce_adm_dyn_credit_status;		/* 0x000010 */
+	uint64_t	ce_adm_last_credit_status;		/* 0x000018 */
+	uint64_t	ce_adm_credit_limit;			/* 0x000020 */
+	uint64_t	ce_adm_force_credit;			/* 0x000028 */
+	uint64_t	ce_adm_control;				/* 0x000030 */
+	uint64_t	ce_adm_mmr_chn_timeout;			/* 0x000038 */
+	uint64_t	ce_adm_ssp_ure_timeout;			/* 0x000040 */
+	uint64_t	ce_adm_ssp_dre_timeout;			/* 0x000048 */
+	uint64_t	ce_adm_ssp_debug_sel;			/* 0x000050 */
+	uint64_t	ce_adm_int_status;			/* 0x000058 */
+	uint64_t	ce_adm_int_status_alias;		/* 0x000060 */
+	uint64_t	ce_adm_int_mask;			/* 0x000068 */
+	uint64_t	ce_adm_int_pending;			/* 0x000070 */
+	uint64_t	ce_adm_force_int;			/* 0x000078 */
+	uint64_t	ce_adm_ure_ups_buf_barrier_flush;	/* 0x000080 */
+	uint64_t	ce_adm_int_dest[15];	    /* 0x000088 -- 0x0000F8 */
+	uint64_t	ce_adm_error_summary;			/* 0x000100 */
+	uint64_t	ce_adm_error_summary_alias;		/* 0x000108 */
+	uint64_t	ce_adm_error_mask;			/* 0x000110 */
+	uint64_t	ce_adm_first_error;			/* 0x000118 */
+	uint64_t	ce_adm_error_overflow;			/* 0x000120 */
+	uint64_t	ce_adm_error_overflow_alias;		/* 0x000128 */
+	uint64_t	ce_pad_000130[2];	    /* 0x000130 -- 0x000138 */
+	uint64_t	ce_adm_tnum_error;			/* 0x000140 */
+	uint64_t	ce_adm_mmr_err_detail;			/* 0x000148 */
+	uint64_t	ce_adm_msg_sram_perr_detail;		/* 0x000150 */
+	uint64_t	ce_adm_bap_sram_perr_detail;		/* 0x000158 */
+	uint64_t	ce_adm_ce_sram_perr_detail;		/* 0x000160 */
+	uint64_t	ce_adm_ce_credit_oflow_detail;		/* 0x000168 */
+	uint64_t	ce_adm_tx_link_idle_max_timer;		/* 0x000170 */
+	uint64_t	ce_adm_pcie_debug_sel;			/* 0x000178 */
+	uint64_t	ce_pad_000180[16];	    /* 0x000180 -- 0x0001F8 */
+
+	uint64_t	ce_adm_pcie_debug_sel_top;		/* 0x000200 */
+	uint64_t	ce_adm_pcie_debug_lat_sel_lo_top;	/* 0x000208 */
+	uint64_t	ce_adm_pcie_debug_lat_sel_hi_top;	/* 0x000210 */
+	uint64_t	ce_adm_pcie_debug_trig_sel_top;		/* 0x000218 */
+	uint64_t	ce_adm_pcie_debug_trig_lat_sel_lo_top;	/* 0x000220 */
+	uint64_t	ce_adm_pcie_debug_trig_lat_sel_hi_top;	/* 0x000228 */
+	uint64_t	ce_adm_pcie_trig_compare_top;		/* 0x000230 */
+	uint64_t	ce_adm_pcie_trig_compare_en_top;	/* 0x000238 */
+	uint64_t	ce_adm_ssp_debug_sel_top;		/* 0x000240 */
+	uint64_t	ce_adm_ssp_debug_lat_sel_lo_top;	/* 0x000248 */
+	uint64_t	ce_adm_ssp_debug_lat_sel_hi_top;	/* 0x000250 */
+	uint64_t	ce_adm_ssp_debug_trig_sel_top;		/* 0x000258 */
+	uint64_t	ce_adm_ssp_debug_trig_lat_sel_lo_top;	/* 0x000260 */
+	uint64_t	ce_adm_ssp_debug_trig_lat_sel_hi_top;	/* 0x000268 */
+	uint64_t	ce_adm_ssp_trig_compare_top;		/* 0x000270 */
+	uint64_t	ce_adm_ssp_trig_compare_en_top;		/* 0x000278 */
+	uint64_t	ce_pad_000280[48];	    /* 0x000280 -- 0x0003F8 */
+
+	uint64_t	ce_adm_bap_ctrl;			/* 0x000400 */
+	uint64_t	ce_pad_000408[127];	    /* 0x000408 -- 0x0007F8 */
+
+	uint64_t	ce_msg_buf_data63_0[35];    /* 0x000800 -- 0x000918 */
+	uint64_t	ce_pad_000920[29];	    /* 0x000920 -- 0x0009F8 */
+
+	uint64_t	ce_msg_buf_data127_64[35];  /* 0x000A00 -- 0x000B18 */
+	uint64_t	ce_pad_000B20[29];	    /* 0x000B20 -- 0x000BF8 */
+
+	uint64_t	ce_msg_buf_parity[35];	    /* 0x000C00 -- 0x000D18 */
+	uint64_t	ce_pad_000D20[29];	    /* 0x000D20 -- 0x000DF8 */
+
+	uint64_t	ce_pad_000E00[576];	    /* 0x000E00 -- 0x001FF8 */
+
+	/*
+	 * LSI : LSI's PCI Express Link Registers (Link#1 and Link#2)
+	 * Link#1 MMRs at start at 0x002000, Link#2 MMRs at 0x003000
+	 * NOTE: the comment offsets at far right: let 'z' = {2 or 3}
+	 */
+	#define ce_lsi(link_num)	ce_lsi[link_num-1]
+	struct ce_lsi_reg {
+		uint64_t	ce_lsi_lpu_id;			/* 0x00z000 */
+		uint64_t	ce_lsi_rst;			/* 0x00z008 */
+		uint64_t	ce_lsi_dbg_stat;		/* 0x00z010 */
+		uint64_t	ce_lsi_dbg_cfg;			/* 0x00z018 */
+		uint64_t	ce_lsi_ltssm_ctrl;		/* 0x00z020 */
+		uint64_t	ce_lsi_lk_stat;			/* 0x00z028 */
+		uint64_t	ce_pad_00z030[2];   /* 0x00z030 -- 0x00z038 */
+		uint64_t	ce_lsi_int_and_stat;		/* 0x00z040 */
+		uint64_t	ce_lsi_int_mask;		/* 0x00z048 */
+		uint64_t	ce_pad_00z050[22];  /* 0x00z050 -- 0x00z0F8 */
+		uint64_t	ce_lsi_lk_perf_cnt_sel;		/* 0x00z100 */
+		uint64_t	ce_pad_00z108;			/* 0x00z108 */
+		uint64_t	ce_lsi_lk_perf_cnt_ctrl;	/* 0x00z110 */
+		uint64_t	ce_pad_00z118;			/* 0x00z118 */
+		uint64_t	ce_lsi_lk_perf_cnt1;		/* 0x00z120 */
+		uint64_t	ce_lsi_lk_perf_cnt1_test;	/* 0x00z128 */
+		uint64_t	ce_lsi_lk_perf_cnt2;		/* 0x00z130 */
+		uint64_t	ce_lsi_lk_perf_cnt2_test;	/* 0x00z138 */
+		uint64_t	ce_pad_00z140[24];  /* 0x00z140 -- 0x00z1F8 */
+		uint64_t	ce_lsi_lk_lyr_cfg;		/* 0x00z200 */
+		uint64_t	ce_lsi_lk_lyr_status;		/* 0x00z208 */
+		uint64_t	ce_lsi_lk_lyr_int_stat;		/* 0x00z210 */
+		uint64_t	ce_lsi_lk_ly_int_stat_test;	/* 0x00z218 */
+		uint64_t	ce_lsi_lk_ly_int_stat_mask;	/* 0x00z220 */
+		uint64_t	ce_pad_00z228[3];   /* 0x00z228 -- 0x00z238 */
+		uint64_t	ce_lsi_fc_upd_ctl;		/* 0x00z240 */
+		uint64_t	ce_pad_00z248[3];   /* 0x00z248 -- 0x00z258 */
+		uint64_t	ce_lsi_flw_ctl_upd_to_timer;	/* 0x00z260 */
+		uint64_t	ce_lsi_flw_ctl_upd_timer0;	/* 0x00z268 */
+		uint64_t	ce_lsi_flw_ctl_upd_timer1;	/* 0x00z270 */
+		uint64_t	ce_pad_00z278[49];  /* 0x00z278 -- 0x00z3F8 */
+		uint64_t	ce_lsi_freq_nak_lat_thrsh;	/* 0x00z400 */
+		uint64_t	ce_lsi_ack_nak_lat_tmr;		/* 0x00z408 */
+		uint64_t	ce_lsi_rply_tmr_thr;		/* 0x00z410 */
+		uint64_t	ce_lsi_rply_tmr;		/* 0x00z418 */
+		uint64_t	ce_lsi_rply_num_stat;		/* 0x00z420 */
+		uint64_t	ce_lsi_rty_buf_max_addr;	/* 0x00z428 */
+		uint64_t	ce_lsi_rty_fifo_ptr;		/* 0x00z430 */
+		uint64_t	ce_lsi_rty_fifo_rd_wr_ptr;	/* 0x00z438 */
+		uint64_t	ce_lsi_rty_fifo_cred;		/* 0x00z440 */
+		uint64_t	ce_lsi_seq_cnt;			/* 0x00z448 */
+		uint64_t	ce_lsi_ack_sent_seq_num;	/* 0x00z450 */
+		uint64_t	ce_lsi_seq_cnt_fifo_max_addr;	/* 0x00z458 */
+		uint64_t	ce_lsi_seq_cnt_fifo_ptr;	/* 0x00z460 */
+		uint64_t	ce_lsi_seq_cnt_rd_wr_ptr;	/* 0x00z468 */
+		uint64_t	ce_lsi_tx_lk_ts_ctl;		/* 0x00z470 */
+		uint64_t	ce_pad_00z478;			/* 0x00z478 */
+		uint64_t	ce_lsi_mem_addr_ctl;		/* 0x00z480 */
+		uint64_t	ce_lsi_mem_d_ld0;		/* 0x00z488 */
+		uint64_t	ce_lsi_mem_d_ld1;		/* 0x00z490 */
+		uint64_t	ce_lsi_mem_d_ld2;		/* 0x00z498 */
+		uint64_t	ce_lsi_mem_d_ld3;		/* 0x00z4A0 */
+		uint64_t	ce_lsi_mem_d_ld4;		/* 0x00z4A8 */
+		uint64_t	ce_pad_00z4B0[2];   /* 0x00z4B0 -- 0x00z4B8 */
+		uint64_t	ce_lsi_rty_d_cnt;		/* 0x00z4C0 */
+		uint64_t	ce_lsi_seq_buf_cnt;		/* 0x00z4C8 */
+		uint64_t	ce_lsi_seq_buf_bt_d;		/* 0x00z4D0 */
+		uint64_t	ce_pad_00z4D8;			/* 0x00z4D8 */
+		uint64_t	ce_lsi_ack_lat_thr;		/* 0x00z4E0 */
+		uint64_t	ce_pad_00z4E8[3];   /* 0x00z4E8 -- 0x00z4F8 */
+		uint64_t	ce_lsi_nxt_rcv_seq_1_cntr;	/* 0x00z500 */
+		uint64_t	ce_lsi_unsp_dllp_rcvd;		/* 0x00z508 */
+		uint64_t	ce_lsi_rcv_lk_ts_ctl;		/* 0x00z510 */
+		uint64_t	ce_pad_00z518[29];  /* 0x00z518 -- 0x00z5F8 */
+		uint64_t	ce_lsi_phy_lyr_cfg;		/* 0x00z600 */
+		uint64_t	ce_pad_00z608;			/* 0x00z608 */
+		uint64_t	ce_lsi_phy_lyr_int_stat;	/* 0x00z610 */
+		uint64_t	ce_lsi_phy_lyr_int_stat_test;	/* 0x00z618 */
+		uint64_t	ce_lsi_phy_lyr_int_mask;	/* 0x00z620 */
+		uint64_t	ce_pad_00z628[11];  /* 0x00z628 -- 0x00z678 */
+		uint64_t	ce_lsi_rcv_phy_cfg;		/* 0x00z680 */
+		uint64_t	ce_lsi_rcv_phy_stat1;		/* 0x00z688 */
+		uint64_t	ce_lsi_rcv_phy_stat2;		/* 0x00z690 */
+		uint64_t	ce_lsi_rcv_phy_stat3;		/* 0x00z698 */
+		uint64_t	ce_lsi_rcv_phy_int_stat;	/* 0x00z6A0 */
+		uint64_t	ce_lsi_rcv_phy_int_stat_test;	/* 0x00z6A8 */
+		uint64_t	ce_lsi_rcv_phy_int_mask;	/* 0x00z6B0 */
+		uint64_t	ce_pad_00z6B8[9];   /* 0x00z6B8 -- 0x00z6F8 */
+		uint64_t	ce_lsi_tx_phy_cfg;		/* 0x00z700 */
+		uint64_t	ce_lsi_tx_phy_stat;		/* 0x00z708 */
+		uint64_t	ce_lsi_tx_phy_int_stat;		/* 0x00z710 */
+		uint64_t	ce_lsi_tx_phy_int_stat_test;	/* 0x00z718 */
+		uint64_t	ce_lsi_tx_phy_int_mask;		/* 0x00z720 */
+		uint64_t	ce_lsi_tx_phy_stat2;		/* 0x00z728 */
+		uint64_t	ce_pad_00z730[10];  /* 0x00z730 -- 0x00z77F */
+		uint64_t	ce_lsi_ltssm_cfg1;		/* 0x00z780 */
+		uint64_t	ce_lsi_ltssm_cfg2;		/* 0x00z788 */
+		uint64_t	ce_lsi_ltssm_cfg3;		/* 0x00z790 */
+		uint64_t	ce_lsi_ltssm_cfg4;		/* 0x00z798 */
+		uint64_t	ce_lsi_ltssm_cfg5;		/* 0x00z7A0 */
+		uint64_t	ce_lsi_ltssm_stat1;		/* 0x00z7A8 */
+		uint64_t	ce_lsi_ltssm_stat2;		/* 0x00z7B0 */
+		uint64_t	ce_lsi_ltssm_int_stat;		/* 0x00z7B8 */
+		uint64_t	ce_lsi_ltssm_int_stat_test;	/* 0x00z7C0 */
+		uint64_t	ce_lsi_ltssm_int_mask;		/* 0x00z7C8 */
+		uint64_t	ce_lsi_ltssm_stat_wr_en;	/* 0x00z7D0 */
+		uint64_t	ce_pad_00z7D8[5];   /* 0x00z7D8 -- 0x00z7F8 */
+		uint64_t	ce_lsi_gb_cfg1;			/* 0x00z800 */
+		uint64_t	ce_lsi_gb_cfg2;			/* 0x00z808 */
+		uint64_t	ce_lsi_gb_cfg3;			/* 0x00z810 */
+		uint64_t	ce_lsi_gb_cfg4;			/* 0x00z818 */
+		uint64_t	ce_lsi_gb_stat;			/* 0x00z820 */
+		uint64_t	ce_lsi_gb_int_stat;		/* 0x00z828 */
+		uint64_t	ce_lsi_gb_int_stat_test;	/* 0x00z830 */
+		uint64_t	ce_lsi_gb_int_mask;		/* 0x00z838 */
+		uint64_t	ce_lsi_gb_pwr_dn1;		/* 0x00z840 */
+		uint64_t	ce_lsi_gb_pwr_dn2;		/* 0x00z848 */
+		uint64_t	ce_pad_00z850[246]; /* 0x00z850 -- 0x00zFF8 */
+	} ce_lsi[2];
+
+	uint64_t	ce_pad_004000[10];	    /* 0x004000 -- 0x004048 */
+
+	/*
+	 * CRM: Coretalk Receive Module Registers
+	 */
+	uint64_t	ce_crm_debug_mux;			/* 0x004050 */
+	uint64_t	ce_pad_004058;				/* 0x004058 */
+	uint64_t	ce_crm_ssp_err_cmd_wrd;			/* 0x004060 */
+	uint64_t	ce_crm_ssp_err_addr;			/* 0x004068 */
+	uint64_t	ce_crm_ssp_err_syn;			/* 0x004070 */
+
+	uint64_t	ce_pad_004078[499];	    /* 0x004078 -- 0x005008 */
+
+	/*
+         * CXM: Coretalk Xmit Module Registers
+         */
+	uint64_t	ce_cxm_dyn_credit_status;		/* 0x005010 */
+	uint64_t	ce_cxm_last_credit_status;		/* 0x005018 */
+	uint64_t	ce_cxm_credit_limit;			/* 0x005020 */
+	uint64_t	ce_cxm_force_credit;			/* 0x005028 */
+	uint64_t	ce_cxm_disable_bypass;			/* 0x005030 */
+	uint64_t	ce_pad_005038[3];	    /* 0x005038 -- 0x005048 */
+	uint64_t	ce_cxm_debug_mux;			/* 0x005050 */
+
+        uint64_t        ce_pad_005058[501];         /* 0x005058 -- 0x005FF8 */
+
+	/*
+	 * DTL: Downstream Transaction Layer Regs (Link#1 and Link#2)
+	 * DTL: Link#1 MMRs at start at 0x006000, Link#2 MMRs at 0x008000
+	 * DTL: the comment offsets at far right: let 'y' = {6 or 8}
+	 *
+	 * UTL: Downstream Transaction Layer Regs (Link#1 and Link#2)
+	 * UTL: Link#1 MMRs at start at 0x007000, Link#2 MMRs at 0x009000
+	 * UTL: the comment offsets at far right: let 'z' = {7 or 9}
+	 */
+	#define ce_dtl(link_num)	ce_dtl_utl[link_num-1]
+	#define ce_utl(link_num)	ce_dtl_utl[link_num-1]
+	struct ce_dtl_utl_reg {
+		/* DTL */
+		uint64_t	ce_dtl_dtdr_credit_limit;	/* 0x00y000 */
+		uint64_t	ce_dtl_dtdr_credit_force;	/* 0x00y008 */
+		uint64_t	ce_dtl_dyn_credit_status;	/* 0x00y010 */
+		uint64_t	ce_dtl_dtl_last_credit_stat;	/* 0x00y018 */
+		uint64_t	ce_dtl_dtl_ctrl;		/* 0x00y020 */
+		uint64_t	ce_pad_00y028[5];   /* 0x00y028 -- 0x00y048 */
+		uint64_t	ce_dtl_debug_sel;		/* 0x00y050 */
+		uint64_t	ce_pad_00y058[501]; /* 0x00y058 -- 0x00yFF8 */
+
+		/* UTL */
+		uint64_t	ce_utl_utl_ctrl;		/* 0x00z000 */
+		uint64_t	ce_utl_debug_sel;		/* 0x00z008 */
+		uint64_t	ce_pad_00z010[510]; /* 0x00z010 -- 0x00zFF8 */
+	} ce_dtl_utl[2];
+
+	uint64_t	ce_pad_00A000[514];	    /* 0x00A000 -- 0x00B008 */
+
+	/*
+	 * URE: Upstream Request Engine
+         */
+	uint64_t	ce_ure_dyn_credit_status;		/* 0x00B010 */
+	uint64_t	ce_ure_last_credit_status;		/* 0x00B018 */
+	uint64_t	ce_ure_credit_limit;			/* 0x00B020 */
+	uint64_t	ce_pad_00B028;				/* 0x00B028 */
+	uint64_t	ce_ure_control;				/* 0x00B030 */
+	uint64_t	ce_ure_status;				/* 0x00B038 */
+	uint64_t	ce_pad_00B040[2];	    /* 0x00B040 -- 0x00B048 */
+	uint64_t	ce_ure_debug_sel;			/* 0x00B050 */
+	uint64_t	ce_ure_pcie_debug_sel;			/* 0x00B058 */
+	uint64_t	ce_ure_ssp_err_cmd_wrd;			/* 0x00B060 */
+	uint64_t	ce_ure_ssp_err_addr;			/* 0x00B068 */
+	uint64_t	ce_ure_page_map;			/* 0x00B070 */
+	uint64_t	ce_ure_dir_map[TIOCE_NUM_PORTS];	/* 0x00B078 */
+	uint64_t	ce_ure_pipe_sel1;			/* 0x00B088 */
+	uint64_t	ce_ure_pipe_mask1;			/* 0x00B090 */
+	uint64_t	ce_ure_pipe_sel2;			/* 0x00B098 */
+	uint64_t	ce_ure_pipe_mask2;			/* 0x00B0A0 */
+	uint64_t	ce_ure_pcie1_credits_sent;		/* 0x00B0A8 */
+	uint64_t	ce_ure_pcie1_credits_used;		/* 0x00B0B0 */
+	uint64_t	ce_ure_pcie1_credit_limit;		/* 0x00B0B8 */
+	uint64_t	ce_ure_pcie2_credits_sent;		/* 0x00B0C0 */
+	uint64_t	ce_ure_pcie2_credits_used;		/* 0x00B0C8 */
+	uint64_t	ce_ure_pcie2_credit_limit;		/* 0x00B0D0 */
+	uint64_t	ce_ure_pcie_force_credit;		/* 0x00B0D8 */
+	uint64_t	ce_ure_rd_tnum_val;			/* 0x00B0E0 */
+	uint64_t	ce_ure_rd_tnum_rsp_rcvd;		/* 0x00B0E8 */
+	uint64_t	ce_ure_rd_tnum_esent_timer;		/* 0x00B0F0 */
+	uint64_t	ce_ure_rd_tnum_error;			/* 0x00B0F8 */
+	uint64_t	ce_ure_rd_tnum_first_cl;		/* 0x00B100 */
+	uint64_t	ce_ure_rd_tnum_link_buf;		/* 0x00B108 */
+	uint64_t	ce_ure_wr_tnum_val;			/* 0x00B110 */
+	uint64_t	ce_ure_sram_err_addr0;			/* 0x00B118 */
+	uint64_t	ce_ure_sram_err_addr1;			/* 0x00B120 */
+	uint64_t	ce_ure_sram_err_addr2;			/* 0x00B128 */
+	uint64_t	ce_ure_sram_rd_addr0;			/* 0x00B130 */
+	uint64_t	ce_ure_sram_rd_addr1;			/* 0x00B138 */
+	uint64_t	ce_ure_sram_rd_addr2;			/* 0x00B140 */
+	uint64_t	ce_ure_sram_wr_addr0;			/* 0x00B148 */
+	uint64_t	ce_ure_sram_wr_addr1;			/* 0x00B150 */
+	uint64_t	ce_ure_sram_wr_addr2;			/* 0x00B158 */
+	uint64_t	ce_ure_buf_flush10;			/* 0x00B160 */
+	uint64_t	ce_ure_buf_flush11;			/* 0x00B168 */
+	uint64_t	ce_ure_buf_flush12;			/* 0x00B170 */
+	uint64_t	ce_ure_buf_flush13;			/* 0x00B178 */
+	uint64_t	ce_ure_buf_flush20;			/* 0x00B180 */
+	uint64_t	ce_ure_buf_flush21;			/* 0x00B188 */
+	uint64_t	ce_ure_buf_flush22;			/* 0x00B190 */
+	uint64_t	ce_ure_buf_flush23;			/* 0x00B198 */
+	uint64_t	ce_ure_pcie_control1;			/* 0x00B1A0 */
+	uint64_t	ce_ure_pcie_control2;			/* 0x00B1A8 */
+
+	uint64_t	ce_pad_00B1B0[458];	    /* 0x00B1B0 -- 0x00BFF8 */
+
+	/* Upstream Data Buffer, Port1 */
+	struct ce_ure_maint_ups_dat1_data {
+		uint64_t	data63_0[512];	    /* 0x00C000 -- 0x00CFF8 */
+		uint64_t	data127_64[512];    /* 0x00D000 -- 0x00DFF8 */
+		uint64_t	parity[512];	    /* 0x00E000 -- 0x00EFF8 */
+	} ce_ure_maint_ups_dat1;
+
+	/* Upstream Header Buffer, Port1 */
+	struct ce_ure_maint_ups_hdr1_data {
+		uint64_t	data63_0[512];	    /* 0x00F000 -- 0x00FFF8 */
+		uint64_t	data127_64[512];    /* 0x010000 -- 0x010FF8 */
+		uint64_t	parity[512];	    /* 0x011000 -- 0x011FF8 */
+	} ce_ure_maint_ups_hdr1;
+
+	/* Upstream Data Buffer, Port2 */
+	struct ce_ure_maint_ups_dat2_data {
+		uint64_t	data63_0[512];	    /* 0x012000 -- 0x012FF8 */
+		uint64_t	data127_64[512];    /* 0x013000 -- 0x013FF8 */
+		uint64_t	parity[512];	    /* 0x014000 -- 0x014FF8 */
+	} ce_ure_maint_ups_dat2;
+
+	/* Upstream Header Buffer, Port2 */
+	struct ce_ure_maint_ups_hdr2_data {
+		uint64_t	data63_0[512];	    /* 0x015000 -- 0x015FF8 */
+		uint64_t	data127_64[512];    /* 0x016000 -- 0x016FF8 */
+		uint64_t	parity[512];	    /* 0x017000 -- 0x017FF8 */
+	} ce_ure_maint_ups_hdr2;
+
+	/* Downstream Data Buffer */
+	struct ce_ure_maint_dns_dat_data {
+		uint64_t	data63_0[512];	    /* 0x018000 -- 0x018FF8 */
+		uint64_t	data127_64[512];    /* 0x019000 -- 0x019FF8 */
+		uint64_t	parity[512];	    /* 0x01A000 -- 0x01AFF8 */
+	} ce_ure_maint_dns_dat;
+
+	/* Downstream Header Buffer */
+	struct	ce_ure_maint_dns_hdr_data {
+		uint64_t	data31_0[64];	    /* 0x01B000 -- 0x01B1F8 */
+		uint64_t	data95_32[64];	    /* 0x01B200 -- 0x01B3F8 */
+		uint64_t	parity[64];	    /* 0x01B400 -- 0x01B5F8 */
+	} ce_ure_maint_dns_hdr;
+
+	/* RCI Buffer Data */
+	struct	ce_ure_maint_rci_data {
+		uint64_t	data41_0[64];	    /* 0x01B600 -- 0x01B7F8 */
+		uint64_t	data69_42[64];	    /* 0x01B800 -- 0x01B9F8 */
+	} ce_ure_maint_rci;
+
+	/* Response Queue */
+	uint64_t	ce_ure_maint_rspq[64];	    /* 0x01BA00 -- 0x01BBF8 */
+
+	uint64_t	ce_pad_01C000[4224];	    /* 0x01BC00 -- 0x023FF8 */
+
+	/* Admin Build-a-Packet Buffer */
+	struct	ce_adm_maint_bap_buf_data {
+		uint64_t	data63_0[258];	    /* 0x024000 -- 0x024808 */
+		uint64_t	data127_64[258];    /* 0x024810 -- 0x025018 */
+		uint64_t	parity[258];	    /* 0x025020 -- 0x025828 */
+	} ce_adm_maint_bap_buf;
+
+	uint64_t	ce_pad_025830[5370];	    /* 0x025830 -- 0x02FFF8 */
+
+	/* URE: 40bit PMU ATE Buffer */		    /* 0x030000 -- 0x037FF8 */
+	uint64_t	ce_ure_ate40[TIOCE_NUM_M40_ATES];
+
+	/* URE: 32/40bit PMU ATE Buffer */	    /* 0x038000 -- 0x03BFF8 */
+	uint64_t	ce_ure_ate3240[TIOCE_NUM_M3240_ATES];
+
+	uint64_t	ce_pad_03C000[2050];	    /* 0x03C000 -- 0x040008 */
+
+	/*
+	 * DRE: Down Stream Request Engine
+         */
+	uint64_t	ce_dre_dyn_credit_status1;		/* 0x040010 */
+	uint64_t	ce_dre_dyn_credit_status2;		/* 0x040018 */
+	uint64_t	ce_dre_last_credit_status1;		/* 0x040020 */
+	uint64_t	ce_dre_last_credit_status2;		/* 0x040028 */
+	uint64_t	ce_dre_credit_limit1;			/* 0x040030 */
+	uint64_t	ce_dre_credit_limit2;			/* 0x040038 */
+	uint64_t	ce_dre_force_credit1;			/* 0x040040 */
+	uint64_t	ce_dre_force_credit2;			/* 0x040048 */
+	uint64_t	ce_dre_debug_mux1;			/* 0x040050 */
+	uint64_t	ce_dre_debug_mux2;			/* 0x040058 */
+	uint64_t	ce_dre_ssp_err_cmd_wrd;			/* 0x040060 */
+	uint64_t	ce_dre_ssp_err_addr;			/* 0x040068 */
+	uint64_t	ce_dre_comp_err_cmd_wrd;		/* 0x040070 */
+	uint64_t	ce_dre_comp_err_addr;			/* 0x040078 */
+	uint64_t	ce_dre_req_status;			/* 0x040080 */
+	uint64_t	ce_dre_config1;				/* 0x040088 */
+	uint64_t	ce_dre_config2;				/* 0x040090 */
+	uint64_t	ce_dre_config_req_status;		/* 0x040098 */
+	uint64_t	ce_pad_0400A0[12];	    /* 0x0400A0 -- 0x0400F8 */
+	uint64_t	ce_dre_dyn_fifo;			/* 0x040100 */
+	uint64_t	ce_pad_040108[3];	    /* 0x040108 -- 0x040118 */
+	uint64_t	ce_dre_last_fifo;			/* 0x040120 */
+
+	uint64_t	ce_pad_040128[27];	    /* 0x040128 -- 0x0401F8 */
+
+	/* DRE Downstream Head Queue */
+	struct	ce_dre_maint_ds_head_queue {
+		uint64_t	data63_0[32];	    /* 0x040200 -- 0x0402F8 */
+		uint64_t	data127_64[32];	    /* 0x040300 -- 0x0403F8 */
+		uint64_t	parity[32];	    /* 0x040400 -- 0x0404F8 */
+	} ce_dre_maint_ds_head_q;
+
+	uint64_t	ce_pad_040500[352];	    /* 0x040500 -- 0x040FF8 */
+
+	/* DRE Downstream Data Queue */
+	struct	ce_dre_maint_ds_data_queue {
+		uint64_t	data63_0[256];	    /* 0x041000 -- 0x0417F8 */
+		uint64_t	ce_pad_041800[256]; /* 0x041800 -- 0x041FF8 */
+		uint64_t	data127_64[256];    /* 0x042000 -- 0x0427F8 */
+		uint64_t	ce_pad_042800[256]; /* 0x042800 -- 0x042FF8 */
+		uint64_t	parity[256];	    /* 0x043000 -- 0x0437F8 */
+		uint64_t	ce_pad_043800[256]; /* 0x043800 -- 0x043FF8 */
+	} ce_dre_maint_ds_data_q;
+
+	/* DRE URE Upstream Response Queue */
+	struct	ce_dre_maint_ure_us_rsp_queue {
+		uint64_t	data63_0[8];	    /* 0x044000 -- 0x044038 */
+		uint64_t	ce_pad_044040[24];  /* 0x044040 -- 0x0440F8 */
+		uint64_t	data127_64[8];      /* 0x044100 -- 0x044138 */
+		uint64_t	ce_pad_044140[24];  /* 0x044140 -- 0x0441F8 */
+		uint64_t	parity[8];	    /* 0x044200 -- 0x044238 */
+		uint64_t	ce_pad_044240[24];  /* 0x044240 -- 0x0442F8 */
+	} ce_dre_maint_ure_us_rsp_q;
+
+	uint64_t 	ce_dre_maint_us_wrt_rsp[32];/* 0x044300 -- 0x0443F8 */
+
+	uint64_t	ce_end_of_struct;			/* 0x044400 */
+} tioce_t;
+
+
+/* ce_adm_int_mask/ce_adm_int_status register bit defines */
+#define CE_ADM_INT_CE_ERROR_SHFT		0
+#define CE_ADM_INT_LSI1_IP_ERROR_SHFT		1
+#define CE_ADM_INT_LSI2_IP_ERROR_SHFT		2
+#define CE_ADM_INT_PCIE_ERROR_SHFT		3
+#define CE_ADM_INT_PORT1_HOTPLUG_EVENT_SHFT	4
+#define CE_ADM_INT_PORT2_HOTPLUG_EVENT_SHFT	5
+#define CE_ADM_INT_PCIE_PORT1_DEV_A_SHFT	6
+#define CE_ADM_INT_PCIE_PORT1_DEV_B_SHFT	7
+#define CE_ADM_INT_PCIE_PORT1_DEV_C_SHFT	8
+#define CE_ADM_INT_PCIE_PORT1_DEV_D_SHFT	9
+#define CE_ADM_INT_PCIE_PORT2_DEV_A_SHFT	10
+#define CE_ADM_INT_PCIE_PORT2_DEV_B_SHFT	11
+#define CE_ADM_INT_PCIE_PORT2_DEV_C_SHFT	12
+#define CE_ADM_INT_PCIE_PORT2_DEV_D_SHFT	13
+#define CE_ADM_INT_PCIE_MSG_SHFT		14 /*see int_dest_14*/
+#define CE_ADM_INT_PCIE_MSG_SLOT_0_SHFT		14
+#define CE_ADM_INT_PCIE_MSG_SLOT_1_SHFT		15
+#define CE_ADM_INT_PCIE_MSG_SLOT_2_SHFT		16
+#define CE_ADM_INT_PCIE_MSG_SLOT_3_SHFT		17
+#define CE_ADM_INT_PORT1_PM_PME_MSG_SHFT	22
+#define CE_ADM_INT_PORT2_PM_PME_MSG_SHFT	23
+
+/* ce_adm_force_int register bit defines */
+#define CE_ADM_FORCE_INT_PCIE_PORT1_DEV_A_SHFT	0
+#define CE_ADM_FORCE_INT_PCIE_PORT1_DEV_B_SHFT	1
+#define CE_ADM_FORCE_INT_PCIE_PORT1_DEV_C_SHFT	2
+#define CE_ADM_FORCE_INT_PCIE_PORT1_DEV_D_SHFT	3
+#define CE_ADM_FORCE_INT_PCIE_PORT2_DEV_A_SHFT	4
+#define CE_ADM_FORCE_INT_PCIE_PORT2_DEV_B_SHFT	5
+#define CE_ADM_FORCE_INT_PCIE_PORT2_DEV_C_SHFT	6
+#define CE_ADM_FORCE_INT_PCIE_PORT2_DEV_D_SHFT	7
+#define CE_ADM_FORCE_INT_ALWAYS_SHFT		8
+
+/* ce_adm_int_dest register bit masks & shifts */
+#define INTR_VECTOR_SHFT			56
+
+/* ce_adm_error_mask and ce_adm_error_summary register bit masks */
+#define CE_ADM_ERR_CRM_SSP_REQ_INVALID			(0x1ULL <<  0)
+#define CE_ADM_ERR_SSP_REQ_HEADER			(0x1ULL <<  1)
+#define CE_ADM_ERR_SSP_RSP_HEADER			(0x1ULL <<  2)
+#define CE_ADM_ERR_SSP_PROTOCOL_ERROR			(0x1ULL <<  3)
+#define CE_ADM_ERR_SSP_SBE				(0x1ULL <<  4)
+#define CE_ADM_ERR_SSP_MBE				(0x1ULL <<  5)
+#define CE_ADM_ERR_CXM_CREDIT_OFLOW			(0x1ULL <<  6)
+#define CE_ADM_ERR_DRE_SSP_REQ_INVAL			(0x1ULL <<  7)
+#define CE_ADM_ERR_SSP_REQ_LONG				(0x1ULL <<  8)
+#define CE_ADM_ERR_SSP_REQ_OFLOW			(0x1ULL <<  9)
+#define CE_ADM_ERR_SSP_REQ_SHORT			(0x1ULL << 10)
+#define CE_ADM_ERR_SSP_REQ_SIDEBAND			(0x1ULL << 11)
+#define CE_ADM_ERR_SSP_REQ_ADDR_ERR			(0x1ULL << 12)
+#define CE_ADM_ERR_SSP_REQ_BAD_BE			(0x1ULL << 13)
+#define CE_ADM_ERR_PCIE_COMPL_TIMEOUT			(0x1ULL << 14)
+#define CE_ADM_ERR_PCIE_UNEXP_COMPL			(0x1ULL << 15)
+#define CE_ADM_ERR_PCIE_ERR_COMPL			(0x1ULL << 16)
+#define CE_ADM_ERR_DRE_CREDIT_OFLOW			(0x1ULL << 17)
+#define CE_ADM_ERR_DRE_SRAM_PE				(0x1ULL << 18)
+#define CE_ADM_ERR_SSP_RSP_INVALID			(0x1ULL << 19)
+#define CE_ADM_ERR_SSP_RSP_LONG				(0x1ULL << 20)
+#define CE_ADM_ERR_SSP_RSP_SHORT			(0x1ULL << 21)
+#define CE_ADM_ERR_SSP_RSP_SIDEBAND			(0x1ULL << 22)
+#define CE_ADM_ERR_URE_SSP_RSP_UNEXP			(0x1ULL << 23)
+#define CE_ADM_ERR_URE_SSP_WR_REQ_TIMEOUT		(0x1ULL << 24)
+#define CE_ADM_ERR_URE_SSP_RD_REQ_TIMEOUT		(0x1ULL << 25)
+#define CE_ADM_ERR_URE_ATE3240_PAGE_FAULT		(0x1ULL << 26)
+#define CE_ADM_ERR_URE_ATE40_PAGE_FAULT			(0x1ULL << 27)
+#define CE_ADM_ERR_URE_CREDIT_OFLOW			(0x1ULL << 28)
+#define CE_ADM_ERR_URE_SRAM_PE				(0x1ULL << 29)
+#define CE_ADM_ERR_ADM_SSP_RSP_UNEXP			(0x1ULL << 30)
+#define CE_ADM_ERR_ADM_SSP_REQ_TIMEOUT			(0x1ULL << 31)
+#define CE_ADM_ERR_MMR_ACCESS_ERROR			(0x1ULL << 32)
+#define CE_ADM_ERR_MMR_ADDR_ERROR			(0x1ULL << 33)
+#define CE_ADM_ERR_ADM_CREDIT_OFLOW			(0x1ULL << 34)
+#define CE_ADM_ERR_ADM_SRAM_PE				(0x1ULL << 35)
+#define CE_ADM_ERR_DTL1_MIN_PDATA_CREDIT_ERR		(0x1ULL << 36)
+#define CE_ADM_ERR_DTL1_INF_COMPL_CRED_UPDT_ERR		(0x1ULL << 37)
+#define CE_ADM_ERR_DTL1_INF_POSTED_CRED_UPDT_ERR	(0x1ULL << 38)
+#define CE_ADM_ERR_DTL1_INF_NPOSTED_CRED_UPDT_ERR	(0x1ULL << 39)
+#define CE_ADM_ERR_DTL1_COMP_HD_CRED_MAX_ERR		(0x1ULL << 40)
+#define CE_ADM_ERR_DTL1_COMP_D_CRED_MAX_ERR		(0x1ULL << 41)
+#define CE_ADM_ERR_DTL1_NPOSTED_HD_CRED_MAX_ERR		(0x1ULL << 42)
+#define CE_ADM_ERR_DTL1_NPOSTED_D_CRED_MAX_ERR		(0x1ULL << 43)
+#define CE_ADM_ERR_DTL1_POSTED_HD_CRED_MAX_ERR		(0x1ULL << 44)
+#define CE_ADM_ERR_DTL1_POSTED_D_CRED_MAX_ERR		(0x1ULL << 45)
+#define CE_ADM_ERR_DTL2_MIN_PDATA_CREDIT_ERR		(0x1ULL << 46)
+#define CE_ADM_ERR_DTL2_INF_COMPL_CRED_UPDT_ERR		(0x1ULL << 47)
+#define CE_ADM_ERR_DTL2_INF_POSTED_CRED_UPDT_ERR	(0x1ULL << 48)
+#define CE_ADM_ERR_DTL2_INF_NPOSTED_CRED_UPDT_ERR	(0x1ULL << 49)
+#define CE_ADM_ERR_DTL2_COMP_HD_CRED_MAX_ERR		(0x1ULL << 50)
+#define CE_ADM_ERR_DTL2_COMP_D_CRED_MAX_ERR		(0x1ULL << 51)
+#define CE_ADM_ERR_DTL2_NPOSTED_HD_CRED_MAX_ERR		(0x1ULL << 52)
+#define CE_ADM_ERR_DTL2_NPOSTED_D_CRED_MAX_ERR		(0x1ULL << 53)
+#define CE_ADM_ERR_DTL2_POSTED_HD_CRED_MAX_ERR		(0x1ULL << 54)
+#define CE_ADM_ERR_DTL2_POSTED_D_CRED_MAX_ERR		(0x1ULL << 55)
+#define CE_ADM_ERR_PORT1_PCIE_COR_ERR			(0x1ULL << 56)
+#define CE_ADM_ERR_PORT1_PCIE_NFAT_ERR			(0x1ULL << 57)
+#define CE_ADM_ERR_PORT1_PCIE_FAT_ERR			(0x1ULL << 58)
+#define CE_ADM_ERR_PORT2_PCIE_COR_ERR			(0x1ULL << 59)
+#define CE_ADM_ERR_PORT2_PCIE_NFAT_ERR			(0x1ULL << 60)
+#define CE_ADM_ERR_PORT2_PCIE_FAT_ERR			(0x1ULL << 61)
+
+/* ce_adm_ure_ups_buf_barrier_flush register bit masks and shifts */
+#define FLUSH_SEL_PORT1_PIPE0_SHFT	0
+#define FLUSH_SEL_PORT1_PIPE1_SHFT	4
+#define FLUSH_SEL_PORT1_PIPE2_SHFT	8
+#define FLUSH_SEL_PORT1_PIPE3_SHFT	12
+#define FLUSH_SEL_PORT2_PIPE0_SHFT	16
+#define FLUSH_SEL_PORT2_PIPE1_SHFT	20
+#define FLUSH_SEL_PORT2_PIPE2_SHFT	24
+#define FLUSH_SEL_PORT2_PIPE3_SHFT	28
+
+/* ce_dre_config1 register bit masks and shifts */
+#define CE_DRE_RO_ENABLE		(0x1ULL << 0)
+#define CE_DRE_DYN_RO_ENABLE		(0x1ULL << 1)
+#define CE_DRE_SUP_CONFIG_COMP_ERROR	(0x1ULL << 2)
+#define CE_DRE_SUP_IO_COMP_ERROR	(0x1ULL << 3)
+#define CE_DRE_ADDR_MODE_SHFT		4
+
+/* ce_dre_config_req_status register bit masks */
+#define CE_DRE_LAST_CONFIG_COMPLETION	(0x7ULL << 0)
+#define CE_DRE_DOWNSTREAM_CONFIG_ERROR	(0x1ULL << 3)
+#define CE_DRE_CONFIG_COMPLETION_VALID	(0x1ULL << 4)
+#define CE_DRE_CONFIG_REQUEST_ACTIVE	(0x1ULL << 5)
+
+/* ce_ure_control register bit masks & shifts */
+#define CE_URE_RD_MRG_ENABLE		(0x1ULL << 0)
+#define CE_URE_WRT_MRG_ENABLE1		(0x1ULL << 4)
+#define CE_URE_WRT_MRG_ENABLE2		(0x1ULL << 5)
+#define CE_URE_RSPQ_BYPASS_DISABLE	(0x1ULL << 24)
+#define CE_URE_UPS_DAT1_PAR_DISABLE	(0x1ULL << 32)
+#define CE_URE_UPS_HDR1_PAR_DISABLE	(0x1ULL << 33)
+#define CE_URE_UPS_DAT2_PAR_DISABLE	(0x1ULL << 34)
+#define CE_URE_UPS_HDR2_PAR_DISABLE	(0x1ULL << 35)
+#define CE_URE_ATE_PAR_DISABLE		(0x1ULL << 36)
+#define CE_URE_RCI_PAR_DISABLE		(0x1ULL << 37)
+#define CE_URE_RSPQ_PAR_DISABLE		(0x1ULL << 38)
+#define CE_URE_DNS_DAT_PAR_DISABLE	(0x1ULL << 39)
+#define CE_URE_DNS_HDR_PAR_DISABLE	(0x1ULL << 40)
+#define CE_URE_MALFORM_DISABLE		(0x1ULL << 44)
+#define CE_URE_UNSUP_DISABLE		(0x1ULL << 45)
+
+/* ce_ure_page_map register bit masks & shifts */
+#define CE_URE_ATE3240_ENABLE		(0x1ULL << 0)
+#define CE_URE_ATE40_ENABLE 		(0x1ULL << 1)
+#define CE_URE_PAGESIZE_SHFT		4
+#define CE_URE_PAGESIZE_MASK		(0x7ULL << CE_URE_PAGESIZE_SHFT)
+#define CE_URE_4K_PAGESIZE		(0x0ULL << CE_URE_PAGESIZE_SHFT)
+#define CE_URE_16K_PAGESIZE		(0x1ULL << CE_URE_PAGESIZE_SHFT)
+#define CE_URE_64K_PAGESIZE		(0x2ULL << CE_URE_PAGESIZE_SHFT)
+#define CE_URE_128K_PAGESIZE		(0x3ULL << CE_URE_PAGESIZE_SHFT)
+#define CE_URE_256K_PAGESIZE		(0x4ULL << CE_URE_PAGESIZE_SHFT)
+
+/* ce_ure_pipe_sel register bit masks & shifts */
+#define PKT_TRAFIC_SHRT			16
+#define BUS_SRC_ID_SHFT			8
+#define DEV_SRC_ID_SHFT			3
+#define FNC_SRC_ID_SHFT			0
+#define CE_URE_TC_MASK			(0x07ULL << PKT_TRAFIC_SHRT)
+#define CE_URE_BUS_MASK			(0xFFULL << BUS_SRC_ID_SHFT)
+#define CE_URE_DEV_MASK			(0x1FULL << DEV_SRC_ID_SHFT)
+#define CE_URE_FNC_MASK			(0x07ULL << FNC_SRC_ID_SHFT)
+#define CE_URE_PIPE_BUS(b)		(((uint64_t)(b) << BUS_SRC_ID_SHFT) & \
+					 CE_URE_BUS_MASK)
+#define CE_URE_PIPE_DEV(d)		(((uint64_t)(d) << DEV_SRC_ID_SHFT) & \
+					 CE_URE_DEV_MASK)
+#define CE_URE_PIPE_FNC(f)		(((uint64_t)(f) << FNC_SRC_ID_SHFT) & \
+					 CE_URE_FNC_MASK)
+
+#define CE_URE_SEL1_SHFT		0
+#define CE_URE_SEL2_SHFT		20
+#define CE_URE_SEL3_SHFT		40
+#define CE_URE_SEL1_MASK		(0x7FFFFULL << CE_URE_SEL1_SHFT)
+#define CE_URE_SEL2_MASK		(0x7FFFFULL << CE_URE_SEL2_SHFT)
+#define CE_URE_SEL3_MASK		(0x7FFFFULL << CE_URE_SEL3_SHFT)
+
+
+/* ce_ure_pipe_mask register bit masks & shifts */
+#define CE_URE_MASK1_SHFT		0
+#define CE_URE_MASK2_SHFT		20
+#define CE_URE_MASK3_SHFT		40
+#define CE_URE_MASK1_MASK		(0x7FFFFULL << CE_URE_MASK1_SHFT)
+#define CE_URE_MASK2_MASK		(0x7FFFFULL << CE_URE_MASK2_SHFT)
+#define CE_URE_MASK3_MASK		(0x7FFFFULL << CE_URE_MASK3_SHFT)
+
+
+/* ce_ure_pcie_control1 register bit masks & shifts */
+#define CE_URE_SI			(0x1ULL << 0)
+#define CE_URE_ELAL_SHFT		4
+#define CE_URE_ELAL_MASK		(0x7ULL << CE_URE_ELAL_SHFT)
+#define CE_URE_ELAL1_SHFT		8
+#define CE_URE_ELAL1_MASK		(0x7ULL << CE_URE_ELAL1_SHFT)
+#define CE_URE_SCC			(0x1ULL << 12)
+#define CE_URE_PN1_SHFT			16
+#define CE_URE_PN1_MASK			(0xFFULL << CE_URE_PN1_SHFT)
+#define CE_URE_PN2_SHFT			24
+#define CE_URE_PN2_MASK			(0xFFULL << CE_URE_PN2_SHFT)
+#define CE_URE_PN1_SET(n)		(((uint64_t)(n) << CE_URE_PN1_SHFT) & \
+					 CE_URE_PN1_MASK)
+#define CE_URE_PN2_SET(n)		(((uint64_t)(n) << CE_URE_PN2_SHFT) & \
+					 CE_URE_PN2_MASK)
+
+/* ce_ure_pcie_control2 register bit masks & shifts */
+#define CE_URE_ABP			(0x1ULL << 0)
+#define CE_URE_PCP			(0x1ULL << 1)
+#define CE_URE_MSP			(0x1ULL << 2)
+#define CE_URE_AIP			(0x1ULL << 3)
+#define CE_URE_PIP			(0x1ULL << 4)
+#define CE_URE_HPS			(0x1ULL << 5)
+#define CE_URE_HPC			(0x1ULL << 6)
+#define CE_URE_SPLV_SHFT		7
+#define CE_URE_SPLV_MASK		(0xFFULL << CE_URE_SPLV_SHFT)
+#define CE_URE_SPLS_SHFT		15
+#define CE_URE_SPLS_MASK		(0x3ULL << CE_URE_SPLS_SHFT)
+#define CE_URE_PSN1_SHFT		19
+#define CE_URE_PSN1_MASK		(0x1FFFULL << CE_URE_PSN1_SHFT)
+#define CE_URE_PSN2_SHFT		32
+#define CE_URE_PSN2_MASK		(0x1FFFULL << CE_URE_PSN2_SHFT)
+#define CE_URE_PSN1_SET(n)		(((uint64_t)(n) << CE_URE_PSN1_SHFT) & \
+					 CE_URE_PSN1_MASK)
+#define CE_URE_PSN2_SET(n)		(((uint64_t)(n) << CE_URE_PSN2_SHFT) & \
+					 CE_URE_PSN2_MASK)
+
+/*
+ * PIO address space ranges for CE
+ */
+
+/* Local CE Registers Space */
+#define CE_PIO_MMR			0x00000000
+#define CE_PIO_MMR_LEN			0x04000000
+
+/* PCI Compatible Config Space */
+#define CE_PIO_CONFIG_SPACE		0x04000000
+#define CE_PIO_CONFIG_SPACE_LEN		0x04000000
+
+/* PCI I/O Space Alias */
+#define CE_PIO_IO_SPACE_ALIAS		0x08000000
+#define CE_PIO_IO_SPACE_ALIAS_LEN	0x08000000
+
+/* PCI Enhanced Config Space */
+#define CE_PIO_E_CONFIG_SPACE		0x10000000
+#define CE_PIO_E_CONFIG_SPACE_LEN	0x10000000
+
+/* PCI I/O Space */
+#define CE_PIO_IO_SPACE			0x100000000
+#define CE_PIO_IO_SPACE_LEN		0x100000000
+
+/* PCI MEM Space */
+#define CE_PIO_MEM_SPACE		0x200000000
+#define CE_PIO_MEM_SPACE_LEN		TIO_HWIN_SIZE
+
+
+/*
+ * CE PCI Enhanced Config Space shifts & masks
+ */
+#define CE_E_CONFIG_BUS_SHFT		20
+#define CE_E_CONFIG_BUS_MASK		(0xFF << CE_E_CONFIG_BUS_SHFT)
+#define CE_E_CONFIG_DEVICE_SHFT		15
+#define CE_E_CONFIG_DEVICE_MASK		(0x1F << CE_E_CONFIG_DEVICE_SHFT)
+#define CE_E_CONFIG_FUNC_SHFT		12
+#define CE_E_CONFIG_FUNC_MASK		(0x7  << CE_E_CONFIG_FUNC_SHFT)
+
+#endif /* __ASM_IA64_SN_TIOCE_H__ */
diff --git a/include/asm-ia64/sn/tioce_provider.h b/include/asm-ia64/sn/tioce_provider.h
new file mode 100644
index 000000000000..7f63dec0a79a
--- /dev/null
+++ b/include/asm-ia64/sn/tioce_provider.h
@@ -0,0 +1,66 @@
+/**************************************************************************
+ *             Copyright (C) 2005, Silicon Graphics, Inc.                 *
+ *									  *
+ *  These coded instructions, statements, and computer programs	 contain  *
+ *  unpublished	 proprietary  information of Silicon Graphics, Inc., and  *
+ *  are protected by Federal copyright law.  They  may	not be disclosed  *
+ *  to	third  parties	or copied or duplicated in any form, in whole or  *
+ *  in part, without the prior written consent of Silicon Graphics, Inc.  *
+ *									  *
+ **************************************************************************/
+
+#ifndef _ASM_IA64_SN_CE_PROVIDER_H
+#define _ASM_IA64_SN_CE_PROVIDER_H
+
+#include <asm/sn/pcibus_provider_defs.h>
+#include <asm/sn/tioce.h>
+
+/*
+ * Common TIOCE structure shared between the prom and kernel
+ *
+ * DO NOT CHANGE THIS STRUCT WITHOUT MAKING CORRESPONDING CHANGES TO THE
+ * PROM VERSION.
+ */
+struct tioce_common {
+	struct pcibus_bussoft	ce_pcibus;	/* common pciio header */
+
+	uint32_t		ce_rev;
+	uint64_t		ce_kernel_private;
+	uint64_t		ce_prom_private;
+};
+
+struct tioce_kernel {
+	struct tioce_common	*ce_common;
+	spinlock_t		ce_lock;
+	struct list_head	ce_dmamap_list;
+
+	uint64_t		ce_ate40_shadow[TIOCE_NUM_M40_ATES];
+	uint64_t		ce_ate3240_shadow[TIOCE_NUM_M3240_ATES];
+	uint32_t		ce_ate3240_pagesize;
+
+	uint8_t			ce_port1_secondary;
+
+	/* per-port resources */
+	struct {
+		int 		dirmap_refcnt;
+		uint64_t	dirmap_shadow;
+	} ce_port[TIOCE_NUM_PORTS];
+};
+
+struct tioce_dmamap {
+	struct list_head	ce_dmamap_list;	/* headed by tioce_kernel */
+	uint32_t		refcnt;
+
+	uint64_t		nbytes;		/* # bytes mapped */
+
+	uint64_t		ct_start;	/* coretalk start address */
+	uint64_t		pci_start;	/* bus start address */
+
+	uint64_t		*ate_hw;	/* hw ptr of first ate in map */
+	uint64_t		*ate_shadow;	/* shadow ptr of firat ate */
+	uint16_t		ate_count;	/* # ate's in the map */
+};
+
+extern int tioce_init_provider(void);
+
+#endif  /* __ASM_IA64_SN_CE_PROVIDER_H */
-- 
cgit v1.2.3


From 0aa2c72e59cf1d09a0b321e4e6292af78a51b8b3 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Thu, 11 Aug 2005 10:26:00 -0700
Subject: [IA64-SGI] - New SN hardware support - use_alias_space

Use local SHUB alias space when referencing MMRs that are known
to be node local. There is a slight performance benefit & code
simplification.

Signed-off-by: Jack Steiner <steiner@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/kernel/irq.c          | 19 +++++++------------
 arch/ia64/sn/pci/pcibr/pcibr_dma.c |  2 +-
 2 files changed, 8 insertions(+), 13 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c
index af061dba246f..607938c288bb 100644
--- a/arch/ia64/sn/kernel/irq.c
+++ b/arch/ia64/sn/kernel/irq.c
@@ -5,7 +5,7 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  *
- * Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.  All Rights Reserved.
  */
 
 #include <linux/irq.h>
@@ -76,16 +76,14 @@ static void sn_enable_irq(unsigned int irq)
 
 static void sn_ack_irq(unsigned int irq)
 {
-	uint64_t event_occurred, mask = 0;
-	int nasid;
+	u64 event_occurred, mask = 0;
 
 	irq = irq & 0xff;
-	nasid = get_nasid();
 	event_occurred =
-	    HUB_L((uint64_t *) GLOBAL_MMR_ADDR(nasid, SH_EVENT_OCCURRED));
+	    HUB_L((u64*)LOCAL_MMR_ADDR(SH_EVENT_OCCURRED));
 	mask = event_occurred & SH_ALL_INT_MASK;
-	HUB_S((uint64_t *) GLOBAL_MMR_ADDR(nasid, SH_EVENT_OCCURRED_ALIAS),
-		 mask);
+	HUB_S((u64*)LOCAL_MMR_ADDR(SH_EVENT_OCCURRED_ALIAS),
+	      mask);
 	__set_bit(irq, (volatile void *)pda->sn_in_service_ivecs);
 
 	move_irq(irq);
@@ -93,15 +91,12 @@ static void sn_ack_irq(unsigned int irq)
 
 static void sn_end_irq(unsigned int irq)
 {
-	int nasid;
 	int ivec;
-	uint64_t event_occurred;
+	u64 event_occurred;
 
 	ivec = irq & 0xff;
 	if (ivec == SGI_UART_VECTOR) {
-		nasid = get_nasid();
-		event_occurred = HUB_L((uint64_t *) GLOBAL_MMR_ADDR
-				       (nasid, SH_EVENT_OCCURRED));
+		event_occurred = HUB_L((u64*)LOCAL_MMR_ADDR (SH_EVENT_OCCURRED));
 		/* If the UART bit is set here, we may have received an
 		 * interrupt from the UART that the driver missed.  To
 		 * make sure, we IPI ourselves to force us to look again.
diff --git a/arch/ia64/sn/pci/pcibr/pcibr_dma.c b/arch/ia64/sn/pci/pcibr/pcibr_dma.c
index ae455b6b1897..0f254255f6a1 100644
--- a/arch/ia64/sn/pci/pcibr/pcibr_dma.c
+++ b/arch/ia64/sn/pci/pcibr/pcibr_dma.c
@@ -297,7 +297,7 @@ void sn_dma_flush(uint64_t addr)
 		 * If CE ever needs the sn_dma_flush mechanism, we will have
 		 * to account for that here and in tioce_bus_fixup().
 	 	 */
-		uint32_t tio_id = REMOTE_HUB_L(nasid, TIO_NODE_ID);
+		uint32_t tio_id = HUB_L(TIO_IOSPACE_ADDR(nasid, TIO_NODE_ID));
 		uint32_t revnum = XWIDGET_PART_REV_NUM(tio_id);
 
 		/* TIOCP BRINGUP WAR (PV907516): Don't write buffer flush reg */
-- 
cgit v1.2.3


From 2fdbb590e4f9b346e5d06cf7f85dcb7a9f2e0a48 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Thu, 11 Aug 2005 10:26:00 -0700
Subject: [IA64-SGI] - New SN hardware support - boot_init_shub2

Update the addresses of the pio_write_status_addr so that
they are correct for newer processors. Shub2 did not number
the threads in the order that I had expected.

Signed-off-by: Jack Steiner <steiner@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/kernel/setup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c
index 7c7fe441d623..ed77715393ef 100644
--- a/arch/ia64/sn/kernel/setup.c
+++ b/arch/ia64/sn/kernel/setup.c
@@ -532,8 +532,8 @@ void __init sn_cpu_init(void)
 	 */
 	{
 		u64 pio1[] = {SH1_PIO_WRITE_STATUS_0, 0, SH1_PIO_WRITE_STATUS_1, 0};
-		u64 pio2[] = {SH2_PIO_WRITE_STATUS_0, SH2_PIO_WRITE_STATUS_1,
-			SH2_PIO_WRITE_STATUS_2, SH2_PIO_WRITE_STATUS_3};
+		u64 pio2[] = {SH2_PIO_WRITE_STATUS_0, SH2_PIO_WRITE_STATUS_2,
+			SH2_PIO_WRITE_STATUS_1, SH2_PIO_WRITE_STATUS_3};
 		u64 *pio;
 		pio = is_shub1() ? pio1 : pio2;
 		pda->pio_write_status_addr = (volatile unsigned long *) LOCAL_MMR_ADDR(pio[slice]);
-- 
cgit v1.2.3


From 1007d02160974a46cc4fd3b1737c183c0471b88f Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Thu, 11 Aug 2005 10:27:00 -0700
Subject: [IA64-SGI] - New SN hardware support - no_wars

Disable some shub1-specific code when running on systems with shub2.

Signed-off-by: Jack Steiner <steiner@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/kernel/sn2/timer_interrupt.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/sn/kernel/sn2/timer_interrupt.c b/arch/ia64/sn/kernel/sn2/timer_interrupt.c
index cde7375390b0..adf5db2e2afe 100644
--- a/arch/ia64/sn/kernel/sn2/timer_interrupt.c
+++ b/arch/ia64/sn/kernel/sn2/timer_interrupt.c
@@ -1,7 +1,7 @@
 /*
  *
  *
- * Copyright (c) 2003 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2005 Silicon Graphics, Inc.  All Rights Reserved.
  * 
  * This program is free software; you can redistribute it and/or modify it 
  * under the terms of version 2 of the GNU General Public License 
@@ -50,14 +50,16 @@ void sn_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 			     LED_CPU_HEARTBEAT, LED_CPU_HEARTBEAT);
 	}
 
-	if (enable_shub_wars_1_1()) {
-		/* Bugfix code for SHUB 1.1 */
-		if (pda->pio_shub_war_cam_addr)
-			*pda->pio_shub_war_cam_addr = 0x8000000000000010UL;
+	if (is_shub1()) {
+		if (enable_shub_wars_1_1()) {
+			/* Bugfix code for SHUB 1.1 */
+			if (pda->pio_shub_war_cam_addr)
+				*pda->pio_shub_war_cam_addr = 0x8000000000000010UL;
+		}
+		if (pda->sn_lb_int_war_ticks == 0)
+			sn_lb_int_war_check();
+		pda->sn_lb_int_war_ticks++;
+		if (pda->sn_lb_int_war_ticks >= SN_LB_INT_WAR_INTERVAL)
+			pda->sn_lb_int_war_ticks = 0;
 	}
-	if (pda->sn_lb_int_war_ticks == 0)
-		sn_lb_int_war_check();
-	pda->sn_lb_int_war_ticks++;
-	if (pda->sn_lb_int_war_ticks >= SN_LB_INT_WAR_INTERVAL)
-		pda->sn_lb_int_war_ticks = 0;
 }
-- 
cgit v1.2.3


From 7e95b9d6e21eda23bac1ff024d465d2072c8996d Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Thu, 11 Aug 2005 10:27:00 -0700
Subject: [IA64-SGI] - New SN hardware support - bte_fixes

Change the BTE driver so that it works for both shub1 and
shub2. Most of the changes are related to the number of cores
that use the BTE engine, to the MMR addresses of various
shub registers, and to using the correct processor or network
physical address.

Signed-off-by: Russ Anderson <rja@sgi.com>
Signed-off-by: Jack Steiner <steiner@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/kernel/bte.c | 82 +++++++++++++++++++++++++++++------------------
 1 file changed, 51 insertions(+), 31 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/sn/kernel/bte.c b/arch/ia64/sn/kernel/bte.c
index 647deae9bfcd..7adef84190db 100644
--- a/arch/ia64/sn/kernel/bte.c
+++ b/arch/ia64/sn/kernel/bte.c
@@ -29,16 +29,30 @@
 
 /* two interfaces on two btes */
 #define MAX_INTERFACES_TO_TRY		4
+#define MAX_NODES_TO_TRY		2
 
 static struct bteinfo_s *bte_if_on_node(nasid_t nasid, int interface)
 {
 	nodepda_t *tmp_nodepda;
 
+	if (nasid_to_cnodeid(nasid) == -1)
+		return (struct bteinfo_s *)NULL;;
+
 	tmp_nodepda = NODEPDA(nasid_to_cnodeid(nasid));
 	return &tmp_nodepda->bte_if[interface];
 
 }
 
+static inline void bte_start_transfer(struct bteinfo_s *bte, u64 len, u64 mode)
+{
+	if (is_shub2()) {
+		BTE_CTRL_STORE(bte, (IBLS_BUSY | ((len) | (mode) << 24)));
+	} else {
+		BTE_LNSTAT_STORE(bte, len);
+		BTE_CTRL_STORE(bte, mode);
+	}
+}
+
 /************************************************************************
  * Block Transfer Engine copy related functions.
  *
@@ -67,13 +81,15 @@ bte_result_t bte_copy(u64 src, u64 dest, u64 len, u64 mode, void *notification)
 {
 	u64 transfer_size;
 	u64 transfer_stat;
+	u64 notif_phys_addr;
 	struct bteinfo_s *bte;
 	bte_result_t bte_status;
 	unsigned long irq_flags;
 	unsigned long itc_end = 0;
-	struct bteinfo_s *btes_to_try[MAX_INTERFACES_TO_TRY];
-	int bte_if_index;
-	int bte_pri, bte_sec;
+	int nasid_to_try[MAX_NODES_TO_TRY];
+	int my_nasid = get_nasid();
+	int bte_if_index, nasid_index;
+	int bte_first, btes_per_node = BTES_PER_NODE;
 
 	BTE_PRINTK(("bte_copy(0x%lx, 0x%lx, 0x%lx, 0x%lx, 0x%p)\n",
 		    src, dest, len, mode, notification));
@@ -86,36 +102,26 @@ bte_result_t bte_copy(u64 src, u64 dest, u64 len, u64 mode, void *notification)
 		 (src & L1_CACHE_MASK) || (dest & L1_CACHE_MASK));
 	BUG_ON(!(len < ((BTE_LEN_MASK + 1) << L1_CACHE_SHIFT)));
 
-	/* CPU 0 (per node) tries bte0 first, CPU 1 try bte1 first */
-	if (cpuid_to_subnode(smp_processor_id()) == 0) {
-		bte_pri = 0;
-		bte_sec = 1;
-	} else {
-		bte_pri = 1;
-		bte_sec = 0;
-	}
+	/*
+	 * Start with interface corresponding to cpu number
+	 */
+	bte_first = get_cpu() % btes_per_node;
 
 	if (mode & BTE_USE_DEST) {
 		/* try remote then local */
-		btes_to_try[0] = bte_if_on_node(NASID_GET(dest), bte_pri);
-		btes_to_try[1] = bte_if_on_node(NASID_GET(dest), bte_sec);
+		nasid_to_try[0] = NASID_GET(dest);
 		if (mode & BTE_USE_ANY) {
-			btes_to_try[2] = bte_if_on_node(get_nasid(), bte_pri);
-			btes_to_try[3] = bte_if_on_node(get_nasid(), bte_sec);
+			nasid_to_try[1] = my_nasid;
 		} else {
-			btes_to_try[2] = NULL;
-			btes_to_try[3] = NULL;
+			nasid_to_try[1] = (int)NULL;
 		}
 	} else {
 		/* try local then remote */
-		btes_to_try[0] = bte_if_on_node(get_nasid(), bte_pri);
-		btes_to_try[1] = bte_if_on_node(get_nasid(), bte_sec);
+		nasid_to_try[0] = my_nasid;
 		if (mode & BTE_USE_ANY) {
-			btes_to_try[2] = bte_if_on_node(NASID_GET(dest), bte_pri);
-			btes_to_try[3] = bte_if_on_node(NASID_GET(dest), bte_sec);
+			nasid_to_try[1] = NASID_GET(dest);
 		} else {
-			btes_to_try[2] = NULL;
-			btes_to_try[3] = NULL;
+			nasid_to_try[1] = (int)NULL;
 		}
 	}
 
@@ -123,11 +129,12 @@ retry_bteop:
 	do {
 		local_irq_save(irq_flags);
 
-		bte_if_index = 0;
+		bte_if_index = bte_first;
+		nasid_index = 0;
 
 		/* Attempt to lock one of the BTE interfaces. */
-		while (bte_if_index < MAX_INTERFACES_TO_TRY) {
-			bte = btes_to_try[bte_if_index++];
+		while (nasid_index < MAX_NODES_TO_TRY) {
+			bte = bte_if_on_node(nasid_to_try[nasid_index],bte_if_index);
 
 			if (bte == NULL) {
 				continue;
@@ -143,6 +150,15 @@ retry_bteop:
 					break;
 				}
 			}
+
+			bte_if_index = (bte_if_index + 1) % btes_per_node; /* Next interface */
+			if (bte_if_index == bte_first) {
+				/*
+				 * We've tried all interfaces on this node
+				 */
+				nasid_index++;
+			}
+
 			bte = NULL;
 		}
 
@@ -169,7 +185,13 @@ retry_bteop:
 
 	/* Initialize the notification to a known value. */
 	*bte->most_rcnt_na = BTE_WORD_BUSY;
+	notif_phys_addr = TO_PHYS(ia64_tpa((unsigned long)bte->most_rcnt_na));
 
+	if (is_shub2()) {
+		src = SH2_TIO_PHYS_TO_DMA(src);
+		dest = SH2_TIO_PHYS_TO_DMA(dest);
+		notif_phys_addr = SH2_TIO_PHYS_TO_DMA(notif_phys_addr);
+	}
 	/* Set the source and destination registers */
 	BTE_PRINTKV(("IBSA = 0x%lx)\n", (TO_PHYS(src))));
 	BTE_SRC_STORE(bte, TO_PHYS(src));
@@ -177,14 +199,12 @@ retry_bteop:
 	BTE_DEST_STORE(bte, TO_PHYS(dest));
 
 	/* Set the notification register */
-	BTE_PRINTKV(("IBNA = 0x%lx)\n",
-		     TO_PHYS(ia64_tpa((unsigned long)bte->most_rcnt_na))));
-	BTE_NOTIF_STORE(bte,
-			TO_PHYS(ia64_tpa((unsigned long)bte->most_rcnt_na)));
+	BTE_PRINTKV(("IBNA = 0x%lx)\n", notif_phys_addr));
+	BTE_NOTIF_STORE(bte, notif_phys_addr);
 
 	/* Initiate the transfer */
 	BTE_PRINTK(("IBCT = 0x%lx)\n", BTE_VALID_MODE(mode)));
-	BTE_START_TRANSFER(bte, transfer_size, BTE_VALID_MODE(mode));
+	bte_start_transfer(bte, transfer_size, BTE_VALID_MODE(mode));
 
 	itc_end = ia64_get_itc() + (40000000 * local_cpu_data->cyc_per_usec);
 
-- 
cgit v1.2.3


From 68b9753f47953930cb94de0223c163f289399091 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Thu, 11 Aug 2005 10:28:00 -0700
Subject: [IA64-SGI] - New SN hardware support - cpu_relax

Add a few missing calls to "hint @pause".

Signed-off-by: Jack Steiner <steiner@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/kernel/bte.c          | 1 +
 arch/ia64/sn/kernel/huberror.c     | 2 +-
 arch/ia64/sn/pci/pcibr/pcibr_dma.c | 3 ++-
 include/asm-ia64/sn/sn_sal.h       | 3 ++-
 4 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/sn/kernel/bte.c b/arch/ia64/sn/kernel/bte.c
index 7adef84190db..b75814efadba 100644
--- a/arch/ia64/sn/kernel/bte.c
+++ b/arch/ia64/sn/kernel/bte.c
@@ -215,6 +215,7 @@ retry_bteop:
 	}
 
 	while ((transfer_stat = *bte->most_rcnt_na) == BTE_WORD_BUSY) {
+		cpu_relax();
 		if (ia64_get_itc() > itc_end) {
 			BTE_PRINTK(("BTE timeout nasid 0x%x bte%d IBLS = 0x%lx na 0x%lx\n",
 				NASID_GET(bte->bte_base_addr), bte->bte_num,
diff --git a/arch/ia64/sn/kernel/huberror.c b/arch/ia64/sn/kernel/huberror.c
index 5c39b43ba3c0..5c5eb01c50f0 100644
--- a/arch/ia64/sn/kernel/huberror.c
+++ b/arch/ia64/sn/kernel/huberror.c
@@ -76,7 +76,7 @@ void hubiio_crb_free(struct hubdev_info *hubdev_info, int crbnum)
 	 */
 	REMOTE_HUB_S(hubdev_info->hdi_nasid, IIO_ICDR, (IIO_ICDR_PND | crbnum));
 	while (REMOTE_HUB_L(hubdev_info->hdi_nasid, IIO_ICDR) & IIO_ICDR_PND)
-		udelay(1);
+		cpu_relax();
 
 }
 
diff --git a/arch/ia64/sn/pci/pcibr/pcibr_dma.c b/arch/ia64/sn/pci/pcibr/pcibr_dma.c
index 0f254255f6a1..34093476e965 100644
--- a/arch/ia64/sn/pci/pcibr/pcibr_dma.c
+++ b/arch/ia64/sn/pci/pcibr/pcibr_dma.c
@@ -317,7 +317,8 @@ void sn_dma_flush(uint64_t addr)
 		*(volatile uint32_t *)(p->sfdl_force_int_addr) = 1;
 
 		/* wait for the interrupt to come back. */
-		while (*(p->sfdl_flush_addr) != 0x10f) ;
+		while (*(p->sfdl_flush_addr) != 0x10f)
+			cpu_relax();
 
 		/* okay, everything is synched up. */
 		spin_unlock_irqrestore((spinlock_t *)&p->sfdl_flush_lock, flags);
diff --git a/include/asm-ia64/sn/sn_sal.h b/include/asm-ia64/sn/sn_sal.h
index 27976d223186..99cb9ed34312 100644
--- a/include/asm-ia64/sn/sn_sal.h
+++ b/include/asm-ia64/sn/sn_sal.h
@@ -749,7 +749,8 @@ ia64_sn_power_down(void)
 {
 	struct ia64_sal_retval ret_stuff;
 	SAL_CALL(ret_stuff, SN_SAL_SYSTEM_POWER_DOWN, 0, 0, 0, 0, 0, 0, 0);
-	while(1);
+	while(1)
+		cpu_relax();
 	/* never returns */
 }
 
-- 
cgit v1.2.3


From 470ceb05d9a2b4d61c19fac615a79e56e8401565 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Thu, 11 Aug 2005 10:28:00 -0700
Subject: [IA64-SGI] - New SN hardware support - ptc_fixes

Shub2 provides a much improved mechanism for issuing internode
TLB purges. Add code to support the newer mechanism. There is also
some debug code (disabled) that is useful for testing.

Collect statistics on the number, type & duration of TLB purges.
This data will be useful for making future improvements in the algorithms.

Signed-off-by: Jack Steiner <steiner@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/kernel/setup.c            |   1 +
 arch/ia64/sn/kernel/sn2/ptc_deadlock.S |  13 +-
 arch/ia64/sn/kernel/sn2/sn2_smp.c      | 256 ++++++++++++++++++++++++++++++---
 include/asm-ia64/sn/nodepda.h          |   3 +-
 4 files changed, 245 insertions(+), 28 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c
index ed77715393ef..6648deb778a6 100644
--- a/arch/ia64/sn/kernel/setup.c
+++ b/arch/ia64/sn/kernel/setup.c
@@ -403,6 +403,7 @@ static void __init sn_init_pdas(char **cmdline_p)
 		memset(nodepdaindr[cnode], 0, sizeof(nodepda_t));
 		memset(nodepdaindr[cnode]->phys_cpuid, -1,
 		    sizeof(nodepdaindr[cnode]->phys_cpuid));
+		spin_lock_init(&nodepdaindr[cnode]->ptc_lock);
 	}
 
 	/*
diff --git a/arch/ia64/sn/kernel/sn2/ptc_deadlock.S b/arch/ia64/sn/kernel/sn2/ptc_deadlock.S
index 96cb71d15682..3fa95065a446 100644
--- a/arch/ia64/sn/kernel/sn2/ptc_deadlock.S
+++ b/arch/ia64/sn/kernel/sn2/ptc_deadlock.S
@@ -3,7 +3,7 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  *
- * Copyright (C) 2000-2004 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2000-2005 Silicon Graphics, Inc. All rights reserved.
  */
 
 #include <asm/types.h>
@@ -11,7 +11,7 @@
 
 #define DEADLOCKBIT	SH_PIO_WRITE_STATUS_WRITE_DEADLOCK_SHFT
 #define WRITECOUNTMASK	SH_PIO_WRITE_STATUS_PENDING_WRITE_COUNT_MASK
-#define ALIAS_OFFSET	(SH1_PIO_WRITE_STATUS_0_ALIAS-SH1_PIO_WRITE_STATUS_0)
+#define ALIAS_OFFSET	8
 
 
 	.global	sn2_ptc_deadlock_recovery_core
@@ -36,13 +36,15 @@ sn2_ptc_deadlock_recovery_core:
 	extr.u	piowcphy=piowc,0,61;;	// Convert piowc to uncached physical address
 	dep	piowcphy=-1,piowcphy,63,1
 	movl	mask=WRITECOUNTMASK
+	mov	r8=r0
 
 1:
 	add	scr2=ALIAS_OFFSET,piowc	// Address of WRITE_STATUS alias register 
-	mov	scr1=7;;		// Clear DEADLOCK, WRITE_ERROR, MULTI_WRITE_ERROR
-	st8.rel	[scr2]=scr1;;
+	;;
+	ld8.acq	scr1=[scr2];;
 
 5:	ld8.acq	scr1=[piowc];;		// Wait for PIOs to complete.
+	hint	@pause
 	and	scr2=scr1,mask;;	// mask of writecount bits
 	cmp.ne	p6,p0=zeroval,scr2
 (p6)	br.cond.sptk 5b
@@ -57,6 +59,7 @@ sn2_ptc_deadlock_recovery_core:
 	st8.rel [ptc0]=data0		// Write PTC0 & wait for completion.
 
 5:	ld8.acq	scr1=[piowcphy];;	// Wait for PIOs to complete.
+	hint	@pause
 	and	scr2=scr1,mask;;	// mask of writecount bits
 	cmp.ne	p6,p0=zeroval,scr2
 (p6)	br.cond.sptk 5b;;
@@ -67,6 +70,7 @@ sn2_ptc_deadlock_recovery_core:
 (p7)	st8.rel [ptc1]=data1;;		// Now write PTC1.
 
 5:	ld8.acq	scr1=[piowcphy];;	// Wait for PIOs to complete.
+	hint	@pause
 	and	scr2=scr1,mask;;	// mask of writecount bits
 	cmp.ne	p6,p0=zeroval,scr2
 (p6)	br.cond.sptk 5b
@@ -77,6 +81,7 @@ sn2_ptc_deadlock_recovery_core:
 	srlz.i;;
 	////////////// END   PHYSICAL MODE ////////////////////
 
+(p8)	add	r8=1,r8
 (p8)	br.cond.spnt 1b;;		// Repeat if DEADLOCK occurred.
 
 	br.ret.sptk	rp
diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c
index 7af05a7ac743..0a4ee50c302f 100644
--- a/arch/ia64/sn/kernel/sn2/sn2_smp.c
+++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c
@@ -5,7 +5,7 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  *
- * Copyright (C) 2000-2004 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2000-2005 Silicon Graphics, Inc. All rights reserved.
  */
 
 #include <linux/init.h>
@@ -20,6 +20,8 @@
 #include <linux/module.h>
 #include <linux/bitops.h>
 #include <linux/nodemask.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 
 #include <asm/processor.h>
 #include <asm/irq.h>
@@ -39,12 +41,120 @@
 #include <asm/sn/nodepda.h>
 #include <asm/sn/rw_mmr.h>
 
-void sn2_ptc_deadlock_recovery(volatile unsigned long *, unsigned long data0, 
-	volatile unsigned long *, unsigned long data1);
+DEFINE_PER_CPU(struct ptc_stats, ptcstats);
+DECLARE_PER_CPU(struct ptc_stats, ptcstats);
 
 static  __cacheline_aligned DEFINE_SPINLOCK(sn2_global_ptc_lock);
 
-static unsigned long sn2_ptc_deadlock_count;
+void sn2_ptc_deadlock_recovery(short *, short, int, volatile unsigned long *, unsigned long data0,
+	volatile unsigned long *, unsigned long data1);
+
+#ifdef DEBUG_PTC
+/*
+ * ptctest:
+ *
+ * 	xyz - 3 digit hex number:
+ * 		x - Force PTC purges to use shub:
+ * 			0 - no force
+ * 			1 - force
+ * 		y - interupt enable
+ * 			0 - disable interrupts
+ * 			1 - leave interuupts enabled
+ * 		z - type of lock:
+ * 			0 - global lock
+ * 			1 - node local lock
+ * 			2 - no lock
+ *
+ *   	Note: on shub1, only ptctest == 0 is supported. Don't try other values!
+ */
+
+static unsigned int sn2_ptctest = 0;
+
+static int __init ptc_test(char *str)
+{
+	get_option(&str, &sn2_ptctest);
+	return 1;
+}
+__setup("ptctest=", ptc_test);
+
+static inline int ptc_lock(unsigned long *flagp)
+{
+	unsigned long opt = sn2_ptctest & 255;
+
+	switch (opt) {
+	case 0x00:
+		spin_lock_irqsave(&sn2_global_ptc_lock, *flagp);
+		break;
+	case 0x01:
+		spin_lock_irqsave(&sn_nodepda->ptc_lock, *flagp);
+		break;
+	case 0x02:
+		local_irq_save(*flagp);
+		break;
+	case 0x10:
+		spin_lock(&sn2_global_ptc_lock);
+		break;
+	case 0x11:
+		spin_lock(&sn_nodepda->ptc_lock);
+		break;
+	case 0x12:
+		break;
+	default:
+		BUG();
+	}
+	return opt;
+}
+
+static inline void ptc_unlock(unsigned long flags, int opt)
+{
+	switch (opt) {
+	case 0x00:
+		spin_unlock_irqrestore(&sn2_global_ptc_lock, flags);
+		break;
+	case 0x01:
+		spin_unlock_irqrestore(&sn_nodepda->ptc_lock, flags);
+		break;
+	case 0x02:
+		local_irq_restore(flags);
+		break;
+	case 0x10:
+		spin_unlock(&sn2_global_ptc_lock);
+		break;
+	case 0x11:
+		spin_unlock(&sn_nodepda->ptc_lock);
+		break;
+	case 0x12:
+		break;
+	default:
+		BUG();
+	}
+}
+#else
+
+#define sn2_ptctest	0
+
+static inline int ptc_lock(unsigned long *flagp)
+{
+	spin_lock_irqsave(&sn2_global_ptc_lock, *flagp);
+	return 0;
+}
+
+static inline void ptc_unlock(unsigned long flags, int opt)
+{
+	spin_unlock_irqrestore(&sn2_global_ptc_lock, flags);
+}
+#endif
+
+struct ptc_stats {
+	unsigned long ptc_l;
+	unsigned long change_rid;
+	unsigned long shub_ptc_flushes;
+	unsigned long nodes_flushed;
+	unsigned long deadlocks;
+	unsigned long lock_itc_clocks;
+	unsigned long shub_itc_clocks;
+	unsigned long shub_itc_clocks_max;
+};
 
 static inline unsigned long wait_piowc(void)
 {
@@ -89,9 +199,9 @@ void
 sn2_global_tlb_purge(unsigned long start, unsigned long end,
 		     unsigned long nbits)
 {
-	int i, shub1, cnode, mynasid, cpu, lcpu = 0, nasid, flushed = 0;
+	int i, opt, shub1, cnode, mynasid, cpu, lcpu = 0, nasid, flushed = 0;
 	volatile unsigned long *ptc0, *ptc1;
-	unsigned long flags = 0, data0 = 0, data1 = 0;
+	unsigned long itc, itc2, flags, data0 = 0, data1 = 0;
 	struct mm_struct *mm = current->active_mm;
 	short nasids[MAX_NUMNODES], nix;
 	nodemask_t nodes_flushed;
@@ -114,16 +224,19 @@ sn2_global_tlb_purge(unsigned long start, unsigned long end,
 			start += (1UL << nbits);
 		} while (start < end);
 		ia64_srlz_i();
+		__get_cpu_var(ptcstats).ptc_l++;
 		preempt_enable();
 		return;
 	}
 
 	if (atomic_read(&mm->mm_users) == 1) {
 		flush_tlb_mm(mm);
+		__get_cpu_var(ptcstats).change_rid++;
 		preempt_enable();
 		return;
 	}
 
+	itc = ia64_get_itc();
 	nix = 0;
 	for_each_node_mask(cnode, nodes_flushed)
 		nasids[nix++] = cnodeid_to_nasid(cnode);
@@ -148,7 +261,12 @@ sn2_global_tlb_purge(unsigned long start, unsigned long end,
 
 	mynasid = get_nasid();
 
-	spin_lock_irqsave(&sn2_global_ptc_lock, flags);
+	itc = ia64_get_itc();
+	opt = ptc_lock(&flags);
+	itc2 = ia64_get_itc();
+	__get_cpu_var(ptcstats).lock_itc_clocks += itc2 - itc;
+	__get_cpu_var(ptcstats).shub_ptc_flushes++;
+	__get_cpu_var(ptcstats).nodes_flushed += nix;
 
 	do {
 		if (shub1)
@@ -157,7 +275,7 @@ sn2_global_tlb_purge(unsigned long start, unsigned long end,
 			data0 = (data0 & ~SH2_PTC_ADDR_MASK) | (start & SH2_PTC_ADDR_MASK);
 		for (i = 0; i < nix; i++) {
 			nasid = nasids[i];
-			if (unlikely(nasid == mynasid)) {
+			if ((!(sn2_ptctest & 3)) && unlikely(nasid == mynasid)) {
 				ia64_ptcga(start, nbits << 2);
 				ia64_srlz_i();
 			} else {
@@ -169,18 +287,22 @@ sn2_global_tlb_purge(unsigned long start, unsigned long end,
 				flushed = 1;
 			}
 		}
-
 		if (flushed
 		    && (wait_piowc() &
-			SH_PIO_WRITE_STATUS_WRITE_DEADLOCK_MASK)) {
-			sn2_ptc_deadlock_recovery(ptc0, data0, ptc1, data1);
+				(SH_PIO_WRITE_STATUS_WRITE_DEADLOCK_MASK))) {
+			sn2_ptc_deadlock_recovery(nasids, nix, mynasid, ptc0, data0, ptc1, data1);
 		}
 
 		start += (1UL << nbits);
 
 	} while (start < end);
 
-	spin_unlock_irqrestore(&sn2_global_ptc_lock, flags);
+	itc2 = ia64_get_itc() - itc2;
+	__get_cpu_var(ptcstats).shub_itc_clocks += itc2;
+	if (itc2 > __get_cpu_var(ptcstats).shub_itc_clocks_max)
+		__get_cpu_var(ptcstats).shub_itc_clocks_max = itc2;
+
+	ptc_unlock(flags, opt);
 
 	preempt_enable();
 }
@@ -192,31 +314,29 @@ sn2_global_tlb_purge(unsigned long start, unsigned long end,
  * TLB flush transaction.  The recovery sequence is somewhat tricky & is
  * coded in assembly language.
  */
-void sn2_ptc_deadlock_recovery(volatile unsigned long *ptc0, unsigned long data0,
+void sn2_ptc_deadlock_recovery(short *nasids, short nix, int mynasid, volatile unsigned long *ptc0, unsigned long data0,
 	volatile unsigned long *ptc1, unsigned long data1)
 {
 	extern void sn2_ptc_deadlock_recovery_core(volatile unsigned long *, unsigned long,
 	        volatile unsigned long *, unsigned long, volatile unsigned long *, unsigned long);
-	int cnode, mycnode, nasid;
-	volatile unsigned long *piows;
-	volatile unsigned long zeroval;
+	short nasid, i;
+	unsigned long *piows, zeroval;
 
-	sn2_ptc_deadlock_count++;
+	__get_cpu_var(ptcstats).deadlocks++;
 
-	piows = pda->pio_write_status_addr;
+	piows = (unsigned long *) pda->pio_write_status_addr;
 	zeroval = pda->pio_write_status_val;
 
-	mycnode = numa_node_id();
-
-	for_each_online_node(cnode) {
-		if (is_headless_node(cnode) || cnode == mycnode)
+	for (i=0; i < nix; i++) {
+		nasid = nasids[i];
+		if (!(sn2_ptctest & 3) && nasid == mynasid)
 			continue;
-		nasid = cnodeid_to_nasid(cnode);
 		ptc0 = CHANGE_NASID(nasid, ptc0);
 		if (ptc1)
 			ptc1 = CHANGE_NASID(nasid, ptc1);
 		sn2_ptc_deadlock_recovery_core(ptc0, data0, ptc1, data1, piows, zeroval);
 	}
+
 }
 
 /**
@@ -293,3 +413,93 @@ void sn2_send_IPI(int cpuid, int vector, int delivery_mode, int redirect)
 
 	sn_send_IPI_phys(nasid, physid, vector, delivery_mode);
 }
+
+#ifdef CONFIG_PROC_FS
+
+#define PTC_BASENAME	"sgi_sn/ptc_statistics"
+
+static void *sn2_ptc_seq_start(struct seq_file *file, loff_t * offset)
+{
+	if (*offset < NR_CPUS)
+		return offset;
+	return NULL;
+}
+
+static void *sn2_ptc_seq_next(struct seq_file *file, void *data, loff_t * offset)
+{
+	(*offset)++;
+	if (*offset < NR_CPUS)
+		return offset;
+	return NULL;
+}
+
+static void sn2_ptc_seq_stop(struct seq_file *file, void *data)
+{
+}
+
+static int sn2_ptc_seq_show(struct seq_file *file, void *data)
+{
+	struct ptc_stats *stat;
+	int cpu;
+
+	cpu = *(loff_t *) data;
+
+	if (!cpu) {
+		seq_printf(file, "# ptc_l change_rid shub_ptc_flushes shub_nodes_flushed deadlocks lock_nsec shub_nsec shub_nsec_max\n");
+		seq_printf(file, "# ptctest %d\n", sn2_ptctest);
+	}
+
+	if (cpu < NR_CPUS && cpu_online(cpu)) {
+		stat = &per_cpu(ptcstats, cpu);
+		seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld\n", cpu, stat->ptc_l,
+				stat->change_rid, stat->shub_ptc_flushes, stat->nodes_flushed,
+				stat->deadlocks,
+				1000 * stat->lock_itc_clocks / per_cpu(cpu_info, cpu).cyc_per_usec,
+				1000 * stat->shub_itc_clocks / per_cpu(cpu_info, cpu).cyc_per_usec,
+				1000 * stat->shub_itc_clocks_max / per_cpu(cpu_info, cpu).cyc_per_usec);
+	}
+
+	return 0;
+}
+
+static struct seq_operations sn2_ptc_seq_ops = {
+	.start = sn2_ptc_seq_start,
+	.next = sn2_ptc_seq_next,
+	.stop = sn2_ptc_seq_stop,
+	.show = sn2_ptc_seq_show
+};
+
+int sn2_ptc_proc_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &sn2_ptc_seq_ops);
+}
+
+static struct file_operations proc_sn2_ptc_operations = {
+	.open = sn2_ptc_proc_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+static struct proc_dir_entry *proc_sn2_ptc;
+
+static int __init sn2_ptc_init(void)
+{
+	if (!(proc_sn2_ptc = create_proc_entry(PTC_BASENAME, 0444, NULL))) {
+		printk(KERN_ERR "unable to create %s proc entry", PTC_BASENAME);
+		return -EINVAL;
+	}
+	proc_sn2_ptc->proc_fops = &proc_sn2_ptc_operations;
+	spin_lock_init(&sn2_global_ptc_lock);
+	return 0;
+}
+
+static void __exit sn2_ptc_exit(void)
+{
+	remove_proc_entry(PTC_BASENAME, NULL);
+}
+
+module_init(sn2_ptc_init);
+module_exit(sn2_ptc_exit);
+#endif /* CONFIG_PROC_FS */
+
diff --git a/include/asm-ia64/sn/nodepda.h b/include/asm-ia64/sn/nodepda.h
index 7138b1eafd6b..47bb8100fd00 100644
--- a/include/asm-ia64/sn/nodepda.h
+++ b/include/asm-ia64/sn/nodepda.h
@@ -37,7 +37,6 @@ struct phys_cpuid {
 
 struct nodepda_s {
 	void 		*pdinfo;	/* Platform-dependent per-node info */
-	spinlock_t		bist_lock;
 
 	/*
 	 * The BTEs on this node are shared by the local cpus
@@ -55,6 +54,8 @@ struct nodepda_s {
 	 * Array of physical cpu identifiers. Indexed by cpuid.
 	 */
 	struct phys_cpuid	phys_cpuid[NR_CPUS];
+	spinlock_t		ptc_lock ____cacheline_aligned_in_smp;
+	spinlock_t		bist_lock;
 };
 
 typedef struct nodepda_s nodepda_t;
-- 
cgit v1.2.3


From e8579e72ca240f3fbaa669f09a9d610436505366 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@hp.com>
Date: Thu, 4 Aug 2005 13:06:00 -0700
Subject: [IA64, X86_64] fix swiotlb sizing

   Fix swiotlb sizing to match what the comments and the kernel
parameters documentation indicate.  Given a default 16k page size kernel
(ia64) and a 2k swiotlb page size, we're off by a multiple of 8 trying
to size the swiotlb.  When specified on the boot line, the swiotlb is
made 8x bigger than requested.  When left to the default value, it's 8x
smaller than the comments indicate.  For x86_64 the multiplier would be
2x.  The patch below fixes this.  Now, what's a good default swiotlb
size?  Apparently we don't really need 64MB.

Signed-off-by: Alex Williamson <alex.williamson@hp.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/lib/swiotlb.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/lib/swiotlb.c b/arch/ia64/lib/swiotlb.c
index ab7b3ad99a7f..dbc0b3e449c5 100644
--- a/arch/ia64/lib/swiotlb.c
+++ b/arch/ia64/lib/swiotlb.c
@@ -93,8 +93,7 @@ static int __init
 setup_io_tlb_npages(char *str)
 {
 	if (isdigit(*str)) {
-		io_tlb_nslabs = simple_strtoul(str, &str, 0) <<
-			(PAGE_SHIFT - IO_TLB_SHIFT);
+		io_tlb_nslabs = simple_strtoul(str, &str, 0);
 		/* avoid tail segment of size < IO_TLB_SEGSIZE */
 		io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
 	}
@@ -117,7 +116,7 @@ swiotlb_init_with_default_size (size_t default_size)
 	unsigned long i;
 
 	if (!io_tlb_nslabs) {
-		io_tlb_nslabs = (default_size >> PAGE_SHIFT);
+		io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
 		io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
 	}
 
-- 
cgit v1.2.3


From a877bd36f7584fd7e4729099348cfc9190ba00aa Mon Sep 17 00:00:00 2001
From: Kumar Gala <kumar.gala@freescale.com>
Date: Wed, 24 Aug 2005 09:53:00 -0700
Subject: [IA64] remove use of asm/segment.h

Removed IA64 architecture specific users of asm/segment.h
The removal of asm-ia64/segment.h itself can wait until all
of the kernel source has been purged of references.

Signed-off-by: Kumar Gala <kumar.gala@freescale.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/ia32/ia32_signal.c        | 1 -
 arch/ia64/pci/pci.c                 | 1 -
 arch/ia64/sn/kernel/sn2/sn_hwperf.c | 1 -
 3 files changed, 3 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/ia32/ia32_signal.c b/arch/ia64/ia32/ia32_signal.c
index ebb89be2aa2d..aa891c9bc9b6 100644
--- a/arch/ia64/ia32/ia32_signal.c
+++ b/arch/ia64/ia32/ia32_signal.c
@@ -29,7 +29,6 @@
 #include <asm/uaccess.h>
 #include <asm/rse.h>
 #include <asm/sigcontext.h>
-#include <asm/segment.h>
 
 #include "ia32priv.h"
 
diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c
index 54d9ed444e4a..0addeca5570e 100644
--- a/arch/ia64/pci/pci.c
+++ b/arch/ia64/pci/pci.c
@@ -24,7 +24,6 @@
 
 #include <asm/machvec.h>
 #include <asm/page.h>
-#include <asm/segment.h>
 #include <asm/system.h>
 #include <asm/io.h>
 #include <asm/sal.h>
diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
index 833e700fdac9..e28214216fee 100644
--- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c
+++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
@@ -36,7 +36,6 @@
 #include <asm/topology.h>
 #include <asm/smp.h>
 #include <asm/semaphore.h>
-#include <asm/segment.h>
 #include <asm/uaccess.h>
 #include <asm/sal.h>
 #include <asm/sn/io.h>
-- 
cgit v1.2.3


From 38d26b9f577ec63ba64926c45f4ee3805ed2041c Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 24 Aug 2005 03:58:00 -0700
Subject: [IA64] arch/ia64/hp/sim/boot/fw-emu.c: remove egcs workaround

Kernel 2.6 doesn't support egcs, and I didn't find any user of this
function.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/hp/sim/boot/fw-emu.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/hp/sim/boot/fw-emu.c b/arch/ia64/hp/sim/boot/fw-emu.c
index 5c46928e3dc6..30fdfb1d0a53 100644
--- a/arch/ia64/hp/sim/boot/fw-emu.c
+++ b/arch/ia64/hp/sim/boot/fw-emu.c
@@ -237,17 +237,6 @@ sal_emulator (long index, unsigned long in1, unsigned long in2,
 	return ((struct sal_ret_values) {status, r9, r10, r11});
 }
 
-
-/*
- * This is here to work around a bug in egcs-1.1.1b that causes the
- * compiler to crash (seems like a bug in the new alias analysis code.
- */
-void *
-id (long addr)
-{
-	return (void *) addr;
-}
-
 struct ia64_boot_param *
 sys_fw_init (const char *args, int arglen)
 {
-- 
cgit v1.2.3


From 0a41e2501160587eb8f66cef3bdf1c6f2cb86997 Mon Sep 17 00:00:00 2001
From: Peter Chubb <peterc@gelato.unsw.edu.au>
Date: Tue, 16 Aug 2005 19:54:00 -0700
Subject: [IA64] Rationalise Region Definitions

Currently, region numbers are defined in several files, with several
names.  For example, we have REGION_KERNEL in asm/page.h and
RGN_KERNEL in pgtable.h

We also have address definitions that should depend on the
RGN_XXX macros, but are currently just long constants.

The following patch reorganises all the definitions so that they have
the same form (RGN_XXX), are in one place, and that addresses that
depend on RGN_XXX are derived from them.

(This is a necessary but not sufficient patch to allow UML-like
operation on IA64).

Thanks to David Mosberger for catching the change I missed in mmu_context.h.

Signed-off-by: Peter Chubb <peterc@gelato.unsw.edu.au>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/kernel/sys_ia64.c    |  2 +-
 arch/ia64/mm/hugetlbpage.c     |  8 ++++----
 include/asm-ia64/io.h          |  2 +-
 include/asm-ia64/mmu_context.h |  7 ++++++-
 include/asm-ia64/page.h        | 27 ++++++++++++++++++---------
 include/asm-ia64/pgtable.h     | 13 +++++--------
 include/asm-ia64/system.h      |  5 +++--
 7 files changed, 38 insertions(+), 26 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c
index 770fab37928e..f2dbcd1db0d4 100644
--- a/arch/ia64/kernel/sys_ia64.c
+++ b/arch/ia64/kernel/sys_ia64.c
@@ -35,7 +35,7 @@ arch_get_unmapped_area (struct file *filp, unsigned long addr, unsigned long len
 		return -ENOMEM;
 
 #ifdef CONFIG_HUGETLB_PAGE
-	if (REGION_NUMBER(addr) == REGION_HPAGE)
+	if (REGION_NUMBER(addr) == RGN_HPAGE)
 		addr = 0;
 #endif
 	if (!addr)
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index e0a776a3044c..2d13889d0a99 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -76,7 +76,7 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
 		return -EINVAL;
 	if (addr & ~HPAGE_MASK)
 		return -EINVAL;
-	if (REGION_NUMBER(addr) != REGION_HPAGE)
+	if (REGION_NUMBER(addr) != RGN_HPAGE)
 		return -EINVAL;
 
 	return 0;
@@ -87,7 +87,7 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int writ
 	struct page *page;
 	pte_t *ptep;
 
-	if (REGION_NUMBER(addr) != REGION_HPAGE)
+	if (REGION_NUMBER(addr) != RGN_HPAGE)
 		return ERR_PTR(-EINVAL);
 
 	ptep = huge_pte_offset(mm, addr);
@@ -142,8 +142,8 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, u
 		return -ENOMEM;
 	if (len & ~HPAGE_MASK)
 		return -EINVAL;
-	/* This code assumes that REGION_HPAGE != 0. */
-	if ((REGION_NUMBER(addr) != REGION_HPAGE) || (addr & (HPAGE_SIZE - 1)))
+	/* This code assumes that RGN_HPAGE != 0. */
+	if ((REGION_NUMBER(addr) != RGN_HPAGE) || (addr & (HPAGE_SIZE - 1)))
 		addr = HPAGE_REGION_BASE;
 	else
 		addr = ALIGN(addr, HPAGE_SIZE);
diff --git a/include/asm-ia64/io.h b/include/asm-ia64/io.h
index 54e7637a326c..3c28eeb0b009 100644
--- a/include/asm-ia64/io.h
+++ b/include/asm-ia64/io.h
@@ -23,7 +23,7 @@
 #define __SLOW_DOWN_IO	do { } while (0)
 #define SLOW_DOWN_IO	do { } while (0)
 
-#define __IA64_UNCACHED_OFFSET	0xc000000000000000UL	/* region 6 */
+#define __IA64_UNCACHED_OFFSET	RGN_BASE(RGN_UNCACHED)
 
 /*
  * The legacy I/O space defined by the ia64 architecture supports only 65536 ports, but
diff --git a/include/asm-ia64/mmu_context.h b/include/asm-ia64/mmu_context.h
index e3e5fededb04..ab60a6a26911 100644
--- a/include/asm-ia64/mmu_context.h
+++ b/include/asm-ia64/mmu_context.h
@@ -19,6 +19,7 @@
 
 #define ia64_rid(ctx,addr)	(((ctx) << 3) | (addr >> 61))
 
+# include <asm/page.h>
 # ifndef __ASSEMBLY__
 
 #include <linux/compiler.h>
@@ -110,7 +111,7 @@ reload_context (mm_context_t context)
 	unsigned long rid_incr = 0;
 	unsigned long rr0, rr1, rr2, rr3, rr4, old_rr4;
 
-	old_rr4 = ia64_get_rr(0x8000000000000000UL);
+	old_rr4 = ia64_get_rr(RGN_BASE(RGN_HPAGE));
 	rid = context << 3;	/* make space for encoding the region number */
 	rid_incr = 1 << 8;
 
@@ -122,6 +123,10 @@ reload_context (mm_context_t context)
 	rr4 = rr0 + 4*rid_incr;
 #ifdef  CONFIG_HUGETLB_PAGE
 	rr4 = (rr4 & (~(0xfcUL))) | (old_rr4 & 0xfc);
+
+#  if RGN_HPAGE != 4
+#    error "reload_context assumes RGN_HPAGE is 4"
+#  endif
 #endif
 
 	ia64_set_rr(0x0000000000000000UL, rr0);
diff --git a/include/asm-ia64/page.h b/include/asm-ia64/page.h
index 08894f73abf0..ec17f9e9da75 100644
--- a/include/asm-ia64/page.h
+++ b/include/asm-ia64/page.h
@@ -12,6 +12,19 @@
 #include <asm/intrinsics.h>
 #include <asm/types.h>
 
+/*
+ * The top three bits of an IA64 address are its Region Number.
+ * Different regions are assigned to different purposes.
+ */
+#define RGN_SHIFT	(61)
+#define RGN_BASE(r)  (__IA64_UL_CONST(r)<<RGN_SHIFT)
+
+#define KHIGH -1	/* high three bits of Kernel virtual address */
+#define RGN_KERNEL	7	/* Identity mapped region */
+#define RGN_UNCACHED    6	/* Identity mapped I/O region */
+#define RGN_GATE	5	/* Gate page, Kernel text, etc */
+#define RGN_HPAGE	4	/* For Huge TLB pages */
+
 /*
  * PAGE_SHIFT determines the actual kernel page size.
  */
@@ -36,10 +49,9 @@
 
 #define RGN_MAP_LIMIT	((1UL << (4*PAGE_SHIFT - 12)) - PAGE_SIZE)	/* per region addr limit */
 
+
 #ifdef CONFIG_HUGETLB_PAGE
-# define REGION_HPAGE		(4UL)	/* note: this is hardcoded in reload_context()!*/
-# define REGION_SHIFT		61
-# define HPAGE_REGION_BASE	(REGION_HPAGE << REGION_SHIFT)
+# define HPAGE_REGION_BASE	RGN_BASE(RGN_HPAGE)
 # define HPAGE_SHIFT		hpage_shift
 # define HPAGE_SHIFT_DEFAULT	28	/* check ia64 SDM for architecture supported size */
 # define HPAGE_SIZE		(__IA64_UL_CONST(1) << HPAGE_SHIFT)
@@ -130,16 +142,13 @@ typedef union ia64_va {
 #define REGION_NUMBER(x)	({ia64_va _v; _v.l = (long) (x); _v.f.reg;})
 #define REGION_OFFSET(x)	({ia64_va _v; _v.l = (long) (x); _v.f.off;})
 
-#define REGION_SIZE		REGION_NUMBER(1)
-#define REGION_KERNEL		7
-
 #ifdef CONFIG_HUGETLB_PAGE
 # define htlbpage_to_page(x)	(((unsigned long) REGION_NUMBER(x) << 61)			\
 				 | (REGION_OFFSET(x) >> (HPAGE_SHIFT-PAGE_SHIFT)))
 # define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
 # define is_hugepage_only_range(mm, addr, len)		\
-	 (REGION_NUMBER(addr) == REGION_HPAGE &&	\
-	  REGION_NUMBER((addr)+(len)-1) == REGION_HPAGE)
+	 (REGION_NUMBER(addr) == RGN_HPAGE &&	\
+	  REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE)
 extern unsigned int hpage_shift;
 #endif
 
@@ -197,7 +206,7 @@ get_order (unsigned long size)
 # define __pgprot(x)	(x)
 #endif /* !STRICT_MM_TYPECHECKS */
 
-#define PAGE_OFFSET			__IA64_UL_CONST(0xe000000000000000)
+#define PAGE_OFFSET			RGN_BASE(RGN_KERNEL)
 
 #define VM_DATA_DEFAULT_FLAGS		(VM_READ | VM_WRITE |					\
 					 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC |		\
diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h
index 48586e08f432..2e34c06e6777 100644
--- a/include/asm-ia64/pgtable.h
+++ b/include/asm-ia64/pgtable.h
@@ -204,21 +204,18 @@ ia64_phys_addr_valid (unsigned long addr)
 #define set_pte(ptep, pteval)	(*(ptep) = (pteval))
 #define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval)
 
-#define RGN_SIZE	(1UL << 61)
-#define RGN_KERNEL	7
-
-#define VMALLOC_START		0xa000000200000000UL
+#define VMALLOC_START		(RGN_BASE(RGN_GATE) + 0x200000000UL)
 #ifdef CONFIG_VIRTUAL_MEM_MAP
-# define VMALLOC_END_INIT	(0xa000000000000000UL + (1UL << (4*PAGE_SHIFT - 9)))
+# define VMALLOC_END_INIT	(RGN_BASE(RGN_GATE) + (1UL << (4*PAGE_SHIFT - 9)))
 # define VMALLOC_END		vmalloc_end
   extern unsigned long vmalloc_end;
 #else
-# define VMALLOC_END		(0xa000000000000000UL + (1UL << (4*PAGE_SHIFT - 9)))
+# define VMALLOC_END		(RGN_BASE(RGN_GATE) + (1UL << (4*PAGE_SHIFT - 9)))
 #endif
 
 /* fs/proc/kcore.c */
-#define	kc_vaddr_to_offset(v) ((v) - 0xa000000000000000UL)
-#define	kc_offset_to_vaddr(o) ((o) + 0xa000000000000000UL)
+#define	kc_vaddr_to_offset(v) ((v) - RGN_BASE(RGN_GATE))
+#define	kc_offset_to_vaddr(o) ((o) + RGN_BASE(RGN_GATE))
 
 /*
  * Conversion functions: convert page frame number (pfn) and a protection value to a page
diff --git a/include/asm-ia64/system.h b/include/asm-ia64/system.h
index cd2cf76b2db1..33256db4a7cf 100644
--- a/include/asm-ia64/system.h
+++ b/include/asm-ia64/system.h
@@ -19,12 +19,13 @@
 #include <asm/pal.h>
 #include <asm/percpu.h>
 
-#define GATE_ADDR		__IA64_UL_CONST(0xa000000000000000)
+#define GATE_ADDR		RGN_BASE(RGN_GATE)
+
 /*
  * 0xa000000000000000+2*PERCPU_PAGE_SIZE
  * - 0xa000000000000000+3*PERCPU_PAGE_SIZE remain unmapped (guard page)
  */
-#define KERNEL_START		 __IA64_UL_CONST(0xa000000100000000)
+#define KERNEL_START		 (GATE_ADDR+0x100000000)
 #define PERCPU_ADDR		(-PERCPU_PAGE_SIZE)
 
 #ifndef __ASSEMBLY__
-- 
cgit v1.2.3


From ecc3c30ae39c4d3cbf249a1ebd599465e0e25708 Mon Sep 17 00:00:00 2001
From: Mark Goodwin <markgw@sgi.com>
Date: Tue, 16 Aug 2005 00:50:00 -0700
Subject: [IA64] - SGI SN hwperf enhancements - export_pci_topology

Bugfix to export PCI topology information in /proc/sgi_sn/sn_topology.

Signed-off-by: Mark Goodwin <markgw@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/kernel/sn2/sn_hwperf.c | 50 ++++++++++++-------------------------
 include/asm-ia64/sn/sn_sal.h        |  9 +++----
 2 files changed, 20 insertions(+), 39 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
index 833e700fdac9..58e48fdf5b47 100644
--- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c
+++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
@@ -174,29 +174,22 @@ static const char *sn_hwperf_get_slabname(struct sn_hwperf_object_info *obj,
 	return slabname;
 }
 
-static void print_pci_topology(struct seq_file *s,
-	struct sn_hwperf_object_info *obj, int *ordinal,
-	u64 rack, u64 bay, u64 slot, u64 slab)
+static void print_pci_topology(struct seq_file *s)
 {
-	char *p1;
-	char *p2;
-	char *pg;
-
-	if (!(pg = (char *)get_zeroed_page(GFP_KERNEL)))
-		return; /* ignore */
-	if (ia64_sn_ioif_get_pci_topology(rack, bay, slot, slab,
-		__pa(pg), PAGE_SIZE) == SN_HWPERF_OP_OK) {
-		for (p1=pg; *p1 && p1 < pg + PAGE_SIZE;) {
-			if (!(p2 = strchr(p1, '\n')))
-				break;
-			*p2 = '\0';
-			seq_printf(s, "pcibus %d %s-%s\n",
-				*ordinal, obj->location, p1);
-			(*ordinal)++;
-			p1 = p2 + 1;
-		}
+	char *p;
+	size_t sz;
+	int e;
+
+	for (sz = PAGE_SIZE; sz < 16 * PAGE_SIZE; sz += PAGE_SIZE) {
+		if (!(p = (char *)kmalloc(sz, GFP_KERNEL)))
+			break;
+		e = ia64_sn_ioif_get_pci_topology(__pa(p), sz);
+		if (e == SALRET_OK)
+			seq_puts(s, p);
+		kfree(p);
+		if (e == SALRET_OK || e == SALRET_NOT_IMPLEMENTED)
+			break;
 	}
-	free_page((unsigned long)pg);
 }
 
 static int sn_topology_show(struct seq_file *s, void *d)
@@ -215,7 +208,6 @@ static int sn_topology_show(struct seq_file *s, void *d)
 	struct sn_hwperf_object_info *p;
 	struct sn_hwperf_object_info *obj = d;	/* this object */
 	struct sn_hwperf_object_info *objs = s->private; /* all objects */
-	int rack, bay, slot, slab;
 	u8 shubtype;
 	u8 system_size;
 	u8 sharing_size;
@@ -225,7 +217,6 @@ static int sn_topology_show(struct seq_file *s, void *d)
 	u8 region_size;
 	u16 nasid_mask;
 	int nasid_msb;
-	int pci_bus_ordinal = 0;
 
 	if (obj == objs) {
 		seq_printf(s, "# sn_topology version 2\n");
@@ -253,6 +244,8 @@ static int sn_topology_show(struct seq_file *s, void *d)
 			shubtype ? "shub2" : "shub1", 
 			(u64)nasid_mask << nasid_shift, nasid_msb, nasid_shift,
 			system_size, sharing_size, coher, region_size);
+
+		print_pci_topology(s);
 	}
 
 	if (SN_HWPERF_FOREIGN(obj)) {
@@ -300,17 +293,6 @@ static int sn_topology_show(struct seq_file *s, void *d)
 				seq_putc(s, '\n');
 			}
 		}
-
-		/*
-		 * PCI busses attached to this node, if any
-		 */
-		if (sn_hwperf_location_to_bpos(obj->location,
-			&rack, &bay, &slot, &slab)) {
-			/* export pci bus info */
-			print_pci_topology(s, obj, &pci_bus_ordinal,
-				rack, bay, slot, slab);
-
-		}
 	}
 
 	if (obj->ports) {
diff --git a/include/asm-ia64/sn/sn_sal.h b/include/asm-ia64/sn/sn_sal.h
index 99cb9ed34312..02d16e34fd0c 100644
--- a/include/asm-ia64/sn/sn_sal.h
+++ b/include/asm-ia64/sn/sn_sal.h
@@ -78,7 +78,8 @@
 
 #define SN_SAL_HUB_ERROR_INTERRUPT		   0x02000060
 #define SN_SAL_BTE_RECOVER			   0x02000061
-#define SN_SAL_IOIF_GET_PCI_TOPOLOGY	           0x02000062
+#define SN_SAL_RESERVED_DO_NOT_USE		   0x02000062
+#define SN_SAL_IOIF_GET_PCI_TOPOLOGY		   0x02000064
 
 /*
  * Service-specific constants
@@ -1069,12 +1070,10 @@ ia64_sn_hwperf_op(nasid_t nasid, u64 opcode, u64 a0, u64 a1, u64 a2,
 }
 
 static inline int
-ia64_sn_ioif_get_pci_topology(u64 rack, u64 bay, u64 slot, u64 slab,
-			      u64 buf, u64 len)
+ia64_sn_ioif_get_pci_topology(u64 buf, u64 len)
 {
 	struct ia64_sal_retval rv;
-	SAL_CALL_NOLOCK(rv, SN_SAL_IOIF_GET_PCI_TOPOLOGY,
-		rack, bay, slot, slab, buf, len, 0);
+	SAL_CALL_NOLOCK(rv, SN_SAL_IOIF_GET_PCI_TOPOLOGY, buf, len, 0, 0, 0, 0, 0);
 	return (int) rv.status;
 }
 
-- 
cgit v1.2.3


From 60a3ba0bb45995ecf9cfe208527d7cfd6128d053 Mon Sep 17 00:00:00 2001
From: Mark Goodwin <markgw@sgi.com>
Date: Wed, 17 Aug 2005 15:17:00 -0700
Subject: [IA64] - SGI SN hwperf enhancements -

Add a new exported function for determining the nearest node
with CPUs for I/O nodes and fix a bug where the hwperf dynamic
misc device was being registered before misc_init().

Signed-off-by: Mark Goodwin <markgw@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/kernel/sn2/sn_hwperf.c | 266 +++++++++++++++++++++++++++++++++---
 include/asm-ia64/sn/sn2/sn_hwperf.h |  10 ++
 2 files changed, 256 insertions(+), 20 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
index 58e48fdf5b47..0261452d08df 100644
--- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c
+++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
@@ -59,7 +59,7 @@ static int sn_hwperf_enum_objects(int *nobj, struct sn_hwperf_object_info **ret)
 	struct sn_hwperf_object_info *objbuf = NULL;
 
 	if ((e = sn_hwperf_init()) < 0) {
-		printk("sn_hwperf_init failed: err %d\n", e);
+		printk(KERN_ERR "sn_hwperf_init failed: err %d\n", e);
 		goto out;
 	}
 
@@ -111,7 +111,7 @@ static int sn_hwperf_geoid_to_cnode(char *location)
 	if (sn_hwperf_location_to_bpos(location, &rack, &bay, &slot, &slab))
 		return -1;
 
-	for (cnode = 0; cnode < numionodes; cnode++) {
+	for_each_node(cnode) {
 		geoid = cnodeid_get_geoid(cnode);
 		module_id = geo_module(geoid);
 		this_rack = MODULE_GET_RACK(module_id);
@@ -124,11 +124,13 @@ static int sn_hwperf_geoid_to_cnode(char *location)
 		}
 	}
 
-	return cnode < numionodes ? cnode : -1;
+	return node_possible(cnode) ? cnode : -1;
 }
 
 static int sn_hwperf_obj_to_cnode(struct sn_hwperf_object_info * obj)
 {
+	if (!SN_HWPERF_IS_NODE(obj) && !SN_HWPERF_IS_IONODE(obj))
+		BUG();
 	if (!obj->sn_hwp_this_part)
 		return -1;
 	return sn_hwperf_geoid_to_cnode(obj->location);
@@ -192,6 +194,181 @@ static void print_pci_topology(struct seq_file *s)
 	}
 }
 
+static inline int sn_hwperf_has_cpus(cnodeid_t node)
+{
+	return node_online(node) && nr_cpus_node(node);
+}
+
+static inline int sn_hwperf_has_mem(cnodeid_t node)
+{
+	return node_online(node) && NODE_DATA(node)->node_present_pages;
+}
+
+static struct sn_hwperf_object_info *
+sn_hwperf_findobj_id(struct sn_hwperf_object_info *objbuf,
+	int nobj, int id)
+{
+	int i;
+	struct sn_hwperf_object_info *p = objbuf;
+
+	for (i=0; i < nobj; i++, p++) {
+		if (p->id == id)
+			return p;
+	}
+
+	return NULL;
+
+}
+
+static int sn_hwperf_get_nearest_node_objdata(struct sn_hwperf_object_info *objbuf,
+	int nobj, cnodeid_t node, cnodeid_t *near_mem_node, cnodeid_t *near_cpu_node)
+{
+	int e;
+	struct sn_hwperf_object_info *nodeobj = NULL;
+	struct sn_hwperf_object_info *op;
+	struct sn_hwperf_object_info *dest;
+	struct sn_hwperf_object_info *router;
+	struct sn_hwperf_port_info ptdata[16];
+	int sz, i, j;
+	cnodeid_t c;
+	int found_mem = 0;
+	int found_cpu = 0;
+
+	if (!node_possible(node))
+		return -EINVAL;
+
+	if (sn_hwperf_has_cpus(node)) {
+		if (near_cpu_node)
+			*near_cpu_node = node;
+		found_cpu++;
+	}
+
+	if (sn_hwperf_has_mem(node)) {
+		if (near_mem_node)
+			*near_mem_node = node;
+		found_mem++;
+	}
+
+	if (found_cpu && found_mem)
+		return 0; /* trivially successful */
+
+	/* find the argument node object */
+	for (i=0, op=objbuf; i < nobj; i++, op++) {
+		if (!SN_HWPERF_IS_NODE(op) && !SN_HWPERF_IS_IONODE(op))
+			continue;
+		if (node == sn_hwperf_obj_to_cnode(op)) {
+			nodeobj = op;
+			break;
+		}
+	}
+	if (!nodeobj) {
+		e = -ENOENT;
+		goto err;
+	}
+
+	/* get it's interconnect topology */
+	sz = op->ports * sizeof(struct sn_hwperf_port_info);
+	if (sz > sizeof(ptdata))
+		BUG();
+	e = ia64_sn_hwperf_op(sn_hwperf_master_nasid,
+			      SN_HWPERF_ENUM_PORTS, nodeobj->id, sz,
+			      (u64)&ptdata, 0, 0, NULL);
+	if (e != SN_HWPERF_OP_OK) {
+		e = -EINVAL;
+		goto err;
+	}
+
+	/* find nearest node with cpus and nearest memory */
+	for (router=NULL, j=0; j < op->ports; j++) {
+		dest = sn_hwperf_findobj_id(objbuf, nobj, ptdata[j].conn_id);
+		if (!dest || SN_HWPERF_FOREIGN(dest) ||
+		    !SN_HWPERF_IS_NODE(dest) || SN_HWPERF_IS_IONODE(dest)) {
+			continue;
+		}
+		c = sn_hwperf_obj_to_cnode(dest);
+		if (!found_cpu && sn_hwperf_has_cpus(c)) {
+			if (near_cpu_node)
+				*near_cpu_node = c;
+			found_cpu++;
+		}
+		if (!found_mem && sn_hwperf_has_mem(c)) {
+			if (near_mem_node)
+				*near_mem_node = c;
+			found_mem++;
+		}
+		if (SN_HWPERF_IS_ROUTER(dest))
+			router = dest;
+	}
+
+	if (router && (!found_cpu || !found_mem)) {
+		/* search for a node connected to the same router */
+		sz = router->ports * sizeof(struct sn_hwperf_port_info);
+		if (sz > sizeof(ptdata))
+			BUG();
+		e = ia64_sn_hwperf_op(sn_hwperf_master_nasid,
+				      SN_HWPERF_ENUM_PORTS, router->id, sz,
+				      (u64)&ptdata, 0, 0, NULL);
+		if (e != SN_HWPERF_OP_OK) {
+			e = -EINVAL;
+			goto err;
+		}
+		for (j=0; j < router->ports; j++) {
+			dest = sn_hwperf_findobj_id(objbuf, nobj,
+				ptdata[j].conn_id);
+			if (!dest || dest->id == node ||
+			    SN_HWPERF_FOREIGN(dest) ||
+			    !SN_HWPERF_IS_NODE(dest) ||
+			    SN_HWPERF_IS_IONODE(dest)) {
+				continue;
+			}
+			c = sn_hwperf_obj_to_cnode(dest);
+			if (!found_cpu && sn_hwperf_has_cpus(c)) {
+				if (near_cpu_node)
+					*near_cpu_node = c;
+				found_cpu++;
+			}
+			if (!found_mem && sn_hwperf_has_mem(c)) {
+				if (near_mem_node)
+					*near_mem_node = c;
+				found_mem++;
+			}
+			if (found_cpu && found_mem)
+				break;
+		}
+	}
+
+	if (!found_cpu || !found_mem) {
+		/* resort to _any_ node with CPUs and memory */
+		for (i=0, op=objbuf; i < nobj; i++, op++) {
+			if (SN_HWPERF_FOREIGN(op) ||
+			    SN_HWPERF_IS_IONODE(op) ||
+			    !SN_HWPERF_IS_NODE(op)) {
+				continue;
+			}
+			c = sn_hwperf_obj_to_cnode(op);
+			if (!found_cpu && sn_hwperf_has_cpus(c)) {
+				if (near_cpu_node)
+					*near_cpu_node = c;
+				found_cpu++;
+			}
+			if (!found_mem && sn_hwperf_has_mem(c)) {
+				if (near_mem_node)
+					*near_mem_node = c;
+				found_mem++;
+			}
+			if (found_cpu && found_mem)
+				break;
+		}
+	}
+
+	if (!found_cpu || !found_mem)
+		e = -ENODATA;
+
+err:
+	return e;
+}
+
+
 static int sn_topology_show(struct seq_file *s, void *d)
 {
 	int sz;
@@ -265,11 +442,24 @@ static int sn_topology_show(struct seq_file *s, void *d)
 	if (!SN_HWPERF_IS_NODE(obj) && !SN_HWPERF_IS_IONODE(obj))
 		seq_putc(s, '\n');
 	else {
+		cnodeid_t near_mem = -1;
+		cnodeid_t near_cpu = -1;
+
 		seq_printf(s, ", nasid 0x%x", cnodeid_to_nasid(ordinal));
-		for (i=0; i < numionodes; i++) {
-			seq_printf(s, i ? ":%d" : ", dist %d",
-				node_distance(ordinal, i));
+
+		if (sn_hwperf_get_nearest_node_objdata(objs, sn_hwperf_obj_cnt,
+			ordinal, &near_mem, &near_cpu) == 0) {
+			seq_printf(s, ", near_mem_nodeid %d, near_cpu_nodeid %d",
+				near_mem, near_cpu);
+		}
+
+		if (!SN_HWPERF_IS_IONODE(obj)) {
+			for_each_online_node(i) {
+				seq_printf(s, i ? ":%d" : ", dist %d",
+					node_distance(ordinal, i));
+			}
 		}
+
 		seq_putc(s, '\n');
 
 		/*
@@ -554,6 +744,8 @@ sn_hwperf_ioctl(struct inode *in, struct file *fp, u32 op, u64 arg)
 		if ((r = sn_hwperf_enum_objects(&nobj, &objs)) == 0) {
 			memset(p, 0, a.sz);
 			for (i = 0; i < nobj; i++) {
+				if (!SN_HWPERF_IS_NODE(objs + i))
+					continue;
 				node = sn_hwperf_obj_to_cnode(objs + i);
 				for_each_online_cpu(j) {
 					if (node != cpu_to_node(j))
@@ -580,7 +772,7 @@ sn_hwperf_ioctl(struct inode *in, struct file *fp, u32 op, u64 arg)
 
 	case SN_HWPERF_GET_NODE_NASID:
 		if (a.sz != sizeof(u64) ||
-		   (node = a.arg) < 0 || node >= numionodes) {
+		   (node = a.arg) < 0 || !node_possible(node)) {
 			r = -EINVAL;
 			goto error;
 		}
@@ -609,6 +801,14 @@ sn_hwperf_ioctl(struct inode *in, struct file *fp, u32 op, u64 arg)
 				vfree(objs);
 				goto error;
 			}
+
+			if (!SN_HWPERF_IS_NODE(objs + i) &&
+			    !SN_HWPERF_IS_IONODE(objs + i)) {
+			    	r = -ENOENT;
+				vfree(objs);
+				goto error;
+			}
+
 			*(u64 *)p = (u64)sn_hwperf_obj_to_cnode(objs + i);
 			vfree(objs);
 		}
@@ -674,6 +874,7 @@ static int sn_hwperf_init(void)
 
 	/* single threaded, once-only initialization */
 	down(&sn_hwperf_init_mutex);
+
 	if (sn_hwperf_salheap) {
 		up(&sn_hwperf_init_mutex);
 		return e;
@@ -724,19 +925,6 @@ out:
 		sn_hwperf_salheap = NULL;
 		sn_hwperf_obj_cnt = 0;
 	}
-
-	if (!e) {
-		/*
-		 * Register a dynamic misc device for ioctl. Platforms
-		 * supporting hotplug will create /dev/sn_hwperf, else
-		 * user can to look up the minor number in /proc/misc.
-		 */
-		if ((e = misc_register(&sn_hwperf_dev)) != 0) {
-			printk(KERN_ERR "sn_hwperf_init: misc register "
-			       "for \"sn_hwperf\" failed, err %d\n", e);
-		}
-	}
-
 	up(&sn_hwperf_init_mutex);
 	return e;
 }
@@ -764,3 +952,41 @@ int sn_topology_release(struct inode *inode, struct file *file)
 	vfree(seq->private);
 	return seq_release(inode, file);
 }
+
+int sn_hwperf_get_nearest_node(cnodeid_t node,
+	cnodeid_t *near_mem_node, cnodeid_t *near_cpu_node)
+{
+	int e;
+	int nobj;
+	struct sn_hwperf_object_info *objbuf;
+
+	if ((e = sn_hwperf_enum_objects(&nobj, &objbuf)) == 0) {
+		e = sn_hwperf_get_nearest_node_objdata(objbuf, nobj,
+			node, near_mem_node, near_cpu_node);
+		vfree(objbuf);
+	}
+
+	return e;
+}
+
+static int __devinit sn_hwperf_misc_register_init(void)
+{
+	int e;
+
+	sn_hwperf_init();
+
+	/*
+	 * Register a dynamic misc device for hwperf ioctls. Platforms
+	 * supporting hotplug will create /dev/sn_hwperf, else user
+	 * can to look up the minor number in /proc/misc.
+	 */
+	if ((e = misc_register(&sn_hwperf_dev)) != 0) {
+		printk(KERN_ERR "sn_hwperf_misc_register_init: failed to "
+		"register misc device for \"%s\"\n", sn_hwperf_dev.name);
+	}
+
+	return e;
+}
+
+device_initcall(sn_hwperf_misc_register_init); /* after misc_init() */
+EXPORT_SYMBOL(sn_hwperf_get_nearest_node);
diff --git a/include/asm-ia64/sn/sn2/sn_hwperf.h b/include/asm-ia64/sn/sn2/sn_hwperf.h
index df75f4c4aec3..291ef3d69da2 100644
--- a/include/asm-ia64/sn/sn2/sn_hwperf.h
+++ b/include/asm-ia64/sn/sn2/sn_hwperf.h
@@ -43,6 +43,7 @@ struct sn_hwperf_object_info {
 
 /* macros for object classification */
 #define SN_HWPERF_IS_NODE(x)		((x) && strstr((x)->name, "SHub"))
+#define SN_HWPERF_IS_NODE_SHUB2(x)	((x) && strstr((x)->name, "SHub 2."))
 #define SN_HWPERF_IS_IONODE(x)		((x) && strstr((x)->name, "TIO"))
 #define SN_HWPERF_IS_ROUTER(x)		((x) && strstr((x)->name, "Router"))
 #define SN_HWPERF_IS_NL3ROUTER(x)	((x) && strstr((x)->name, "NL3Router"))
@@ -214,6 +215,15 @@ struct sn_hwperf_ioctl_args {
  */
 #define SN_HWPERF_GET_NODE_NASID	(102|SN_HWPERF_OP_MEM_COPYOUT)
 
+/*
+ * Given a node id, determine the id of the nearest node with CPUs
+ * and the id of the nearest node that has memory. The argument
+ * node would normally be a "headless" node, e.g. an "IO node".
+ * Return 0 on success.
+ */
+extern int sn_hwperf_get_nearest_node(cnodeid_t node,
+	cnodeid_t *near_mem, cnodeid_t *near_cpu);
+
 /* return codes */
 #define SN_HWPERF_OP_OK			0
 #define SN_HWPERF_OP_NOMEM		1
-- 
cgit v1.2.3


From 5390970d1c11b6d5d6a8253a718ed100e2178e14 Mon Sep 17 00:00:00 2001
From: Mark Goodwin <markgw@sgi.com>
Date: Tue, 16 Aug 2005 00:51:00 -0700
Subject: [IA64] - SGI SN hwperf enhancements -

Update the SN pci device info to use the nearest node function
to allocate driver memory on the nearest node (rather than
defaulting to node 0).

Signed-off-by: Mark Goodwin <markgw@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/pci/pcibr/pcibr_provider.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/sn/pci/pcibr/pcibr_provider.c b/arch/ia64/sn/pci/pcibr/pcibr_provider.c
index ff9c7de925d1..5862a709adf5 100644
--- a/arch/ia64/sn/pci/pcibr/pcibr_provider.c
+++ b/arch/ia64/sn/pci/pcibr/pcibr_provider.c
@@ -15,6 +15,7 @@
 #include <asm/sn/pcibus_provider_defs.h>
 #include <asm/sn/pcidev.h>
 #include <asm/sn/sn_sal.h>
+#include <asm/sn/sn2/sn_hwperf.h>
 #include "xtalk/xwidgetdev.h"
 #include "xtalk/hubdev.h"
 
@@ -88,6 +89,7 @@ void *
 pcibr_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *controller)
 {
 	int nasid, cnode, j;
+	cnodeid_t near_cnode;
 	struct hubdev_info *hubdev_info;
 	struct pcibus_info *soft;
 	struct sn_flush_device_list *sn_flush_device_list;
@@ -161,12 +163,18 @@ pcibr_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *cont
 	memset(soft->pbi_int_ate_resource.ate, 0,
  	       (soft->pbi_int_ate_size * sizeof(uint64_t)));
 
-	if (prom_bussoft->bs_asic_type == PCIIO_ASIC_TYPE_TIOCP)
-		/*
-		 * TIO PCI Bridge with no closest node information.
-		 * FIXME: Find another way to determine the closest node
-		 */
-		controller->node = -1;
+	if (prom_bussoft->bs_asic_type == PCIIO_ASIC_TYPE_TIOCP) {
+		/* TIO PCI Bridge: find nearest node with CPUs */
+		int e = sn_hwperf_get_nearest_node(cnode, NULL, &near_cnode);
+
+		if (e < 0) {
+			near_cnode = (cnodeid_t)-1; /* use any node */
+			printk(KERN_WARNING "pcibr_bus_fixup: failed to find "
+				"near node with CPUs to TIO node %d, err=%d\n",
+				cnode, e);
+		}
+		controller->node = near_cnode;
+	}
 	else
 		controller->node = cnode;
 	return soft;
-- 
cgit v1.2.3


From 5b9021bc5800796e23e4994f8cf2dc61536be0a7 Mon Sep 17 00:00:00 2001
From: Russ Anderson <(rja@sgi.com)>
Date: Wed, 17 Aug 2005 10:00:00 -0700
Subject: [IA64] SGI SN remove redundant partition SAL call

Clean up of SGI SN partitioning related code.
The SN_SAL_GET_SN_INFO SAL call returns the partition ID, making
the SN_SAL_SYSCTL_PARTITION_GET SAL call redundant.  Remove sn_partid
and use sn_partition_id.

Signed-off-by: Russ Anderson (rja@sgi.com)
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/kernel/setup.c          |  2 --
 arch/ia64/sn/kernel/sn2/sn_proc_fs.c |  4 +--
 include/asm-ia64/sn/geo.h            |  3 +--
 include/asm-ia64/sn/sn_sal.h         | 48 ------------------------------------
 4 files changed, 3 insertions(+), 54 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c
index 6648deb778a6..a594aca959e6 100644
--- a/arch/ia64/sn/kernel/setup.c
+++ b/arch/ia64/sn/kernel/setup.c
@@ -80,8 +80,6 @@ EXPORT_PER_CPU_SYMBOL(__sn_cnodeid_to_nasid);
 DEFINE_PER_CPU(struct nodepda_s *, __sn_nodepda);
 EXPORT_PER_CPU_SYMBOL(__sn_nodepda);
 
-partid_t sn_partid = -1;
-EXPORT_SYMBOL(sn_partid);
 char sn_system_serial_number_string[128];
 EXPORT_SYMBOL(sn_system_serial_number_string);
 u64 sn_partition_serial_number;
diff --git a/arch/ia64/sn/kernel/sn2/sn_proc_fs.c b/arch/ia64/sn/kernel/sn2/sn_proc_fs.c
index 6a80fca807b9..51bf82720d99 100644
--- a/arch/ia64/sn/kernel/sn2/sn_proc_fs.c
+++ b/arch/ia64/sn/kernel/sn2/sn_proc_fs.c
@@ -3,7 +3,7 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  *
- * Copyright (C) 2000-2004 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2000-2005 Silicon Graphics, Inc. All rights reserved.
  */
 #include <linux/config.h>
 #include <asm/uaccess.h>
@@ -15,7 +15,7 @@
 
 static int partition_id_show(struct seq_file *s, void *p)
 {
-	seq_printf(s, "%d\n", sn_local_partid());
+	seq_printf(s, "%d\n", sn_partition_id);
 	return 0;
 }
 
diff --git a/include/asm-ia64/sn/geo.h b/include/asm-ia64/sn/geo.h
index 84b254603b8d..f083c9434066 100644
--- a/include/asm-ia64/sn/geo.h
+++ b/include/asm-ia64/sn/geo.h
@@ -3,7 +3,7 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  *
- * Copyright (C) 1992 - 1997, 2000-2004 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 1992 - 1997, 2000-2005 Silicon Graphics, Inc. All rights reserved.
  */
 
 #ifndef _ASM_IA64_SN_GEO_H
@@ -108,7 +108,6 @@ typedef union geoid_u {
 #define INVALID_SLAB            (slabid_t)-1
 #define INVALID_SLOT            (slotid_t)-1
 #define INVALID_MODULE          ((moduleid_t)-1)
-#define INVALID_PARTID          ((partid_t)-1)
 
 static inline slabid_t geo_slab(geoid_t g)
 {
diff --git a/include/asm-ia64/sn/sn_sal.h b/include/asm-ia64/sn/sn_sal.h
index 02d16e34fd0c..e67825ad1930 100644
--- a/include/asm-ia64/sn/sn_sal.h
+++ b/include/asm-ia64/sn/sn_sal.h
@@ -55,7 +55,6 @@
 #define  SN_SAL_BUS_CONFIG		   	   0x02000037
 #define  SN_SAL_SYS_SERIAL_GET			   0x02000038
 #define  SN_SAL_PARTITION_SERIAL_GET		   0x02000039
-#define  SN_SAL_SYSCTL_PARTITION_GET		   0x0200003a
 #define  SN_SAL_SYSTEM_POWER_DOWN		   0x0200003b
 #define  SN_SAL_GET_MASTER_BASEIO_NASID		   0x0200003c
 #define  SN_SAL_COHERENCE                          0x0200003d
@@ -586,35 +585,6 @@ sn_partition_serial_number_val(void) {
 	return sn_partition_serial_number;
 }
 
-/*
- * Returns the partition id of the nasid passed in as an argument,
- * or INVALID_PARTID if the partition id cannot be retrieved.
- */
-static inline partid_t
-ia64_sn_sysctl_partition_get(nasid_t nasid)
-{
-	struct ia64_sal_retval ret_stuff;
-	ia64_sal_oemcall_nolock(&ret_stuff, SN_SAL_SYSCTL_PARTITION_GET, nasid,
-				0, 0, 0, 0, 0, 0);
-	if (ret_stuff.status != 0)
-	    return INVALID_PARTID;
-	return ((partid_t)ret_stuff.v0);
-}
-
-/*
- * Returns the partition id of the current processor.
- */
-
-extern partid_t sn_partid;
-
-static inline partid_t
-sn_local_partid(void) {
-	if (unlikely(sn_partid < 0)) {
-		sn_partid = ia64_sn_sysctl_partition_get(cpuid_to_nasid(smp_processor_id()));
-	}
-	return sn_partid;
-}
-
 /*
  * Returns the physical address of the partition's reserved page through
  * an iterative number of calls.
@@ -1020,24 +990,6 @@ ia64_sn_get_sn_info(int fc, u8 *shubtype, u16 *nasid_bitmask, u8 *nasid_shift,
 	ret_stuff.v2 = 0;
 	SAL_CALL_NOLOCK(ret_stuff, SN_SAL_GET_SN_INFO, fc, 0, 0, 0, 0, 0, 0);
 
-/***** BEGIN HACK - temp til old proms no longer supported ********/
-	if (ret_stuff.status == SALRET_NOT_IMPLEMENTED) {
-		int nasid = get_sapicid() & 0xfff;;
-#define SH_SHUB_ID_NODES_PER_BIT_MASK 0x001f000000000000UL                                               
-#define SH_SHUB_ID_NODES_PER_BIT_SHFT 48                                                               
-		if (shubtype) *shubtype = 0;
-		if (nasid_bitmask) *nasid_bitmask = 0x7ff;
-		if (nasid_shift) *nasid_shift = 38;
-		if (systemsize) *systemsize = 11;
-		if (sharing_domain_size) *sharing_domain_size = 9;
-		if (partid) *partid = ia64_sn_sysctl_partition_get(nasid);
-		if (coher) *coher = nasid >> 9;
-		if (reg) *reg = (HUB_L((u64 *) LOCAL_MMR_ADDR(SH1_SHUB_ID)) & SH_SHUB_ID_NODES_PER_BIT_MASK) >>
-			SH_SHUB_ID_NODES_PER_BIT_SHFT;
-		return 0;
-	}
-/***** END HACK *******/
-
 	if (ret_stuff.status < 0)
 		return ret_stuff.status;
 
-- 
cgit v1.2.3


From 8409668b561fbe464f7a392e8dc77eca225d27ac Mon Sep 17 00:00:00 2001
From: Mark Maule <maule@sgi.com>
Date: Thu, 25 Aug 2005 11:45:00 -0700
Subject: [IA64] altix: Abstract irq_affinity at the sn pci provider

Altix patch to abstract irq_affinity down to the pci provider level since
different SGI hardware implements this in different ways.

Signed-off-by: Mark Maule <maule@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/kernel/irq.c                  |  6 +++--
 arch/ia64/sn/pci/pcibr/pcibr_provider.c    |  5 ++--
 arch/ia64/sn/pci/tioca_provider.c          |  3 ++-
 arch/ia64/sn/pci/tioce_provider.c          | 40 +++++++++++++++++++++++++++++-
 include/asm-ia64/sn/pcibus_provider_defs.h |  1 +
 5 files changed, 49 insertions(+), 6 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c
index 607938c288bb..9fc74631ba8a 100644
--- a/arch/ia64/sn/kernel/irq.c
+++ b/arch/ia64/sn/kernel/irq.c
@@ -127,6 +127,7 @@ static void sn_set_affinity_irq(unsigned int irq, cpumask_t mask)
 		int local_widget, status;
 		nasid_t local_nasid;
 		struct sn_irq_info *new_irq_info;
+		struct sn_pcibus_provider *pci_provider;
 
 		new_irq_info = kmalloc(sizeof(struct sn_irq_info), GFP_ATOMIC);
 		if (new_irq_info == NULL)
@@ -166,8 +167,9 @@ static void sn_set_affinity_irq(unsigned int irq, cpumask_t mask)
 		new_irq_info->irq_cpuid = cpuid;
 		register_intr_pda(new_irq_info);
 
-		if (IS_PCI_BRIDGE_ASIC(new_irq_info->irq_bridge_type))
-			pcibr_change_devices_irq(new_irq_info);
+		pci_provider = sn_pci_provider[new_irq_info->irq_bridge_type];
+		if (pci_provider && pci_provider->target_interrupt)
+			(pci_provider->target_interrupt)(new_irq_info);
 
 		spin_lock(&sn_irq_info_lock);
 		list_replace_rcu(&sn_irq_info->list, &new_irq_info->list);
diff --git a/arch/ia64/sn/pci/pcibr/pcibr_provider.c b/arch/ia64/sn/pci/pcibr/pcibr_provider.c
index 5862a709adf5..7b03b8084ffc 100644
--- a/arch/ia64/sn/pci/pcibr/pcibr_provider.c
+++ b/arch/ia64/sn/pci/pcibr/pcibr_provider.c
@@ -198,7 +198,7 @@ void pcibr_force_interrupt(struct sn_irq_info *sn_irq_info)
 	}
 }
 
-void pcibr_change_devices_irq(struct sn_irq_info *sn_irq_info)
+void pcibr_target_interrupt(struct sn_irq_info *sn_irq_info)
 {
 	struct pcidev_info *pcidev_info;
 	struct pcibus_info *pcibus_info;
@@ -233,7 +233,8 @@ struct sn_pcibus_provider pcibr_provider = {
 	.dma_map_consistent = pcibr_dma_map_consistent,
 	.dma_unmap = pcibr_dma_unmap,
 	.bus_fixup = pcibr_bus_fixup,
-	.force_interrupt = pcibr_force_interrupt
+	.force_interrupt = pcibr_force_interrupt,
+	.target_interrupt = pcibr_target_interrupt
 };
 
 int
diff --git a/arch/ia64/sn/pci/tioca_provider.c b/arch/ia64/sn/pci/tioca_provider.c
index 4ea04cfa30ff..ea09c12f0258 100644
--- a/arch/ia64/sn/pci/tioca_provider.c
+++ b/arch/ia64/sn/pci/tioca_provider.c
@@ -657,7 +657,8 @@ static struct sn_pcibus_provider tioca_pci_interfaces = {
 	.dma_map_consistent = tioca_dma_map,
 	.dma_unmap = tioca_dma_unmap,
 	.bus_fixup = tioca_bus_fixup,
-	.force_interrupt = NULL
+	.force_interrupt = NULL,
+	.target_interrupt = NULL
 };
 
 /**
diff --git a/arch/ia64/sn/pci/tioce_provider.c b/arch/ia64/sn/pci/tioce_provider.c
index d9081369cf96..8e75db2b825d 100644
--- a/arch/ia64/sn/pci/tioce_provider.c
+++ b/arch/ia64/sn/pci/tioce_provider.c
@@ -668,6 +668,43 @@ tioce_force_interrupt(struct sn_irq_info *sn_irq_info)
 	ce_mmr->ce_adm_force_int = force_int_val;
 }
 
+/**
+ * tioce_target_interrupt - implement set_irq_affinity for tioce resident
+ * functions.  Note:  only applies to line interrupts, not MSI's.
+ *
+ * @sn_irq_info: SN IRQ context
+ *
+ * Given an sn_irq_info, set the associated CE device's interrupt destination
+ * register.  Since the interrupt destination registers are on a per-ce-slot
+ * basis, this will retarget line interrupts for all functions downstream of
+ * the slot.
+ */
+static void
+tioce_target_interrupt(struct sn_irq_info *sn_irq_info)
+{
+	struct pcidev_info *pcidev_info;
+	struct tioce_common *ce_common;
+	struct tioce *ce_mmr;
+	int bit;
+
+	pcidev_info = (struct pcidev_info *)sn_irq_info->irq_pciioinfo;
+	if (!pcidev_info)
+		return;
+
+	ce_common = (struct tioce_common *)pcidev_info->pdi_pcibus_info;
+	ce_mmr = (struct tioce *)ce_common->ce_pcibus.bs_base;
+
+	bit = sn_irq_info->irq_int_bit;
+
+	ce_mmr->ce_adm_int_mask |= (1UL << bit);
+	ce_mmr->ce_adm_int_dest[bit] =
+		((uint64_t)sn_irq_info->irq_irq << INTR_VECTOR_SHFT) |
+			   sn_irq_info->irq_xtalkaddr;
+	ce_mmr->ce_adm_int_mask &= ~(1UL << bit);
+
+	tioce_force_interrupt(sn_irq_info);
+}
+
 /**
  * tioce_bus_fixup - perform final PCI fixup for a TIO CE bus
  * @prom_bussoft: Common prom/kernel struct representing the bus
@@ -719,7 +756,8 @@ static struct sn_pcibus_provider tioce_pci_interfaces = {
 	.dma_map_consistent = tioce_dma_consistent,
 	.dma_unmap = tioce_dma_unmap,
 	.bus_fixup = tioce_bus_fixup,
-	.force_interrupt = tioce_force_interrupt
+	.force_interrupt = tioce_force_interrupt,
+	.target_interrupt = tioce_target_interrupt
 };
 
 /**
diff --git a/include/asm-ia64/sn/pcibus_provider_defs.h b/include/asm-ia64/sn/pcibus_provider_defs.h
index 5a92f5149a94..ad0e8e8ae53f 100644
--- a/include/asm-ia64/sn/pcibus_provider_defs.h
+++ b/include/asm-ia64/sn/pcibus_provider_defs.h
@@ -50,6 +50,7 @@ struct sn_pcibus_provider {
 	void		(*dma_unmap)(struct pci_dev *, dma_addr_t, int);
 	void *		(*bus_fixup)(struct pcibus_bussoft *, struct pci_controller *);
  	void		(*force_interrupt)(struct sn_irq_info *);
+ 	void		(*target_interrupt)(struct sn_irq_info *);
 };
 
 extern struct sn_pcibus_provider *sn_pci_provider[];
-- 
cgit v1.2.3


From d1e079b3fc90c7c114f46771e983a72ac8740882 Mon Sep 17 00:00:00 2001
From: Russ Anderson <(rja@sgi.com)>
Date: Mon, 15 Aug 2005 14:46:00 -0700
Subject: [IA64-SGI] fix bte_copy() calling smp_processor_id() while
 preemptible

bte_copy() calls calls smp_processor_id(), which will get flagged if
preemption if enabled.  raw_smp_processor_id() is used instead
because we are just using it to pick a BTE interface and are not
tied to a specific cpu.

Signed-off-by: Russ Anderson (rja@sgi.com)
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/kernel/bte.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/sn/kernel/bte.c b/arch/ia64/sn/kernel/bte.c
index b75814efadba..45854c637e9c 100644
--- a/arch/ia64/sn/kernel/bte.c
+++ b/arch/ia64/sn/kernel/bte.c
@@ -105,7 +105,7 @@ bte_result_t bte_copy(u64 src, u64 dest, u64 len, u64 mode, void *notification)
 	/*
 	 * Start with interface corresponding to cpu number
 	 */
-	bte_first = get_cpu() % btes_per_node;
+	bte_first = raw_smp_processor_id() % btes_per_node;
 
 	if (mode & BTE_USE_DEST) {
 		/* try remote then local */
-- 
cgit v1.2.3


From 4db8699bcfa8faddb5727b1cb010a4d9b8a42e8c Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Date: Fri, 29 Jul 2005 16:15:00 -0700
Subject: [IA64] Add ACPI based P-state support

Patch to support P-state transitions on ia64. This driver is based on ACPI,
and uses the ACPI processor driver interface to find out the P-state support
information for the processor. This driver plugs into generic cpufreq
infrastructure.

Once this driver is loaded successfully, ondemand/userspace governor can be
used to change the CPU frequency dynamically based on load or on request from
userspace process.

Refer :
ACPI specification -
      http://www.acpi.info
P-state related PAL calls -
      http://developer.intel.com/design/itanium/downloads/24869909.pdf

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/Kconfig                       |   6 +
 arch/ia64/kernel/Makefile               |   1 +
 arch/ia64/kernel/cpufreq/Kconfig        |  29 ++
 arch/ia64/kernel/cpufreq/Makefile       |   1 +
 arch/ia64/kernel/cpufreq/acpi-cpufreq.c | 499 ++++++++++++++++++++++++++++++++
 include/asm-ia64/acpi.h                 |   5 +
 include/asm-ia64/pal.h                  |  21 ++
 7 files changed, 562 insertions(+)
 create mode 100644 arch/ia64/kernel/cpufreq/Kconfig
 create mode 100644 arch/ia64/kernel/cpufreq/Makefile
 create mode 100644 arch/ia64/kernel/cpufreq/acpi-cpufreq.c

(limited to 'arch/ia64')

diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 80988136f26d..3deced637f07 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -383,6 +383,12 @@ source "drivers/acpi/Kconfig"
 
 endif
 
+if PM
+
+source "arch/ia64/kernel/cpufreq/Kconfig"
+
+endif
+
 endmenu
 
 if !IA64_HP_SIM
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index e1fb68ddec26..b242594be55b 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_SMP)		+= smp.o smpboot.o domain.o
 obj-$(CONFIG_NUMA)		+= numa.o
 obj-$(CONFIG_PERFMON)		+= perfmon_default_smpl.o
 obj-$(CONFIG_IA64_CYCLONE)	+= cyclone.o
+obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
 obj-$(CONFIG_IA64_MCA_RECOVERY)	+= mca_recovery.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o jprobes.o
 obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR)	+= uncached.o
diff --git a/arch/ia64/kernel/cpufreq/Kconfig b/arch/ia64/kernel/cpufreq/Kconfig
new file mode 100644
index 000000000000..2d9d5279b981
--- /dev/null
+++ b/arch/ia64/kernel/cpufreq/Kconfig
@@ -0,0 +1,29 @@
+
+#
+# CPU Frequency scaling
+#
+
+menu "CPU Frequency scaling"
+
+source "drivers/cpufreq/Kconfig"
+
+if CPU_FREQ
+
+comment "CPUFreq processor drivers"
+
+config IA64_ACPI_CPUFREQ
+	tristate "ACPI Processor P-States driver"
+	select CPU_FREQ_TABLE
+	depends on ACPI_PROCESSOR
+	help
+	This driver adds a CPUFreq driver which utilizes the ACPI
+	Processor Performance States.
+
+	For details, take a look at <file:Documentation/cpu-freq/>.
+
+	If in doubt, say N.
+
+endif   # CPU_FREQ
+
+endmenu
+
diff --git a/arch/ia64/kernel/cpufreq/Makefile b/arch/ia64/kernel/cpufreq/Makefile
new file mode 100644
index 000000000000..f748d34c02f0
--- /dev/null
+++ b/arch/ia64/kernel/cpufreq/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_IA64_ACPI_CPUFREQ)		+= acpi-cpufreq.o
diff --git a/arch/ia64/kernel/cpufreq/acpi-cpufreq.c b/arch/ia64/kernel/cpufreq/acpi-cpufreq.c
new file mode 100644
index 000000000000..da4d5cf80a48
--- /dev/null
+++ b/arch/ia64/kernel/cpufreq/acpi-cpufreq.c
@@ -0,0 +1,499 @@
+/*
+ * arch/ia64/kernel/cpufreq/acpi-cpufreq.c
+ * This file provides the ACPI based P-state support. This
+ * module works with generic cpufreq infrastructure. Most of
+ * the code is based on i386 version
+ * (arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c)
+ *
+ * Copyright (C) 2005 Intel Corp
+ *      Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/cpufreq.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <asm/io.h>
+#include <asm/uaccess.h>
+#include <asm/pal.h>
+
+#include <linux/acpi.h>
+#include <acpi/processor.h>
+
+#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "acpi-cpufreq", msg)
+
+MODULE_AUTHOR("Venkatesh Pallipadi");
+MODULE_DESCRIPTION("ACPI Processor P-States Driver");
+MODULE_LICENSE("GPL");
+
+
+struct cpufreq_acpi_io {
+	struct acpi_processor_performance	acpi_data;
+	struct cpufreq_frequency_table		*freq_table;
+	unsigned int				resume;
+};
+
+static struct cpufreq_acpi_io	*acpi_io_data[NR_CPUS];
+
+static struct cpufreq_driver acpi_cpufreq_driver;
+
+
+static int
+processor_set_pstate (
+	u32	value)
+{
+	s64 retval;
+
+	dprintk("processor_set_pstate\n");
+
+	retval = ia64_pal_set_pstate((u64)value);
+
+	if (retval) {
+		dprintk("Failed to set freq to 0x%x, with error 0x%x\n",
+		        value, retval);
+		return -ENODEV;
+	}
+	return (int)retval;
+}
+
+
+static int
+processor_get_pstate (
+	u32	*value)
+{
+	u64	pstate_index = 0;
+	s64 	retval;
+
+	dprintk("processor_get_pstate\n");
+
+	retval = ia64_pal_get_pstate(&pstate_index);
+	*value = (u32) pstate_index;
+
+	if (retval)
+		dprintk("Failed to get current freq with "
+		        "error 0x%x, idx 0x%x\n", retval, *value);
+
+	return (int)retval;
+}
+
+
+/* To be used only after data->acpi_data is initialized */
+static unsigned
+extract_clock (
+	struct cpufreq_acpi_io *data,
+	unsigned value,
+	unsigned int cpu)
+{
+	unsigned long i;
+
+	dprintk("extract_clock\n");
+
+	for (i = 0; i < data->acpi_data.state_count; i++) {
+		if (value >= data->acpi_data.states[i].control)
+			return data->acpi_data.states[i].core_frequency;
+	}
+	return data->acpi_data.states[i-1].core_frequency;
+}
+
+
+static unsigned int
+processor_get_freq (
+	struct cpufreq_acpi_io	*data,
+	unsigned int		cpu)
+{
+	int			ret = 0;
+	u32			value = 0;
+	cpumask_t		saved_mask;
+	unsigned long 		clock_freq;
+
+	dprintk("processor_get_freq\n");
+
+	saved_mask = current->cpus_allowed;
+	set_cpus_allowed(current, cpumask_of_cpu(cpu));
+	if (smp_processor_id() != cpu) {
+		ret = -EAGAIN;
+		goto migrate_end;
+	}
+
+	/*
+	 * processor_get_pstate gets the average frequency since the
+	 * last get. So, do two PAL_get_freq()...
+	 */
+	ret = processor_get_pstate(&value);
+	ret = processor_get_pstate(&value);
+
+	if (ret) {
+		set_cpus_allowed(current, saved_mask);
+		printk(KERN_WARNING "get performance failed with error %d\n",
+		       ret);
+		ret = -EAGAIN;
+		goto migrate_end;
+	}
+	clock_freq = extract_clock(data, value, cpu);
+	ret = (clock_freq*1000);
+
+migrate_end:
+	set_cpus_allowed(current, saved_mask);
+	return ret;
+}
+
+
+static int
+processor_set_freq (
+	struct cpufreq_acpi_io	*data,
+	unsigned int		cpu,
+	int			state)
+{
+	int			ret = 0;
+	u32			value = 0;
+	struct cpufreq_freqs    cpufreq_freqs;
+	cpumask_t		saved_mask;
+	int			retval;
+
+	dprintk("processor_set_freq\n");
+
+	saved_mask = current->cpus_allowed;
+	set_cpus_allowed(current, cpumask_of_cpu(cpu));
+	if (smp_processor_id() != cpu) {
+		retval = -EAGAIN;
+		goto migrate_end;
+	}
+
+	if (state == data->acpi_data.state) {
+		if (unlikely(data->resume)) {
+			dprintk("Called after resume, resetting to P%d\n", state);
+			data->resume = 0;
+		} else {
+			dprintk("Already at target state (P%d)\n", state);
+			retval = 0;
+			goto migrate_end;
+		}
+	}
+
+	dprintk("Transitioning from P%d to P%d\n",
+		data->acpi_data.state, state);
+
+	/* cpufreq frequency struct */
+	cpufreq_freqs.cpu = cpu;
+	cpufreq_freqs.old = data->freq_table[data->acpi_data.state].frequency;
+	cpufreq_freqs.new = data->freq_table[state].frequency;
+
+	/* notify cpufreq */
+	cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_PRECHANGE);
+
+	/*
+	 * First we write the target state's 'control' value to the
+	 * control_register.
+	 */
+
+	value = (u32) data->acpi_data.states[state].control;
+
+	dprintk("Transitioning to state: 0x%08x\n", value);
+
+	ret = processor_set_pstate(value);
+	if (ret) {
+		unsigned int tmp = cpufreq_freqs.new;
+		cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_POSTCHANGE);
+		cpufreq_freqs.new = cpufreq_freqs.old;
+		cpufreq_freqs.old = tmp;
+		cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_PRECHANGE);
+		cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_POSTCHANGE);
+		printk(KERN_WARNING "Transition failed with error %d\n", ret);
+		retval = -ENODEV;
+		goto migrate_end;
+	}
+
+	cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_POSTCHANGE);
+
+	data->acpi_data.state = state;
+
+	retval = 0;
+
+migrate_end:
+	set_cpus_allowed(current, saved_mask);
+	return (retval);
+}
+
+
+static unsigned int
+acpi_cpufreq_get (
+	unsigned int		cpu)
+{
+	struct cpufreq_acpi_io *data = acpi_io_data[cpu];
+
+	dprintk("acpi_cpufreq_get\n");
+
+	return processor_get_freq(data, cpu);
+}
+
+
+static int
+acpi_cpufreq_target (
+	struct cpufreq_policy   *policy,
+	unsigned int target_freq,
+	unsigned int relation)
+{
+	struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu];
+	unsigned int next_state = 0;
+	unsigned int result = 0;
+
+	dprintk("acpi_cpufreq_setpolicy\n");
+
+	result = cpufreq_frequency_table_target(policy,
+			data->freq_table, target_freq, relation, &next_state);
+	if (result)
+		return (result);
+
+	result = processor_set_freq(data, policy->cpu, next_state);
+
+	return (result);
+}
+
+
+static int
+acpi_cpufreq_verify (
+	struct cpufreq_policy   *policy)
+{
+	unsigned int result = 0;
+	struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu];
+
+	dprintk("acpi_cpufreq_verify\n");
+
+	result = cpufreq_frequency_table_verify(policy,
+			data->freq_table);
+
+	return (result);
+}
+
+
+/*
+ * processor_init_pdc - let BIOS know about the SMP capabilities
+ * of this driver
+ * @perf: processor-specific acpi_io_data struct
+ * @cpu: CPU being initialized
+ *
+ * To avoid issues with legacy OSes, some BIOSes require to be informed of
+ * the SMP capabilities of OS P-state driver. Here we set the bits in _PDC
+ * accordingly. Actual call to _PDC is done in driver/acpi/processor.c
+ */
+static void
+processor_init_pdc (
+		struct acpi_processor_performance *perf,
+		unsigned int cpu,
+		struct acpi_object_list *obj_list
+		)
+{
+	union acpi_object *obj;
+	u32 *buf;
+
+	dprintk("processor_init_pdc\n");
+
+	perf->pdc = NULL;
+	/* Initialize pdc. It will be used later. */
+	if (!obj_list)
+		return;
+
+	if (!(obj_list->count && obj_list->pointer))
+		return;
+
+	obj = obj_list->pointer;
+	if ((obj->buffer.length == 12) && obj->buffer.pointer) {
+		buf = (u32 *)obj->buffer.pointer;
+       		buf[0] = ACPI_PDC_REVISION_ID;
+       		buf[1] = 1;
+       		buf[2] = ACPI_PDC_EST_CAPABILITY_SMP;
+		perf->pdc = obj_list;
+	}
+	return;
+}
+
+
+static int
+acpi_cpufreq_cpu_init (
+	struct cpufreq_policy   *policy)
+{
+	unsigned int		i;
+	unsigned int		cpu = policy->cpu;
+	struct cpufreq_acpi_io	*data;
+	unsigned int		result = 0;
+
+	union acpi_object		arg0 = {ACPI_TYPE_BUFFER};
+	u32				arg0_buf[3];
+	struct acpi_object_list 	arg_list = {1, &arg0};
+
+	dprintk("acpi_cpufreq_cpu_init\n");
+	/* setup arg_list for _PDC settings */
+        arg0.buffer.length = 12;
+        arg0.buffer.pointer = (u8 *) arg0_buf;
+
+	data = kmalloc(sizeof(struct cpufreq_acpi_io), GFP_KERNEL);
+	if (!data)
+		return (-ENOMEM);
+
+	memset(data, 0, sizeof(struct cpufreq_acpi_io));
+
+	acpi_io_data[cpu] = data;
+
+	processor_init_pdc(&data->acpi_data, cpu, &arg_list);
+	result = acpi_processor_register_performance(&data->acpi_data, cpu);
+	data->acpi_data.pdc = NULL;
+
+	if (result)
+		goto err_free;
+
+	/* capability check */
+	if (data->acpi_data.state_count <= 1) {
+		dprintk("No P-States\n");
+		result = -ENODEV;
+		goto err_unreg;
+	}
+
+	if ((data->acpi_data.control_register.space_id !=
+					ACPI_ADR_SPACE_FIXED_HARDWARE) ||
+	    (data->acpi_data.status_register.space_id !=
+					ACPI_ADR_SPACE_FIXED_HARDWARE)) {
+		dprintk("Unsupported address space [%d, %d]\n",
+			(u32) (data->acpi_data.control_register.space_id),
+			(u32) (data->acpi_data.status_register.space_id));
+		result = -ENODEV;
+		goto err_unreg;
+	}
+
+	/* alloc freq_table */
+	data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) *
+	                           (data->acpi_data.state_count + 1),
+	                           GFP_KERNEL);
+	if (!data->freq_table) {
+		result = -ENOMEM;
+		goto err_unreg;
+	}
+
+	/* detect transition latency */
+	policy->cpuinfo.transition_latency = 0;
+	for (i=0; i<data->acpi_data.state_count; i++) {
+		if ((data->acpi_data.states[i].transition_latency * 1000) >
+		    policy->cpuinfo.transition_latency) {
+			policy->cpuinfo.transition_latency =
+			    data->acpi_data.states[i].transition_latency * 1000;
+		}
+	}
+	policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
+
+	policy->cur = processor_get_freq(data, policy->cpu);
+
+	/* table init */
+	for (i = 0; i <= data->acpi_data.state_count; i++)
+	{
+		data->freq_table[i].index = i;
+		if (i < data->acpi_data.state_count) {
+			data->freq_table[i].frequency =
+			      data->acpi_data.states[i].core_frequency * 1000;
+		} else {
+			data->freq_table[i].frequency = CPUFREQ_TABLE_END;
+		}
+	}
+
+	result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
+	if (result) {
+		goto err_freqfree;
+	}
+
+	/* notify BIOS that we exist */
+	acpi_processor_notify_smm(THIS_MODULE);
+
+	printk(KERN_INFO "acpi-cpufreq: CPU%u - ACPI performance management "
+	       "activated.\n", cpu);
+
+	for (i = 0; i < data->acpi_data.state_count; i++)
+		dprintk("     %cP%d: %d MHz, %d mW, %d uS, %d uS, 0x%x 0x%x\n",
+			(i == data->acpi_data.state?'*':' '), i,
+			(u32) data->acpi_data.states[i].core_frequency,
+			(u32) data->acpi_data.states[i].power,
+			(u32) data->acpi_data.states[i].transition_latency,
+			(u32) data->acpi_data.states[i].bus_master_latency,
+			(u32) data->acpi_data.states[i].status,
+			(u32) data->acpi_data.states[i].control);
+
+	cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu);
+
+	/* the first call to ->target() should result in us actually
+	 * writing something to the appropriate registers. */
+	data->resume = 1;
+
+	return (result);
+
+ err_freqfree:
+	kfree(data->freq_table);
+ err_unreg:
+	acpi_processor_unregister_performance(&data->acpi_data, cpu);
+ err_free:
+	kfree(data);
+	acpi_io_data[cpu] = NULL;
+
+	return (result);
+}
+
+
+static int
+acpi_cpufreq_cpu_exit (
+	struct cpufreq_policy   *policy)
+{
+	struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu];
+
+	dprintk("acpi_cpufreq_cpu_exit\n");
+
+	if (data) {
+		cpufreq_frequency_table_put_attr(policy->cpu);
+		acpi_io_data[policy->cpu] = NULL;
+		acpi_processor_unregister_performance(&data->acpi_data,
+		                                      policy->cpu);
+		kfree(data);
+	}
+
+	return (0);
+}
+
+
+static struct freq_attr* acpi_cpufreq_attr[] = {
+	&cpufreq_freq_attr_scaling_available_freqs,
+	NULL,
+};
+
+
+static struct cpufreq_driver acpi_cpufreq_driver = {
+	.verify 	= acpi_cpufreq_verify,
+	.target 	= acpi_cpufreq_target,
+	.get 		= acpi_cpufreq_get,
+	.init		= acpi_cpufreq_cpu_init,
+	.exit		= acpi_cpufreq_cpu_exit,
+	.name		= "acpi-cpufreq",
+	.owner		= THIS_MODULE,
+	.attr           = acpi_cpufreq_attr,
+};
+
+
+static int __init
+acpi_cpufreq_init (void)
+{
+	dprintk("acpi_cpufreq_init\n");
+
+ 	return cpufreq_register_driver(&acpi_cpufreq_driver);
+}
+
+
+static void __exit
+acpi_cpufreq_exit (void)
+{
+	dprintk("acpi_cpufreq_exit\n");
+
+	cpufreq_unregister_driver(&acpi_cpufreq_driver);
+	return;
+}
+
+
+late_initcall(acpi_cpufreq_init);
+module_exit(acpi_cpufreq_exit);
+
diff --git a/include/asm-ia64/acpi.h b/include/asm-ia64/acpi.h
index 4c06d455139c..3a544ffc5008 100644
--- a/include/asm-ia64/acpi.h
+++ b/include/asm-ia64/acpi.h
@@ -116,6 +116,11 @@ extern int __initdata nid_to_pxm_map[MAX_NUMNODES];
 
 extern u16 ia64_acpiid_to_sapicid[];
 
+/*
+ * Refer Intel ACPI _PDC support document for bit definitions
+ */
+#define ACPI_PDC_EST_CAPABILITY_SMP     0x8
+
 #endif /*__KERNEL__*/
 
 #endif /*_ASM_ACPI_H*/
diff --git a/include/asm-ia64/pal.h b/include/asm-ia64/pal.h
index 2303a10ee595..e828377ad295 100644
--- a/include/asm-ia64/pal.h
+++ b/include/asm-ia64/pal.h
@@ -75,6 +75,8 @@
 #define PAL_CACHE_READ		259	/* read tag & data of cacheline for diagnostic testing */
 #define PAL_CACHE_WRITE		260	/* write tag & data of cacheline for diagnostic testing */
 #define PAL_VM_TR_READ		261	/* read contents of translation register */
+#define PAL_GET_PSTATE		262	/* get the current P-state */
+#define PAL_SET_PSTATE		263	/* set the P-state */
 
 #ifndef __ASSEMBLY__
 
@@ -1111,6 +1113,25 @@ ia64_pal_halt_info (pal_power_mgmt_info_u_t *power_buf)
 	return iprv.status;
 }
 
+/* Get the current P-state information */
+static inline s64
+ia64_pal_get_pstate (u64 *pstate_index)
+{
+	struct ia64_pal_retval iprv;
+	PAL_CALL_STK(iprv, PAL_GET_PSTATE, 0, 0, 0);
+	*pstate_index = iprv.v0;
+	return iprv.status;
+}
+
+/* Set the P-state */
+static inline s64
+ia64_pal_set_pstate (u64 pstate_index)
+{
+	struct ia64_pal_retval iprv;
+	PAL_CALL_STK(iprv, PAL_SET_PSTATE, pstate_index, 0, 0);
+	return iprv.status;
+}
+
 /* Cause the processor to enter LIGHT HALT state, where prefetching and execution are
  * suspended, but cache and TLB coherency is maintained.
  */
-- 
cgit v1.2.3


From f8220c7f15c54aefb4976ad7c6e2aa98b27a8ce8 Mon Sep 17 00:00:00 2001
From: Kenneth Chen <kenneth.w.chen@intel.com>
Date: Fri, 26 Aug 2005 14:57:00 -0700
Subject: [IA64] Delete erroneous copy_page.o in global lib-y list

copy_page.o appeared twice in arch/ia64/lib/Makefile. The
one in global lib-y is wrong where it should be just in
lib-$(CONFIG_ITANIUM).

Both copy_page.o and copy_page_mck.o are build for Itanium2
processor and the link order will pick up the low performing
copy_page function (originally written for itanium processor).
In this case, we really want the copy_page_mck.o for optimized
version.

Signed-off-by: Kenneth Chen <kenneth.w.chen@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/lib/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile
index 1902c3c2ef92..799407e7726f 100644
--- a/arch/ia64/lib/Makefile
+++ b/arch/ia64/lib/Makefile
@@ -6,7 +6,7 @@ obj-y := io.o
 
 lib-y := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o			\
 	__divdi3.o __udivdi3.o __moddi3.o __umoddi3.o			\
-	bitop.o checksum.o clear_page.o csum_partial_copy.o copy_page.o	\
+	bitop.o checksum.o clear_page.o csum_partial_copy.o		\
 	clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o	\
 	flush.o ip_fast_csum.o do_csum.o				\
 	memset.o strlen.o swiotlb.o
-- 
cgit v1.2.3


From a994018a5fc987702dfb4f5d31172842ea6186dc Mon Sep 17 00:00:00 2001
From: Martin Hicks <mort@sgi.com>
Date: Wed, 31 Aug 2005 11:21:00 -0700
Subject: [IA64] uncached allocator: use generic (not sn2 specific) functions

Change sn2-specific calls into generic functions.  Without this change
the uncached allocator will not work on non-sn2 platforms.

Signed-off-by: Greg Edwards <edwardsg@sgi.com>
Signed-off-by: Martin Hicks <mort@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/kernel/uncached.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c
index 490dfc9ab47f..4e9d06c48a8b 100644
--- a/arch/ia64/kernel/uncached.c
+++ b/arch/ia64/kernel/uncached.c
@@ -184,7 +184,7 @@ uncached_free_page(unsigned long maddr)
 {
 	int node;
 
-	node = nasid_to_cnodeid(NASID_GET(maddr));
+	node = paddr_to_nid(maddr - __IA64_UNCACHED_OFFSET);
 
 	dprintk(KERN_DEBUG "uncached_free_page(%lx) on node %i\n", maddr, node);
 
@@ -217,7 +217,7 @@ uncached_build_memmap(unsigned long start, unsigned long end, void *arg)
 
 	memset((char *)vstart, 0, length);
 
-	node = nasid_to_cnodeid(NASID_GET(start));
+	node = paddr_to_nid(start);
 
 	for (; vstart < vend ; vstart += PAGE_SIZE) {
 		dprintk(KERN_INFO "sticking %lx into the pool!\n", vstart);
-- 
cgit v1.2.3


From 54d5d42404e7705cf3804593189e963350d470e5 Mon Sep 17 00:00:00 2001
From: Ashok Raj <ashok.raj@intel.com>
Date: Tue, 6 Sep 2005 15:16:15 -0700
Subject: [PATCH] x86/x86_64: deferred handling of writes to
 /proc/irqxx/smp_affinity

When handling writes to /proc/irq, current code is re-programming rte
entries directly. This is not recommended and could potentially cause
chipset's to lockup, or cause missing interrupts.

CONFIG_IRQ_BALANCE does this correctly, where it re-programs only when the
interrupt is pending. The same needs to be done for /proc/irq handling as well.
Otherwise user space irq balancers are really not doing the right thing.

- Changed pending_irq_balance_cpumask to pending_irq_migrate_cpumask for
  lack of a generic name.
- added move_irq out of IRQ_BALANCE, and added this same to X86_64
- Added new proc handler for write, so we can do deferred write at irq
  handling time.
- Display of /proc/irq/XX/smp_affinity used to display CPU_MASKALL, instead
  it now shows only active cpu masks, or exactly what was set.
- Provided a common move_irq implementation, instead of duplicating
  when using generic irq framework.

Tested on i386/x86_64 and ia64 with CONFIG_PCI_MSI turned on and off.
Tested UP builds as well.

MSI testing: tbd: I have cards, need to look for a x-over cable, although I
did test an earlier version of this patch.  Will test in a couple days.

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Acked-by: Zwane Mwaikambo <zwane@holomorphy.com>
Grudgingly-acked-by: Andi Kleen <ak@muc.de>
Signed-off-by: Coywolf Qi Hunt <coywolf@lovecn.org>
Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig            |   5 ++
 arch/i386/kernel/io_apic.c   |  55 ++++++++++---------
 arch/ia64/Kconfig            |   5 ++
 arch/ia64/kernel/irq.c       |  39 +-------------
 arch/x86_64/Kconfig          |   5 ++
 arch/x86_64/kernel/io_apic.c | 102 ++++++++++++++++++++++-------------
 drivers/pci/msi.c            |  17 ++----
 drivers/pci/msi.h            |   5 --
 include/asm-ia64/hw_irq.h    |   7 ---
 include/asm-ia64/irq.h       |   6 ---
 include/linux/irq.h          | 123 +++++++++++++++++++++++++++++++++++++++++++
 kernel/irq/manage.c          |   4 ++
 kernel/irq/proc.c            |  14 ++++-
 13 files changed, 253 insertions(+), 134 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 3b3b017e1c15..4b7de3e1e57b 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -1318,6 +1318,11 @@ config GENERIC_IRQ_PROBE
 	bool
 	default y
 
+config GENERIC_PENDING_IRQ
+	bool
+	depends on GENERIC_HARDIRQS && SMP
+	default y
+
 config X86_SMP
 	bool
 	depends on SMP && !X86_VOYAGER
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 6578f40bd501..4a5940431579 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -33,6 +33,7 @@
 #include <linux/acpi.h>
 #include <linux/module.h>
 #include <linux/sysdev.h>
+
 #include <asm/io.h>
 #include <asm/smp.h>
 #include <asm/desc.h>
@@ -222,13 +223,21 @@ static void clear_IO_APIC (void)
 			clear_IO_APIC_pin(apic, pin);
 }
 
+#ifdef CONFIG_SMP
 static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 {
 	unsigned long flags;
 	int pin;
 	struct irq_pin_list *entry = irq_2_pin + irq;
 	unsigned int apicid_value;
+	cpumask_t tmp;
 	
+	cpus_and(tmp, cpumask, cpu_online_map);
+	if (cpus_empty(tmp))
+		tmp = TARGET_CPUS;
+
+	cpus_and(cpumask, tmp, CPU_MASK_ALL);
+
 	apicid_value = cpu_mask_to_apicid(cpumask);
 	/* Prepare to do the io_apic_write */
 	apicid_value = apicid_value << 24;
@@ -242,6 +251,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 			break;
 		entry = irq_2_pin + entry->next;
 	}
+	set_irq_info(irq, cpumask);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
@@ -259,7 +269,6 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 #  define Dprintk(x...) 
 # endif
 
-cpumask_t __cacheline_aligned pending_irq_balance_cpumask[NR_IRQS];
 
 #define IRQBALANCE_CHECK_ARCH -999
 static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH;
@@ -328,12 +337,7 @@ static inline void balance_irq(int cpu, int irq)
 	cpus_and(allowed_mask, cpu_online_map, irq_affinity[irq]);
 	new_cpu = move(cpu, allowed_mask, now, 1);
 	if (cpu != new_cpu) {
-		irq_desc_t *desc = irq_desc + irq;
-		unsigned long flags;
-
-		spin_lock_irqsave(&desc->lock, flags);
-		pending_irq_balance_cpumask[irq] = cpumask_of_cpu(new_cpu);
-		spin_unlock_irqrestore(&desc->lock, flags);
+		set_pending_irq(irq, cpumask_of_cpu(new_cpu));
 	}
 }
 
@@ -528,16 +532,12 @@ tryanotherirq:
 	cpus_and(tmp, target_cpu_mask, allowed_mask);
 
 	if (!cpus_empty(tmp)) {
-		irq_desc_t *desc = irq_desc + selected_irq;
-		unsigned long flags;
 
 		Dprintk("irq = %d moved to cpu = %d\n",
 				selected_irq, min_loaded);
 		/* mark for change destination */
-		spin_lock_irqsave(&desc->lock, flags);
-		pending_irq_balance_cpumask[selected_irq] =
-					cpumask_of_cpu(min_loaded);
-		spin_unlock_irqrestore(&desc->lock, flags);
+		set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
+
 		/* Since we made a change, come back sooner to 
 		 * check for more variation.
 		 */
@@ -568,7 +568,8 @@ static int balanced_irq(void *unused)
 	
 	/* push everything to CPU 0 to give us a starting point.  */
 	for (i = 0 ; i < NR_IRQS ; i++) {
-		pending_irq_balance_cpumask[i] = cpumask_of_cpu(0);
+		pending_irq_cpumask[i] = cpumask_of_cpu(0);
+		set_pending_irq(i, cpumask_of_cpu(0));
 	}
 
 	for ( ; ; ) {
@@ -647,20 +648,9 @@ int __init irqbalance_disable(char *str)
 
 __setup("noirqbalance", irqbalance_disable);
 
-static inline void move_irq(int irq)
-{
-	/* note - we hold the desc->lock */
-	if (unlikely(!cpus_empty(pending_irq_balance_cpumask[irq]))) {
-		set_ioapic_affinity_irq(irq, pending_irq_balance_cpumask[irq]);
-		cpus_clear(pending_irq_balance_cpumask[irq]);
-	}
-}
-
 late_initcall(balanced_irq_init);
-
-#else /* !CONFIG_IRQBALANCE */
-static inline void move_irq(int irq) { }
 #endif /* CONFIG_IRQBALANCE */
+#endif /* CONFIG_SMP */
 
 #ifndef CONFIG_SMP
 void fastcall send_IPI_self(int vector)
@@ -820,6 +810,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
  * we need to reprogram the ioredtbls to cater for the cpus which have come online
  * so mask in all cases should simply be TARGET_CPUS
  */
+#ifdef CONFIG_SMP
 void __init setup_ioapic_dest(void)
 {
 	int pin, ioapic, irq, irq_entry;
@@ -838,6 +829,7 @@ void __init setup_ioapic_dest(void)
 
 	}
 }
+#endif
 
 /*
  * EISA Edge/Level control register, ELCR
@@ -1249,6 +1241,7 @@ static void __init setup_IO_APIC_irqs(void)
 		spin_lock_irqsave(&ioapic_lock, flags);
 		io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
 		io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
+		set_native_irq_info(irq, TARGET_CPUS);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 	}
@@ -1944,6 +1937,7 @@ static void ack_edge_ioapic_vector(unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
 
+	move_irq(vector);
 	ack_edge_ioapic_irq(irq);
 }
 
@@ -1958,6 +1952,7 @@ static void end_level_ioapic_vector (unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
 
+	move_irq(vector);
 	end_level_ioapic_irq(irq);
 }
 
@@ -1975,14 +1970,17 @@ static void unmask_IO_APIC_vector (unsigned int vector)
 	unmask_IO_APIC_irq(irq);
 }
 
+#ifdef CONFIG_SMP
 static void set_ioapic_affinity_vector (unsigned int vector,
 					cpumask_t cpu_mask)
 {
 	int irq = vector_to_irq(vector);
 
+	set_native_irq_info(vector, cpu_mask);
 	set_ioapic_affinity_irq(irq, cpu_mask);
 }
 #endif
+#endif
 
 /*
  * Level and edge triggered IO-APIC interrupts need different handling,
@@ -2000,7 +1998,9 @@ static struct hw_interrupt_type ioapic_edge_type = {
 	.disable 	= disable_edge_ioapic,
 	.ack 		= ack_edge_ioapic,
 	.end 		= end_edge_ioapic,
+#ifdef CONFIG_SMP
 	.set_affinity 	= set_ioapic_affinity,
+#endif
 };
 
 static struct hw_interrupt_type ioapic_level_type = {
@@ -2011,7 +2011,9 @@ static struct hw_interrupt_type ioapic_level_type = {
 	.disable 	= disable_level_ioapic,
 	.ack 		= mask_and_ack_level_ioapic,
 	.end 		= end_level_ioapic,
+#ifdef CONFIG_SMP
 	.set_affinity 	= set_ioapic_affinity,
+#endif
 };
 
 static inline void init_IO_APIC_traps(void)
@@ -2569,6 +2571,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
 	spin_lock_irqsave(&ioapic_lock, flags);
 	io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
 	io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
+	set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return 0;
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 3deced637f07..17b5dbf8c311 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -434,6 +434,11 @@ config GENERIC_IRQ_PROBE
 	bool
 	default y
 
+config GENERIC_PENDING_IRQ
+	bool
+	depends on GENERIC_HARDIRQS && SMP
+	default y
+
 source "arch/ia64/hp/sim/Kconfig"
 
 source "arch/ia64/oprofile/Kconfig"
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c
index 28f2aadc38d0..205d98028261 100644
--- a/arch/ia64/kernel/irq.c
+++ b/arch/ia64/kernel/irq.c
@@ -91,23 +91,8 @@ skip:
 }
 
 #ifdef CONFIG_SMP
-/*
- * This is updated when the user sets irq affinity via /proc
- */
-static cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
-static unsigned long pending_irq_redir[BITS_TO_LONGS(NR_IRQS)];
-
 static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 };
 
-/*
- * Arch specific routine for deferred write to iosapic rte to reprogram
- * intr destination.
- */
-void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
-{
-	pending_irq_cpumask[irq] = mask_val;
-}
-
 void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
 {
 	cpumask_t mask = CPU_MASK_NONE;
@@ -116,32 +101,10 @@ void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
 
 	if (irq < NR_IRQS) {
 		irq_affinity[irq] = mask;
+		set_irq_info(irq, mask);
 		irq_redir[irq] = (char) (redir & 0xff);
 	}
 }
-
-
-void move_irq(int irq)
-{
-	/* note - we hold desc->lock */
-	cpumask_t tmp;
-	irq_desc_t *desc = irq_descp(irq);
-	int redir = test_bit(irq, pending_irq_redir);
-
-	if (unlikely(!desc->handler->set_affinity))
-		return;
-
-	if (!cpus_empty(pending_irq_cpumask[irq])) {
-		cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map);
-		if (unlikely(!cpus_empty(tmp))) {
-			desc->handler->set_affinity(irq | (redir ? IA64_IRQ_REDIRECTED : 0),
-						    pending_irq_cpumask[irq]);
-		}
-		cpus_clear(pending_irq_cpumask[irq]);
-	}
-}
-
-
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 75e52c57f19c..251ce7cf1a38 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -441,6 +441,11 @@ config ISA_DMA_API
 	bool
 	default y
 
+config GENERIC_PENDING_IRQ
+	bool
+	depends on GENERIC_HARDIRQS && SMP
+	default y
+
 menu "Power management options"
 
 source kernel/power/Kconfig
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index d206d7e49cf5..76bcc4e6979d 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -78,6 +78,54 @@ int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1};
 #define vector_to_irq(vector)	(vector)
 #endif
 
+#define __DO_ACTION(R, ACTION, FINAL)					\
+									\
+{									\
+	int pin;							\
+	struct irq_pin_list *entry = irq_2_pin + irq;			\
+									\
+	for (;;) {							\
+		unsigned int reg;					\
+		pin = entry->pin;					\
+		if (pin == -1)						\
+			break;						\
+		reg = io_apic_read(entry->apic, 0x10 + R + pin*2);	\
+		reg ACTION;						\
+		io_apic_modify(entry->apic, reg);			\
+		if (!entry->next)					\
+			break;						\
+		entry = irq_2_pin + entry->next;			\
+	}								\
+	FINAL;								\
+}
+
+#ifdef CONFIG_SMP
+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+{
+	unsigned long flags;
+	unsigned int dest;
+	cpumask_t tmp;
+
+	cpus_and(tmp, mask, cpu_online_map);
+	if (cpus_empty(tmp))
+		tmp = TARGET_CPUS;
+
+	cpus_and(mask, tmp, CPU_MASK_ALL);
+
+	dest = cpu_mask_to_apicid(mask);
+
+	/*
+	 * Only the high 8 bits are valid.
+	 */
+	dest = SET_APIC_LOGICAL_ID(dest);
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	__DO_ACTION(1, = dest, )
+	set_irq_info(irq, mask);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+#endif
+
 /*
  * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
  * shared ISA-space IRQs, so we have to support them. We are super
@@ -101,26 +149,6 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
 	entry->pin = pin;
 }
 
-#define __DO_ACTION(R, ACTION, FINAL)					\
-									\
-{									\
-	int pin;							\
-	struct irq_pin_list *entry = irq_2_pin + irq;			\
-									\
-	for (;;) {							\
-		unsigned int reg;					\
-		pin = entry->pin;					\
-		if (pin == -1)						\
-			break;						\
-		reg = io_apic_read(entry->apic, 0x10 + R + pin*2);	\
-		reg ACTION;						\
-		io_apic_modify(entry->apic, reg);			\
-		if (!entry->next)					\
-			break;						\
-		entry = irq_2_pin + entry->next;			\
-	}								\
-	FINAL;								\
-}
 
 #define DO_ACTION(name,R,ACTION, FINAL)					\
 									\
@@ -767,6 +795,7 @@ static void __init setup_IO_APIC_irqs(void)
 		spin_lock_irqsave(&ioapic_lock, flags);
 		io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
 		io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
+		set_native_irq_info(irq, TARGET_CPUS);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 	}
@@ -1314,6 +1343,7 @@ static unsigned int startup_edge_ioapic_irq(unsigned int irq)
  */
 static void ack_edge_ioapic_irq(unsigned int irq)
 {
+	move_irq(irq);
 	if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
 					== (IRQ_PENDING | IRQ_DISABLED))
 		mask_IO_APIC_irq(irq);
@@ -1343,26 +1373,10 @@ static unsigned int startup_level_ioapic_irq (unsigned int irq)
 
 static void end_level_ioapic_irq (unsigned int irq)
 {
+	move_irq(irq);
 	ack_APIC_irq();
 }
 
-static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
-{
-	unsigned long flags;
-	unsigned int dest;
-
-	dest = cpu_mask_to_apicid(mask);
-
-	/*
-	 * Only the high 8 bits are valid.
-	 */
-	dest = SET_APIC_LOGICAL_ID(dest);
-
-	spin_lock_irqsave(&ioapic_lock, flags);
-	__DO_ACTION(1, = dest, )
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
 #ifdef CONFIG_PCI_MSI
 static unsigned int startup_edge_ioapic_vector(unsigned int vector)
 {
@@ -1375,6 +1389,7 @@ static void ack_edge_ioapic_vector(unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
 
+	move_native_irq(vector);
 	ack_edge_ioapic_irq(irq);
 }
 
@@ -1389,6 +1404,7 @@ static void end_level_ioapic_vector (unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
 
+	move_native_irq(vector);
 	end_level_ioapic_irq(irq);
 }
 
@@ -1406,14 +1422,17 @@ static void unmask_IO_APIC_vector (unsigned int vector)
 	unmask_IO_APIC_irq(irq);
 }
 
+#ifdef CONFIG_SMP
 static void set_ioapic_affinity_vector (unsigned int vector,
 					cpumask_t cpu_mask)
 {
 	int irq = vector_to_irq(vector);
 
+	set_native_irq_info(vector, cpu_mask);
 	set_ioapic_affinity_irq(irq, cpu_mask);
 }
-#endif
+#endif // CONFIG_SMP
+#endif // CONFIG_PCI_MSI
 
 /*
  * Level and edge triggered IO-APIC interrupts need different handling,
@@ -1432,7 +1451,9 @@ static struct hw_interrupt_type ioapic_edge_type = {
 	.disable 	= disable_edge_ioapic,
 	.ack 		= ack_edge_ioapic,
 	.end 		= end_edge_ioapic,
+#ifdef CONFIG_SMP
 	.set_affinity = set_ioapic_affinity,
+#endif
 };
 
 static struct hw_interrupt_type ioapic_level_type = {
@@ -1443,7 +1464,9 @@ static struct hw_interrupt_type ioapic_level_type = {
 	.disable 	= disable_level_ioapic,
 	.ack 		= mask_and_ack_level_ioapic,
 	.end 		= end_level_ioapic,
+#ifdef CONFIG_SMP
 	.set_affinity = set_ioapic_affinity,
+#endif
 };
 
 static inline void init_IO_APIC_traps(void)
@@ -1918,6 +1941,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
 	spin_lock_irqsave(&ioapic_lock, flags);
 	io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
 	io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
+	set_native_irq_info(use_pci_vector() ?  entry.vector : irq, TARGET_CPUS);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return 0;
@@ -1931,6 +1955,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
  * we need to reprogram the ioredtbls to cater for the cpus which have come online
  * so mask in all cases should simply be TARGET_CPUS
  */
+#ifdef CONFIG_SMP
 void __init setup_ioapic_dest(void)
 {
 	int pin, ioapic, irq, irq_entry;
@@ -1949,3 +1974,4 @@ void __init setup_ioapic_dest(void)
 
 	}
 }
+#endif
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 2b85aa39f954..532f73bb2224 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -91,6 +91,7 @@ static void set_msi_affinity(unsigned int vector, cpumask_t cpu_mask)
 {
 	struct msi_desc *entry;
 	struct msg_address address;
+	unsigned int irq = vector;
 
 	entry = (struct msi_desc *)msi_desc[vector];
 	if (!entry || !entry->dev)
@@ -112,6 +113,7 @@ static void set_msi_affinity(unsigned int vector, cpumask_t cpu_mask)
 		entry->msi_attrib.current_cpu = cpu_mask_to_apicid(cpu_mask);
 		pci_write_config_dword(entry->dev, msi_lower_address_reg(pos),
 			address.lo_address.value);
+		set_native_irq_info(irq, cpu_mask);
 		break;
 	}
 	case PCI_CAP_ID_MSIX:
@@ -125,22 +127,13 @@ static void set_msi_affinity(unsigned int vector, cpumask_t cpu_mask)
 			MSI_TARGET_CPU_SHIFT);
 		entry->msi_attrib.current_cpu = cpu_mask_to_apicid(cpu_mask);
 		writel(address.lo_address.value, entry->mask_base + offset);
+		set_native_irq_info(irq, cpu_mask);
 		break;
 	}
 	default:
 		break;
 	}
 }
-
-#ifdef CONFIG_IRQBALANCE
-static inline void move_msi(int vector)
-{
-	if (!cpus_empty(pending_irq_balance_cpumask[vector])) {
-		set_msi_affinity(vector, pending_irq_balance_cpumask[vector]);
-		cpus_clear(pending_irq_balance_cpumask[vector]);
-	}
-}
-#endif /* CONFIG_IRQBALANCE */
 #endif /* CONFIG_SMP */
 
 static void mask_MSI_irq(unsigned int vector)
@@ -191,13 +184,13 @@ static void shutdown_msi_irq(unsigned int vector)
 
 static void end_msi_irq_wo_maskbit(unsigned int vector)
 {
-	move_msi(vector);
+	move_native_irq(vector);
 	ack_APIC_irq();
 }
 
 static void end_msi_irq_w_maskbit(unsigned int vector)
 {
-	move_msi(vector);
+	move_native_irq(vector);
 	unmask_MSI_irq(vector);
 	ack_APIC_irq();
 }
diff --git a/drivers/pci/msi.h b/drivers/pci/msi.h
index 390f1851c0f1..402136a5c9e4 100644
--- a/drivers/pci/msi.h
+++ b/drivers/pci/msi.h
@@ -19,7 +19,6 @@
 #define NR_HP_RESERVED_VECTORS 	20
 
 extern int vector_irq[NR_VECTORS];
-extern cpumask_t pending_irq_balance_cpumask[NR_IRQS];
 extern void (*interrupt[NR_IRQS])(void);
 extern int pci_vector_resources(int last, int nr_released);
 
@@ -29,10 +28,6 @@ extern int pci_vector_resources(int last, int nr_released);
 #define set_msi_irq_affinity	NULL
 #endif
 
-#ifndef CONFIG_IRQBALANCE
-static inline void move_msi(int vector) {}
-#endif
-
 /*
  * MSI-X Address Register
  */
diff --git a/include/asm-ia64/hw_irq.h b/include/asm-ia64/hw_irq.h
index 041ab8c51a64..0cf119b42f7d 100644
--- a/include/asm-ia64/hw_irq.h
+++ b/include/asm-ia64/hw_irq.h
@@ -116,13 +116,6 @@ __ia64_local_vector_to_irq (ia64_vector vec)
  * and to obtain the irq descriptor for a given irq number.
  */
 
-/* Return a pointer to the irq descriptor for IRQ.  */
-static inline irq_desc_t *
-irq_descp (int irq)
-{
-	return irq_desc + irq;
-}
-
 /* Extract the IA-64 vector that corresponds to IRQ.  */
 static inline ia64_vector
 irq_to_vector (int irq)
diff --git a/include/asm-ia64/irq.h b/include/asm-ia64/irq.h
index bd07d11d9f37..5d930fdc0bea 100644
--- a/include/asm-ia64/irq.h
+++ b/include/asm-ia64/irq.h
@@ -30,12 +30,6 @@ extern void disable_irq_nosync (unsigned int);
 extern void enable_irq (unsigned int);
 extern void set_irq_affinity_info (unsigned int irq, int dest, int redir);
 
-#ifdef CONFIG_SMP
-extern void move_irq(int irq);
-#else
-#define move_irq(irq)
-#endif
-
 struct irqaction;
 struct pt_regs;
 int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 069d3b84d311..4a362b9ec966 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -71,16 +71,139 @@ typedef struct irq_desc {
 	unsigned int irq_count;		/* For detecting broken interrupts */
 	unsigned int irqs_unhandled;
 	spinlock_t lock;
+#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
+	unsigned int move_irq;		/* Flag need to re-target intr dest*/
+#endif
 } ____cacheline_aligned irq_desc_t;
 
 extern irq_desc_t irq_desc [NR_IRQS];
 
+/* Return a pointer to the irq descriptor for IRQ.  */
+static inline irq_desc_t *
+irq_descp (int irq)
+{
+	return irq_desc + irq;
+}
+
 #include <asm/hw_irq.h> /* the arch dependent stuff */
 
 extern int setup_irq(unsigned int irq, struct irqaction * new);
 
 #ifdef CONFIG_GENERIC_HARDIRQS
 extern cpumask_t irq_affinity[NR_IRQS];
+
+#ifdef CONFIG_SMP
+static inline void set_native_irq_info(int irq, cpumask_t mask)
+{
+	irq_affinity[irq] = mask;
+}
+#else
+static inline void set_native_irq_info(int irq, cpumask_t mask)
+{
+}
+#endif
+
+#ifdef CONFIG_SMP
+
+#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
+extern cpumask_t pending_irq_cpumask[NR_IRQS];
+
+static inline void set_pending_irq(unsigned int irq, cpumask_t mask)
+{
+	irq_desc_t *desc = irq_desc + irq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&desc->lock, flags);
+	desc->move_irq = 1;
+	pending_irq_cpumask[irq] = mask;
+	spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+static inline void
+move_native_irq(int irq)
+{
+	cpumask_t tmp;
+	irq_desc_t *desc = irq_descp(irq);
+
+	if (likely (!desc->move_irq))
+		return;
+
+	desc->move_irq = 0;
+
+	if (likely(cpus_empty(pending_irq_cpumask[irq])))
+		return;
+
+	if (!desc->handler->set_affinity)
+		return;
+
+	/* note - we hold the desc->lock */
+	cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map);
+
+	/*
+	 * If there was a valid mask to work with, please
+	 * do the disable, re-program, enable sequence.
+	 * This is *not* particularly important for level triggered
+	 * but in a edge trigger case, we might be setting rte
+	 * when an active trigger is comming in. This could
+	 * cause some ioapics to mal-function.
+	 * Being paranoid i guess!
+	 */
+	if (unlikely(!cpus_empty(tmp))) {
+		desc->handler->disable(irq);
+		desc->handler->set_affinity(irq,tmp);
+		desc->handler->enable(irq);
+	}
+	cpus_clear(pending_irq_cpumask[irq]);
+}
+
+#ifdef CONFIG_PCI_MSI
+/*
+ * Wonder why these are dummies?
+ * For e.g the set_ioapic_affinity_vector() calls the set_ioapic_affinity_irq()
+ * counter part after translating the vector to irq info. We need to perform
+ * this operation on the real irq, when we dont use vector, i.e when
+ * pci_use_vector() is false.
+ */
+static inline void move_irq(int irq)
+{
+}
+
+static inline void set_irq_info(int irq, cpumask_t mask)
+{
+}
+
+#else // CONFIG_PCI_MSI
+
+static inline void move_irq(int irq)
+{
+	move_native_irq(irq);
+}
+
+static inline void set_irq_info(int irq, cpumask_t mask)
+{
+	set_native_irq_info(irq, mask);
+}
+#endif // CONFIG_PCI_MSI
+
+#else	// CONFIG_GENERIC_PENDING_IRQ || CONFIG_IRQBALANCE
+
+#define move_irq(x)
+#define move_native_irq(x)
+#define set_pending_irq(x,y)
+static inline void set_irq_info(int irq, cpumask_t mask)
+{
+	set_native_irq_info(irq, mask);
+}
+
+#endif // CONFIG_GENERIC_PENDING_IRQ
+
+#else // CONFIG_SMP
+
+#define move_irq(x)
+#define move_native_irq(x)
+
+#endif // CONFIG_SMP
+
 extern int no_irq_affinity;
 extern int noirqdebug_setup(char *str);
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ac6700985705..1cfdb08ddf20 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -18,6 +18,10 @@
 
 cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
 
+#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
+cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
+#endif
+
 /**
  *	synchronize_irq - wait for pending IRQ handlers (on other CPUs)
  *
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 85d08daa6600..f26e534c6585 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,12 +19,22 @@ static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
  */
 static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
 
-void __attribute__((weak))
-proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
+{
+	/*
+	 * Save these away for later use. Re-progam when the
+	 * interrupt is pending
+	 */
+	set_pending_irq(irq, mask_val);
+}
+#else
+void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
 {
 	irq_affinity[irq] = mask_val;
 	irq_desc[irq].handler->set_affinity(irq, mask_val);
 }
+#endif
 
 static int irq_affinity_read_proc(char *page, char **start, off_t off,
 				  int count, int *eof, void *data)
-- 
cgit v1.2.3


From f68f447e8389de9a62e3e80c3c5823cce484c2e5 Mon Sep 17 00:00:00 2001
From: John Hawkes <hawkes@sgi.com>
Date: Tue, 6 Sep 2005 15:18:06 -0700
Subject: [PATCH] ia64 cpuset + build_sched_domains() mangles structures

I've already sent this to the maintainers, and this is now being sent to a
larger community audience.  I have fixed a problem with the ia64 version of
build_sched_domains(), but a similar fix still needs to be made to the
generic build_sched_domains() in kernel/sched.c.

The "dynamic sched domains" functionality has recently been merged into
2.6.13-rcN that sees the dynamic declaration of a cpu-exclusive (a.k.a.
"isolated") cpuset and rebuilds the CPU Scheduler sched domains and sched
groups to separate away the CPUs in this cpu-exclusive cpuset from the
remainder of the non-isolated CPUs.  This allows the non-isolated CPUs to
completely ignore the isolated CPUs when doing load-balancing.

Unfortunately, build_sched_domains() expects that a sched domain will
include all the CPUs of each node in the domain, i.e., that no node will
belong in both an isolated cpuset and a non-isolated cpuset.  Declaring a
cpuset that violates this presumption will produce flawed data structures
and will oops the kernel.

To trigger the problem (on a NUMA system with >1 CPUs per node):
   cd /dev/cpuset
   mkdir newcpuset
   cd newcpuset
   echo 0 >cpus
   echo 0 >mems
   echo 1 >cpu_exclusive

I have fixed this shortcoming for ia64 NUMA (with multiple CPUs per node).
A similar shortcoming exists in the generic build_sched_domains() (in
kernel/sched.c) for NUMA, and that needs to be fixed also.  The fix
involves dynamically allocating sched_group_nodes[] and
sched_group_allnodes[] for each invocation of build_sched_domains(), rather
than using global arrays for these structures.  Care must be taken to
remember kmalloc() addresses so that arch_destroy_sched_domains() can
properly kfree() the new dynamic structures.

Signed-off-by: John Hawkes <hawkes@sgi.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/kernel/domain.c | 90 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 69 insertions(+), 21 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/kernel/domain.c b/arch/ia64/kernel/domain.c
index bbb8efe126b7..e907109983f1 100644
--- a/arch/ia64/kernel/domain.c
+++ b/arch/ia64/kernel/domain.c
@@ -120,10 +120,10 @@ static int cpu_to_phys_group(int cpu)
  * gets dynamically allocated.
  */
 static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group *sched_group_nodes[MAX_NUMNODES];
+static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
 
 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
-static struct sched_group sched_group_allnodes[MAX_NUMNODES];
+static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
 
 static int cpu_to_allnodes_group(int cpu)
 {
@@ -138,6 +138,21 @@ static int cpu_to_allnodes_group(int cpu)
 void build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
+#ifdef CONFIG_NUMA
+	struct sched_group **sched_group_nodes = NULL;
+	struct sched_group *sched_group_allnodes = NULL;
+
+	/*
+	 * Allocate the per-node list of sched groups
+	 */
+	sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+					   GFP_ATOMIC);
+	if (!sched_group_nodes) {
+		printk(KERN_WARNING "Can not alloc sched group node list\n");
+		return;
+	}
+	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+#endif
 
 	/*
 	 * Set up domains for cpus specified by the cpu_map.
@@ -150,8 +165,21 @@ void build_sched_domains(const cpumask_t *cpu_map)
 		cpus_and(nodemask, nodemask, *cpu_map);
 
 #ifdef CONFIG_NUMA
-		if (num_online_cpus()
+		if (cpus_weight(*cpu_map)
 				> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+			if (!sched_group_allnodes) {
+				sched_group_allnodes
+					= kmalloc(sizeof(struct sched_group)
+							* MAX_NUMNODES,
+						  GFP_KERNEL);
+				if (!sched_group_allnodes) {
+					printk(KERN_WARNING
+					"Can not alloc allnodes sched group\n");
+					break;
+				}
+				sched_group_allnodes_bycpu[i]
+						= sched_group_allnodes;
+			}
 			sd = &per_cpu(allnodes_domains, i);
 			*sd = SD_ALLNODES_INIT;
 			sd->span = *cpu_map;
@@ -214,8 +242,9 @@ void build_sched_domains(const cpumask_t *cpu_map)
 	}
 
 #ifdef CONFIG_NUMA
-	init_sched_build_groups(sched_group_allnodes, *cpu_map,
-				&cpu_to_allnodes_group);
+	if (sched_group_allnodes)
+		init_sched_build_groups(sched_group_allnodes, *cpu_map,
+					&cpu_to_allnodes_group);
 
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		/* Set up node groups */
@@ -226,8 +255,10 @@ void build_sched_domains(const cpumask_t *cpu_map)
 		int j;
 
 		cpus_and(nodemask, nodemask, *cpu_map);
-		if (cpus_empty(nodemask))
+		if (cpus_empty(nodemask)) {
+			sched_group_nodes[i] = NULL;
 			continue;
+		}
 
 		domainspan = sched_domain_node_span(i);
 		cpus_and(domainspan, domainspan, *cpu_map);
@@ -372,25 +403,42 @@ void arch_destroy_sched_domains(const cpumask_t *cpu_map)
 {
 #ifdef CONFIG_NUMA
 	int i;
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		cpumask_t nodemask = node_to_cpumask(i);
-		struct sched_group *oldsg, *sg = sched_group_nodes[i];
+	int cpu;
 
-		cpus_and(nodemask, nodemask, *cpu_map);
-		if (cpus_empty(nodemask))
-			continue;
+	for_each_cpu_mask(cpu, *cpu_map) {
+		struct sched_group *sched_group_allnodes
+			= sched_group_allnodes_bycpu[cpu];
+		struct sched_group **sched_group_nodes
+			= sched_group_nodes_bycpu[cpu];
 
-		if (sg == NULL)
+		if (sched_group_allnodes) {
+			kfree(sched_group_allnodes);
+			sched_group_allnodes_bycpu[cpu] = NULL;
+		}
+
+		if (!sched_group_nodes)
 			continue;
-		sg = sg->next;
+
+		for (i = 0; i < MAX_NUMNODES; i++) {
+			cpumask_t nodemask = node_to_cpumask(i);
+			struct sched_group *oldsg, *sg = sched_group_nodes[i];
+
+			cpus_and(nodemask, nodemask, *cpu_map);
+			if (cpus_empty(nodemask))
+				continue;
+
+			if (sg == NULL)
+				continue;
+			sg = sg->next;
 next_sg:
-		oldsg = sg;
-		sg = sg->next;
-		kfree(oldsg);
-		if (oldsg != sched_group_nodes[i])
-			goto next_sg;
-		sched_group_nodes[i] = NULL;
+			oldsg = sg;
+			sg = sg->next;
+			kfree(oldsg);
+			if (oldsg != sched_group_nodes[i])
+				goto next_sg;
+		}
+		kfree(sched_group_nodes);
+		sched_group_nodes_bycpu[cpu] = NULL;
 	}
 #endif
 }
-
-- 
cgit v1.2.3


From 9c1cfda20a508b181bdda8c0045f7c0c333880a5 Mon Sep 17 00:00:00 2001
From: John Hawkes <hawkes@sgi.com>
Date: Tue, 6 Sep 2005 15:18:14 -0700
Subject: [PATCH] cpusets: Move the ia64 domain setup code to the generic code

Signed-off-by: John Hawkes <hawkes@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/kernel/Makefile    |   2 +-
 arch/ia64/kernel/domain.c    | 444 -------------------------------------------
 include/asm-ia64/processor.h |   3 -
 include/asm-ia64/topology.h  |  23 ---
 include/linux/sched.h        |   7 -
 include/linux/topology.h     |  23 +++
 kernel/sched.c               | 290 ++++++++++++++++++++++------
 7 files changed, 260 insertions(+), 532 deletions(-)
 delete mode 100644 arch/ia64/kernel/domain.c

(limited to 'arch/ia64')

diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index b242594be55b..307514f7a282 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -16,7 +16,7 @@ obj-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += acpi-ext.o
 obj-$(CONFIG_IA64_PALINFO)	+= palinfo.o
 obj-$(CONFIG_IOSAPIC)		+= iosapic.o
 obj-$(CONFIG_MODULES)		+= module.o
-obj-$(CONFIG_SMP)		+= smp.o smpboot.o domain.o
+obj-$(CONFIG_SMP)		+= smp.o smpboot.o
 obj-$(CONFIG_NUMA)		+= numa.o
 obj-$(CONFIG_PERFMON)		+= perfmon_default_smpl.o
 obj-$(CONFIG_IA64_CYCLONE)	+= cyclone.o
diff --git a/arch/ia64/kernel/domain.c b/arch/ia64/kernel/domain.c
deleted file mode 100644
index e907109983f1..000000000000
--- a/arch/ia64/kernel/domain.c
+++ /dev/null
@@ -1,444 +0,0 @@
-/*
- * arch/ia64/kernel/domain.c
- * Architecture specific sched-domains builder.
- *
- * Copyright (C) 2004 Jesse Barnes
- * Copyright (C) 2004 Silicon Graphics, Inc.
- */
-
-#include <linux/sched.h>
-#include <linux/percpu.h>
-#include <linux/slab.h>
-#include <linux/cpumask.h>
-#include <linux/init.h>
-#include <linux/topology.h>
-#include <linux/nodemask.h>
-
-#define SD_NODES_PER_DOMAIN 16
-
-#ifdef CONFIG_NUMA
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain.  Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, unsigned long *used_nodes)
-{
-	int i, n, val, min_val, best_node = 0;
-
-	min_val = INT_MAX;
-
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		/* Start at @node */
-		n = (node + i) % MAX_NUMNODES;
-
-		if (!nr_cpus_node(n))
-			continue;
-
-		/* Skip already used nodes */
-		if (test_bit(n, used_nodes))
-			continue;
-
-		/* Simple min distance search */
-		val = node_distance(node, n);
-
-		if (val < min_val) {
-			min_val = val;
-			best_node = n;
-		}
-	}
-
-	set_bit(best_node, used_nodes);
-	return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @size: number of nodes to include in this span
- *
- * Given a node, construct a good cpumask for its sched_domain to span.  It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static cpumask_t sched_domain_node_span(int node)
-{
-	int i;
-	cpumask_t span, nodemask;
-	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
-
-	cpus_clear(span);
-	bitmap_zero(used_nodes, MAX_NUMNODES);
-
-	nodemask = node_to_cpumask(node);
-	cpus_or(span, span, nodemask);
-	set_bit(node, used_nodes);
-
-	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
-		int next_node = find_next_best_node(node, used_nodes);
-		nodemask = node_to_cpumask(next_node);
-		cpus_or(span, span, nodemask);
-	}
-
-	return span;
-}
-#endif
-
-/*
- * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
- * can switch it on easily if needed.
- */
-#ifdef CONFIG_SCHED_SMT
-static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static struct sched_group sched_group_cpus[NR_CPUS];
-static int cpu_to_cpu_group(int cpu)
-{
-	return cpu;
-}
-#endif
-
-static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group sched_group_phys[NR_CPUS];
-static int cpu_to_phys_group(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
-	return first_cpu(cpu_sibling_map[cpu]);
-#else
-	return cpu;
-#endif
-}
-
-#ifdef CONFIG_NUMA
-/*
- * The init_sched_build_groups can't handle what we want to do with node
- * groups, so roll our own. Now each node has its own list of groups which
- * gets dynamically allocated.
- */
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
-
-static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
-static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
-
-static int cpu_to_allnodes_group(int cpu)
-{
-	return cpu_to_node(cpu);
-}
-#endif
-
-/*
- * Build sched domains for a given set of cpus and attach the sched domains
- * to the individual cpus
- */
-void build_sched_domains(const cpumask_t *cpu_map)
-{
-	int i;
-#ifdef CONFIG_NUMA
-	struct sched_group **sched_group_nodes = NULL;
-	struct sched_group *sched_group_allnodes = NULL;
-
-	/*
-	 * Allocate the per-node list of sched groups
-	 */
-	sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
-					   GFP_ATOMIC);
-	if (!sched_group_nodes) {
-		printk(KERN_WARNING "Can not alloc sched group node list\n");
-		return;
-	}
-	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
-#endif
-
-	/*
-	 * Set up domains for cpus specified by the cpu_map.
-	 */
-	for_each_cpu_mask(i, *cpu_map) {
-		int group;
-		struct sched_domain *sd = NULL, *p;
-		cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
-
-		cpus_and(nodemask, nodemask, *cpu_map);
-
-#ifdef CONFIG_NUMA
-		if (cpus_weight(*cpu_map)
-				> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
-			if (!sched_group_allnodes) {
-				sched_group_allnodes
-					= kmalloc(sizeof(struct sched_group)
-							* MAX_NUMNODES,
-						  GFP_KERNEL);
-				if (!sched_group_allnodes) {
-					printk(KERN_WARNING
-					"Can not alloc allnodes sched group\n");
-					break;
-				}
-				sched_group_allnodes_bycpu[i]
-						= sched_group_allnodes;
-			}
-			sd = &per_cpu(allnodes_domains, i);
-			*sd = SD_ALLNODES_INIT;
-			sd->span = *cpu_map;
-			group = cpu_to_allnodes_group(i);
-			sd->groups = &sched_group_allnodes[group];
-			p = sd;
-		} else
-			p = NULL;
-
-		sd = &per_cpu(node_domains, i);
-		*sd = SD_NODE_INIT;
-		sd->span = sched_domain_node_span(cpu_to_node(i));
-		sd->parent = p;
-		cpus_and(sd->span, sd->span, *cpu_map);
-#endif
-
-		p = sd;
-		sd = &per_cpu(phys_domains, i);
-		group = cpu_to_phys_group(i);
-		*sd = SD_CPU_INIT;
-		sd->span = nodemask;
-		sd->parent = p;
-		sd->groups = &sched_group_phys[group];
-
-#ifdef CONFIG_SCHED_SMT
-		p = sd;
-		sd = &per_cpu(cpu_domains, i);
-		group = cpu_to_cpu_group(i);
-		*sd = SD_SIBLING_INIT;
-		sd->span = cpu_sibling_map[i];
-		cpus_and(sd->span, sd->span, *cpu_map);
-		sd->parent = p;
-		sd->groups = &sched_group_cpus[group];
-#endif
-	}
-
-#ifdef CONFIG_SCHED_SMT
-	/* Set up CPU (sibling) groups */
-	for_each_cpu_mask(i, *cpu_map) {
-		cpumask_t this_sibling_map = cpu_sibling_map[i];
-		cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
-		if (i != first_cpu(this_sibling_map))
-			continue;
-
-		init_sched_build_groups(sched_group_cpus, this_sibling_map,
-						&cpu_to_cpu_group);
-	}
-#endif
-
-	/* Set up physical groups */
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		cpumask_t nodemask = node_to_cpumask(i);
-
-		cpus_and(nodemask, nodemask, *cpu_map);
-		if (cpus_empty(nodemask))
-			continue;
-
-		init_sched_build_groups(sched_group_phys, nodemask,
-						&cpu_to_phys_group);
-	}
-
-#ifdef CONFIG_NUMA
-	if (sched_group_allnodes)
-		init_sched_build_groups(sched_group_allnodes, *cpu_map,
-					&cpu_to_allnodes_group);
-
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		/* Set up node groups */
-		struct sched_group *sg, *prev;
-		cpumask_t nodemask = node_to_cpumask(i);
-		cpumask_t domainspan;
-		cpumask_t covered = CPU_MASK_NONE;
-		int j;
-
-		cpus_and(nodemask, nodemask, *cpu_map);
-		if (cpus_empty(nodemask)) {
-			sched_group_nodes[i] = NULL;
-			continue;
-		}
-
-		domainspan = sched_domain_node_span(i);
-		cpus_and(domainspan, domainspan, *cpu_map);
-
-		sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
-		sched_group_nodes[i] = sg;
-		for_each_cpu_mask(j, nodemask) {
-			struct sched_domain *sd;
-			sd = &per_cpu(node_domains, j);
-			sd->groups = sg;
-			if (sd->groups == NULL) {
-				/* Turn off balancing if we have no groups */
-				sd->flags = 0;
-			}
-		}
-		if (!sg) {
-			printk(KERN_WARNING
-			"Can not alloc domain group for node %d\n", i);
-			continue;
-		}
-		sg->cpu_power = 0;
-		sg->cpumask = nodemask;
-		cpus_or(covered, covered, nodemask);
-		prev = sg;
-
-		for (j = 0; j < MAX_NUMNODES; j++) {
-			cpumask_t tmp, notcovered;
-			int n = (i + j) % MAX_NUMNODES;
-
-			cpus_complement(notcovered, covered);
-			cpus_and(tmp, notcovered, *cpu_map);
-			cpus_and(tmp, tmp, domainspan);
-			if (cpus_empty(tmp))
-				break;
-
-			nodemask = node_to_cpumask(n);
-			cpus_and(tmp, tmp, nodemask);
-			if (cpus_empty(tmp))
-				continue;
-
-			sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
-			if (!sg) {
-				printk(KERN_WARNING
-				"Can not alloc domain group for node %d\n", j);
-				break;
-			}
-			sg->cpu_power = 0;
-			sg->cpumask = tmp;
-			cpus_or(covered, covered, tmp);
-			prev->next = sg;
-			prev = sg;
-		}
-		prev->next = sched_group_nodes[i];
-	}
-#endif
-
-	/* Calculate CPU power for physical packages and nodes */
-	for_each_cpu_mask(i, *cpu_map) {
-		int power;
-		struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
-		sd = &per_cpu(cpu_domains, i);
-		power = SCHED_LOAD_SCALE;
-		sd->groups->cpu_power = power;
-#endif
-
-		sd = &per_cpu(phys_domains, i);
-		power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-				(cpus_weight(sd->groups->cpumask)-1) / 10;
-		sd->groups->cpu_power = power;
-
-#ifdef CONFIG_NUMA
-		sd = &per_cpu(allnodes_domains, i);
-		if (sd->groups) {
-			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-				(cpus_weight(sd->groups->cpumask)-1) / 10;
-			sd->groups->cpu_power = power;
-		}
-#endif
-	}
-
-#ifdef CONFIG_NUMA
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		struct sched_group *sg = sched_group_nodes[i];
-		int j;
-
-		if (sg == NULL)
-			continue;
-next_sg:
-		for_each_cpu_mask(j, sg->cpumask) {
-			struct sched_domain *sd;
-			int power;
-
-			sd = &per_cpu(phys_domains, j);
-			if (j != first_cpu(sd->groups->cpumask)) {
-				/*
-				 * Only add "power" once for each
-				 * physical package.
-				 */
-				continue;
-			}
-			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-				(cpus_weight(sd->groups->cpumask)-1) / 10;
-
-			sg->cpu_power += power;
-		}
-		sg = sg->next;
-		if (sg != sched_group_nodes[i])
-			goto next_sg;
-	}
-#endif
-
-	/* Attach the domains */
-	for_each_cpu_mask(i, *cpu_map) {
-		struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
-		sd = &per_cpu(cpu_domains, i);
-#else
-		sd = &per_cpu(phys_domains, i);
-#endif
-		cpu_attach_domain(sd, i);
-	}
-}
-/*
- * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
- */
-void arch_init_sched_domains(const cpumask_t *cpu_map)
-{
-	cpumask_t cpu_default_map;
-
-	/*
-	 * Setup mask for cpus without special case scheduling requirements.
-	 * For now this just excludes isolated cpus, but could be used to
-	 * exclude other special cases in the future.
-	 */
-	cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
-
-	build_sched_domains(&cpu_default_map);
-}
-
-void arch_destroy_sched_domains(const cpumask_t *cpu_map)
-{
-#ifdef CONFIG_NUMA
-	int i;
-	int cpu;
-
-	for_each_cpu_mask(cpu, *cpu_map) {
-		struct sched_group *sched_group_allnodes
-			= sched_group_allnodes_bycpu[cpu];
-		struct sched_group **sched_group_nodes
-			= sched_group_nodes_bycpu[cpu];
-
-		if (sched_group_allnodes) {
-			kfree(sched_group_allnodes);
-			sched_group_allnodes_bycpu[cpu] = NULL;
-		}
-
-		if (!sched_group_nodes)
-			continue;
-
-		for (i = 0; i < MAX_NUMNODES; i++) {
-			cpumask_t nodemask = node_to_cpumask(i);
-			struct sched_group *oldsg, *sg = sched_group_nodes[i];
-
-			cpus_and(nodemask, nodemask, *cpu_map);
-			if (cpus_empty(nodemask))
-				continue;
-
-			if (sg == NULL)
-				continue;
-			sg = sg->next;
-next_sg:
-			oldsg = sg;
-			sg = sg->next;
-			kfree(oldsg);
-			if (oldsg != sched_group_nodes[i])
-				goto next_sg;
-		}
-		kfree(sched_group_nodes);
-		sched_group_nodes_bycpu[cpu] = NULL;
-	}
-#endif
-}
diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h
index 91bbd1f22461..94e07e727395 100644
--- a/include/asm-ia64/processor.h
+++ b/include/asm-ia64/processor.h
@@ -20,9 +20,6 @@
 #include <asm/ptrace.h>
 #include <asm/ustack.h>
 
-/* Our arch specific arch_init_sched_domain is in arch/ia64/kernel/domain.c */
-#define ARCH_HAS_SCHED_DOMAIN
-
 #define IA64_NUM_DBG_REGS	8
 /*
  * Limits for PMC and PMD are set to less than maximum architected values
diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h
index 399bc29729fd..a9f738bf18a7 100644
--- a/include/asm-ia64/topology.h
+++ b/include/asm-ia64/topology.h
@@ -98,29 +98,6 @@ void build_cpu_to_node_map(void);
 	.nr_balance_failed	= 0,			\
 }
 
-/* sched_domains SD_ALLNODES_INIT for IA64 NUMA machines */
-#define SD_ALLNODES_INIT (struct sched_domain) {	\
-	.span			= CPU_MASK_NONE,	\
-	.parent			= NULL,			\
-	.groups			= NULL,			\
-	.min_interval		= 64,			\
-	.max_interval		= 64*num_online_cpus(),	\
-	.busy_factor		= 128,			\
-	.imbalance_pct		= 133,			\
-	.cache_hot_time		= (10*1000000),		\
-	.cache_nice_tries	= 1,			\
-	.busy_idx		= 3,			\
-	.idle_idx		= 3,			\
-	.newidle_idx		= 0, /* unused */	\
-	.wake_idx		= 0, /* unused */	\
-	.forkexec_idx		= 0, /* unused */	\
-	.per_cpu_gain		= 100,			\
-	.flags			= SD_LOAD_BALANCE,	\
-	.last_balance		= jiffies,		\
-	.balance_interval	= 64,			\
-	.nr_balance_failed	= 0,			\
-}
-
 #endif /* CONFIG_NUMA */
 
 #include <asm-generic/topology.h>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b5a22ea80045..ea1b5f32ec5c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -564,13 +564,6 @@ struct sched_domain {
 
 extern void partition_sched_domains(cpumask_t *partition1,
 				    cpumask_t *partition2);
-#ifdef ARCH_HAS_SCHED_DOMAIN
-/* Useful helpers that arch setup code may use. Defined in kernel/sched.c */
-extern cpumask_t cpu_isolated_map;
-extern void init_sched_build_groups(struct sched_group groups[],
-	                        cpumask_t span, int (*group_fn)(int cpu));
-extern void cpu_attach_domain(struct sched_domain *sd, int cpu);
-#endif /* ARCH_HAS_SCHED_DOMAIN */
 #endif /* CONFIG_SMP */
 
 
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 0320225e96da..3df1d474e5c5 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -135,6 +135,29 @@
 }
 #endif
 
+/* sched_domains SD_ALLNODES_INIT for NUMA machines */
+#define SD_ALLNODES_INIT (struct sched_domain) {	\
+	.span			= CPU_MASK_NONE,	\
+	.parent			= NULL,			\
+	.groups			= NULL,			\
+	.min_interval		= 64,			\
+	.max_interval		= 64*num_online_cpus(),	\
+	.busy_factor		= 128,			\
+	.imbalance_pct		= 133,			\
+	.cache_hot_time		= (10*1000000),		\
+	.cache_nice_tries	= 1,			\
+	.busy_idx		= 3,			\
+	.idle_idx		= 3,			\
+	.newidle_idx		= 0, /* unused */	\
+	.wake_idx		= 0, /* unused */	\
+	.forkexec_idx		= 0, /* unused */	\
+	.per_cpu_gain		= 100,			\
+	.flags			= SD_LOAD_BALANCE,	\
+	.last_balance		= jiffies,		\
+	.balance_interval	= 64,			\
+	.nr_balance_failed	= 0,			\
+}
+
 #ifdef CONFIG_NUMA
 #ifndef SD_NODE_INIT
 #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
diff --git a/kernel/sched.c b/kernel/sched.c
index 5f889d0cbfcc..50860ad5b624 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4779,7 +4779,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
  * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
  * hold the hotplug lock.
  */
-void cpu_attach_domain(struct sched_domain *sd, int cpu)
+static void cpu_attach_domain(struct sched_domain *sd, int cpu)
 {
 	runqueue_t *rq = cpu_rq(cpu);
 	struct sched_domain *tmp;
@@ -4802,7 +4802,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu)
 }
 
 /* cpus with isolated domains */
-cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
 
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
@@ -4830,8 +4830,8 @@ __setup ("isolcpus=", isolated_cpu_setup);
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
  */
-void init_sched_build_groups(struct sched_group groups[],
-			cpumask_t span, int (*group_fn)(int cpu))
+static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
+				    int (*group_fn)(int cpu))
 {
 	struct sched_group *first = NULL, *last = NULL;
 	cpumask_t covered = CPU_MASK_NONE;
@@ -4864,12 +4864,85 @@ void init_sched_build_groups(struct sched_group groups[],
 	last->next = first;
 }
 
+#define SD_NODES_PER_DOMAIN 16
 
-#ifdef ARCH_HAS_SCHED_DOMAIN
-extern void build_sched_domains(const cpumask_t *cpu_map);
-extern void arch_init_sched_domains(const cpumask_t *cpu_map);
-extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
-#else
+#ifdef CONFIG_NUMA
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain.  Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int find_next_best_node(int node, unsigned long *used_nodes)
+{
+	int i, n, val, min_val, best_node = 0;
+
+	min_val = INT_MAX;
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		/* Start at @node */
+		n = (node + i) % MAX_NUMNODES;
+
+		if (!nr_cpus_node(n))
+			continue;
+
+		/* Skip already used nodes */
+		if (test_bit(n, used_nodes))
+			continue;
+
+		/* Simple min distance search */
+		val = node_distance(node, n);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	set_bit(best_node, used_nodes);
+	return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @size: number of nodes to include in this span
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span.  It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+static cpumask_t sched_domain_node_span(int node)
+{
+	int i;
+	cpumask_t span, nodemask;
+	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+
+	cpus_clear(span);
+	bitmap_zero(used_nodes, MAX_NUMNODES);
+
+	nodemask = node_to_cpumask(node);
+	cpus_or(span, span, nodemask);
+	set_bit(node, used_nodes);
+
+	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
+		int next_node = find_next_best_node(node, used_nodes);
+		nodemask = node_to_cpumask(next_node);
+		cpus_or(span, span, nodemask);
+	}
+
+	return span;
+}
+#endif
+
+/*
+ * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
+ * can switch it on easily if needed.
+ */
 #ifdef CONFIG_SCHED_SMT
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
 static struct sched_group sched_group_cpus[NR_CPUS];
@@ -4891,36 +4964,20 @@ static int cpu_to_phys_group(int cpu)
 }
 
 #ifdef CONFIG_NUMA
-
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int cpu_to_node_group(int cpu)
-{
-	return cpu_to_node(cpu);
-}
-#endif
-
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
 /*
- * The domains setup code relies on siblings not spanning
- * multiple nodes. Make sure the architecture has a proper
- * siblings map:
+ * The init_sched_build_groups can't handle what we want to do with node
+ * groups, so roll our own. Now each node has its own list of groups which
+ * gets dynamically allocated.
  */
-static void check_sibling_maps(void)
-{
-	int i, j;
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+static struct sched_group *sched_group_nodes[MAX_NUMNODES];
 
-	for_each_online_cpu(i) {
-		for_each_cpu_mask(j, cpu_sibling_map[i]) {
-			if (cpu_to_node(i) != cpu_to_node(j)) {
-				printk(KERN_INFO "warning: CPU %d siblings map "
-					"to different node - isolating "
-					"them.\n", i);
-				cpu_sibling_map[i] = cpumask_of_cpu(i);
-				break;
-			}
-		}
-	}
+static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+static struct sched_group sched_group_allnodes[MAX_NUMNODES];
+
+static int cpu_to_allnodes_group(int cpu)
+{
+	return cpu_to_node(cpu);
 }
 #endif
 
@@ -4928,7 +4985,7 @@ static void check_sibling_maps(void)
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  */
-static void build_sched_domains(const cpumask_t *cpu_map)
+void build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
 
@@ -4943,11 +5000,22 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 		cpus_and(nodemask, nodemask, *cpu_map);
 
 #ifdef CONFIG_NUMA
+		if (num_online_cpus()
+				> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+			sd = &per_cpu(allnodes_domains, i);
+			*sd = SD_ALLNODES_INIT;
+			sd->span = *cpu_map;
+			group = cpu_to_allnodes_group(i);
+			sd->groups = &sched_group_allnodes[group];
+			p = sd;
+		} else
+			p = NULL;
+
 		sd = &per_cpu(node_domains, i);
-		group = cpu_to_node_group(i);
 		*sd = SD_NODE_INIT;
-		sd->span = *cpu_map;
-		sd->groups = &sched_group_nodes[group];
+		sd->span = sched_domain_node_span(cpu_to_node(i));
+		sd->parent = p;
+		cpus_and(sd->span, sd->span, *cpu_map);
 #endif
 
 		p = sd;
@@ -4972,7 +5040,7 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 
 #ifdef CONFIG_SCHED_SMT
 	/* Set up CPU (sibling) groups */
-	for_each_online_cpu(i) {
+	for_each_cpu_mask(i, *cpu_map) {
 		cpumask_t this_sibling_map = cpu_sibling_map[i];
 		cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
 		if (i != first_cpu(this_sibling_map))
@@ -4997,8 +5065,74 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 
 #ifdef CONFIG_NUMA
 	/* Set up node groups */
-	init_sched_build_groups(sched_group_nodes, *cpu_map,
-					&cpu_to_node_group);
+	init_sched_build_groups(sched_group_allnodes, *cpu_map,
+				&cpu_to_allnodes_group);
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		/* Set up node groups */
+		struct sched_group *sg, *prev;
+		cpumask_t nodemask = node_to_cpumask(i);
+		cpumask_t domainspan;
+		cpumask_t covered = CPU_MASK_NONE;
+		int j;
+
+		cpus_and(nodemask, nodemask, *cpu_map);
+		if (cpus_empty(nodemask))
+			continue;
+
+		domainspan = sched_domain_node_span(i);
+		cpus_and(domainspan, domainspan, *cpu_map);
+
+		sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+		sched_group_nodes[i] = sg;
+		for_each_cpu_mask(j, nodemask) {
+			struct sched_domain *sd;
+			sd = &per_cpu(node_domains, j);
+			sd->groups = sg;
+			if (sd->groups == NULL) {
+				/* Turn off balancing if we have no groups */
+				sd->flags = 0;
+			}
+		}
+		if (!sg) {
+			printk(KERN_WARNING
+			"Can not alloc domain group for node %d\n", i);
+			continue;
+		}
+		sg->cpu_power = 0;
+		sg->cpumask = nodemask;
+		cpus_or(covered, covered, nodemask);
+		prev = sg;
+
+		for (j = 0; j < MAX_NUMNODES; j++) {
+			cpumask_t tmp, notcovered;
+			int n = (i + j) % MAX_NUMNODES;
+
+			cpus_complement(notcovered, covered);
+			cpus_and(tmp, notcovered, *cpu_map);
+			cpus_and(tmp, tmp, domainspan);
+			if (cpus_empty(tmp))
+				break;
+
+			nodemask = node_to_cpumask(n);
+			cpus_and(tmp, tmp, nodemask);
+			if (cpus_empty(tmp))
+				continue;
+
+			sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+			if (!sg) {
+				printk(KERN_WARNING
+				"Can not alloc domain group for node %d\n", j);
+				break;
+			}
+			sg->cpu_power = 0;
+			sg->cpumask = tmp;
+			cpus_or(covered, covered, tmp);
+			prev->next = sg;
+			prev = sg;
+		}
+		prev->next = sched_group_nodes[i];
+	}
 #endif
 
 	/* Calculate CPU power for physical packages and nodes */
@@ -5017,14 +5151,46 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 		sd->groups->cpu_power = power;
 
 #ifdef CONFIG_NUMA
-		if (i == first_cpu(sd->groups->cpumask)) {
-			/* Only add "power" once for each physical package. */
-			sd = &per_cpu(node_domains, i);
-			sd->groups->cpu_power += power;
+		sd = &per_cpu(allnodes_domains, i);
+		if (sd->groups) {
+			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+				(cpus_weight(sd->groups->cpumask)-1) / 10;
+			sd->groups->cpu_power = power;
 		}
 #endif
 	}
 
+#ifdef CONFIG_NUMA
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		struct sched_group *sg = sched_group_nodes[i];
+		int j;
+
+		if (sg == NULL)
+			continue;
+next_sg:
+		for_each_cpu_mask(j, sg->cpumask) {
+			struct sched_domain *sd;
+			int power;
+
+			sd = &per_cpu(phys_domains, j);
+			if (j != first_cpu(sd->groups->cpumask)) {
+				/*
+				 * Only add "power" once for each
+				 * physical package.
+				 */
+				continue;
+			}
+			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+				(cpus_weight(sd->groups->cpumask)-1) / 10;
+
+			sg->cpu_power += power;
+		}
+		sg = sg->next;
+		if (sg != sched_group_nodes[i])
+			goto next_sg;
+	}
+#endif
+
 	/* Attach the domains */
 	for_each_cpu_mask(i, *cpu_map) {
 		struct sched_domain *sd;
@@ -5039,13 +5205,10 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 /*
  * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
  */
-static void arch_init_sched_domains(cpumask_t *cpu_map)
+static void arch_init_sched_domains(const cpumask_t *cpu_map)
 {
 	cpumask_t cpu_default_map;
 
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
-	check_sibling_maps();
-#endif
 	/*
 	 * Setup mask for cpus without special case scheduling requirements.
 	 * For now this just excludes isolated cpus, but could be used to
@@ -5058,10 +5221,29 @@ static void arch_init_sched_domains(cpumask_t *cpu_map)
 
 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
 {
-	/* Do nothing: everything is statically allocated. */
-}
+#ifdef CONFIG_NUMA
+	int i;
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		cpumask_t nodemask = node_to_cpumask(i);
+		struct sched_group *oldsg, *sg = sched_group_nodes[i];
 
-#endif /* ARCH_HAS_SCHED_DOMAIN */
+		cpus_and(nodemask, nodemask, *cpu_map);
+		if (cpus_empty(nodemask))
+			continue;
+
+		if (sg == NULL)
+			continue;
+		sg = sg->next;
+next_sg:
+		oldsg = sg;
+		sg = sg->next;
+		kfree(oldsg);
+		if (oldsg != sched_group_nodes[i])
+			goto next_sg;
+		sched_group_nodes[i] = NULL;
+	}
+#endif
+}
 
 /*
  * Detach sched domains from a group of cpus specified in cpu_map
-- 
cgit v1.2.3


From e922efc342d565a38eed3af377ff403f52148864 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Tue, 6 Sep 2005 15:18:25 -0700
Subject: [PATCH] remove duplicated sys_open32() code from 64bit archs

64 bit architectures all implement their own compatibility sys_open(),
when in fact the difference is simply not forcing the O_LARGEFILE
flag.  So use the a common function instead.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Cc: <viro@parcelfarce.linux.theplanet.co.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/ia32/ia32_entry.S       |  2 +-
 arch/ia64/ia32/sys_ia32.c         | 31 -------------------------------
 arch/ppc64/kernel/misc.S          |  2 +-
 arch/ppc64/kernel/sys_ppc32.c     | 31 -------------------------------
 arch/sparc64/kernel/sys_sparc32.c | 24 +-----------------------
 arch/x86_64/ia32/ia32entry.S      |  2 +-
 arch/x86_64/ia32/sys_ia32.c       | 26 --------------------------
 fs/compat.c                       | 10 ++++++++++
 fs/open.c                         | 19 +++++++++++--------
 include/linux/fs.h                |  1 +
 10 files changed, 26 insertions(+), 122 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/ia32/ia32_entry.S b/arch/ia64/ia32/ia32_entry.S
index 829a6d80711c..0708edb06cc4 100644
--- a/arch/ia64/ia32/ia32_entry.S
+++ b/arch/ia64/ia32/ia32_entry.S
@@ -215,7 +215,7 @@ ia32_syscall_table:
 	data8 sys32_fork
 	data8 sys_read
 	data8 sys_write
-	data8 sys32_open	  /* 5 */
+	data8 compat_sys_open	  /* 5 */
 	data8 sys_close
 	data8 sys32_waitpid
 	data8 sys_creat
diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c
index c1e20d65dd6c..e29a8a55486a 100644
--- a/arch/ia64/ia32/sys_ia32.c
+++ b/arch/ia64/ia32/sys_ia32.c
@@ -2359,37 +2359,6 @@ sys32_brk (unsigned int brk)
 	return ret;
 }
 
-/*
- * Exactly like fs/open.c:sys_open(), except that it doesn't set the O_LARGEFILE flag.
- */
-asmlinkage long
-sys32_open (const char __user * filename, int flags, int mode)
-{
-	char * tmp;
-	int fd, error;
-
-	tmp = getname(filename);
-	fd = PTR_ERR(tmp);
-	if (!IS_ERR(tmp)) {
-		fd = get_unused_fd();
-		if (fd >= 0) {
-			struct file *f = filp_open(tmp, flags, mode);
-			error = PTR_ERR(f);
-			if (IS_ERR(f))
-				goto out_error;
-			fd_install(fd, f);
-		}
-out:
-		putname(tmp);
-	}
-	return fd;
-
-out_error:
-	put_unused_fd(fd);
-	fd = error;
-	goto out;
-}
-
 /* Structure for ia32 emulation on ia64 */
 struct epoll_event32
 {
diff --git a/arch/ppc64/kernel/misc.S b/arch/ppc64/kernel/misc.S
index 474df0a862bf..2164bd7b4ef6 100644
--- a/arch/ppc64/kernel/misc.S
+++ b/arch/ppc64/kernel/misc.S
@@ -957,7 +957,7 @@ _GLOBAL(sys_call_table32)
 	.llong .ppc_fork
 	.llong .sys_read
 	.llong .sys_write
-	.llong .sys32_open		/* 5 */
+	.llong .compat_sys_open		/* 5 */
 	.llong .sys_close
 	.llong .sys32_waitpid
 	.llong .sys32_creat
diff --git a/arch/ppc64/kernel/sys_ppc32.c b/arch/ppc64/kernel/sys_ppc32.c
index 206619080e66..214914a95a50 100644
--- a/arch/ppc64/kernel/sys_ppc32.c
+++ b/arch/ppc64/kernel/sys_ppc32.c
@@ -867,37 +867,6 @@ off_t ppc32_lseek(unsigned int fd, u32 offset, unsigned int origin)
 	return sys_lseek(fd, (int)offset, origin);
 }
 
-/*
- * This is just a version for 32-bit applications which does
- * not force O_LARGEFILE on.
- */
-asmlinkage long sys32_open(const char __user * filename, int flags, int mode)
-{
-	char * tmp;
-	int fd, error;
-
-	tmp = getname(filename);
-	fd = PTR_ERR(tmp);
-	if (!IS_ERR(tmp)) {
-		fd = get_unused_fd();
-		if (fd >= 0) {
-			struct file * f = filp_open(tmp, flags, mode);
-			error = PTR_ERR(f);
-			if (IS_ERR(f))
-				goto out_error;
-			fd_install(fd, f);
-		}
-out:
-		putname(tmp);
-	}
-	return fd;
-
-out_error:
-	put_unused_fd(fd);
-	fd = error;
-	goto out;
-}
-
 /* Note: it is necessary to treat bufsiz as an unsigned int,
  * with the corresponding cast to a signed int to insure that the 
  * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c
index 1d3aa588df8a..7f6239ed2521 100644
--- a/arch/sparc64/kernel/sys_sparc32.c
+++ b/arch/sparc64/kernel/sys_sparc32.c
@@ -1002,29 +1002,7 @@ asmlinkage long sys32_adjtimex(struct timex32 __user *utp)
 asmlinkage long sparc32_open(const char __user *filename,
 			     int flags, int mode)
 {
-	char * tmp;
-	int fd, error;
-
-	tmp = getname(filename);
-	fd = PTR_ERR(tmp);
-	if (!IS_ERR(tmp)) {
-		fd = get_unused_fd();
-		if (fd >= 0) {
-			struct file * f = filp_open(tmp, flags, mode);
-			error = PTR_ERR(f);
-			if (IS_ERR(f))
-				goto out_error;
-			fd_install(fd, f);
-		}
-out:
-		putname(tmp);
-	}
-	return fd;
-
-out_error:
-	put_unused_fd(fd);
-	fd = error;
-	goto out;
+	return do_sys_open(filename, flags, mode);
 }
 
 extern unsigned long do_mremap(unsigned long addr,
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index c45d6a05b984..f174083d5567 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -307,7 +307,7 @@ ia32_sys_call_table:
 	.quad stub32_fork
 	.quad sys_read
 	.quad sys_write
-	.quad sys32_open		/* 5 */
+	.quad compat_sys_open		/* 5 */
 	.quad sys_close
 	.quad sys32_waitpid
 	.quad sys_creat
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
index be996d1b691e..04d80406ce4f 100644
--- a/arch/x86_64/ia32/sys_ia32.c
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -969,32 +969,6 @@ long sys32_kill(int pid, int sig)
 	return sys_kill(pid, sig);
 }
  
-asmlinkage long sys32_open(const char __user * filename, int flags, int mode)
-{
-	char * tmp;
-	int fd, error;
-
-	/* don't force O_LARGEFILE */
-	tmp = getname(filename);
-	fd = PTR_ERR(tmp);
-	if (!IS_ERR(tmp)) {
-		fd = get_unused_fd();
-		if (fd >= 0) {
-			struct file *f = filp_open(tmp, flags, mode);
-			error = PTR_ERR(f);
-			if (IS_ERR(f)) {
-				put_unused_fd(fd); 
-				fd = error;
-			} else {
-				fsnotify_open(f->f_dentry);
-				fd_install(fd, f);
-			}
-		}
-		putname(tmp);
-	}
-	return fd;
-}
-
 extern asmlinkage long
 sys_timer_create(clockid_t which_clock,
 		 struct sigevent __user *timer_event_spec,
diff --git a/fs/compat.c b/fs/compat.c
index 2eb03c49b07c..8c665705c6a0 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1274,6 +1274,16 @@ out:
 	return ret;
 }
 
+/*
+ * Exactly like fs/open.c:sys_open(), except that it doesn't set the
+ * O_LARGEFILE flag.
+ */
+asmlinkage long
+compat_sys_open(const char __user *filename, int flags, int mode)
+{
+	return do_sys_open(filename, flags, mode);
+}
+
 /*
  * compat_count() counts the number of arguments/envelopes. It is basically
  * a copy of count() from fs/exec.c, except that it works with 32 bit argv
diff --git a/fs/open.c b/fs/open.c
index 32bf05e2996d..4ee2dcc31c28 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -933,16 +933,11 @@ void fastcall fd_install(unsigned int fd, struct file * file)
 
 EXPORT_SYMBOL(fd_install);
 
-asmlinkage long sys_open(const char __user * filename, int flags, int mode)
+long do_sys_open(const char __user *filename, int flags, int mode)
 {
-	char * tmp;
-	int fd;
+	char *tmp = getname(filename);
+	int fd = PTR_ERR(tmp);
 
-	if (force_o_largefile())
-		flags |= O_LARGEFILE;
-
-	tmp = getname(filename);
-	fd = PTR_ERR(tmp);
 	if (!IS_ERR(tmp)) {
 		fd = get_unused_fd();
 		if (fd >= 0) {
@@ -959,6 +954,14 @@ asmlinkage long sys_open(const char __user * filename, int flags, int mode)
 	}
 	return fd;
 }
+
+asmlinkage long sys_open(const char __user *filename, int flags, int mode)
+{
+	if (force_o_largefile())
+		flags |= O_LARGEFILE;
+
+	return do_sys_open(filename, flags, mode);
+}
 EXPORT_SYMBOL_GPL(sys_open);
 
 #ifndef __alpha__
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 57e294bf83f4..7e1b589842af 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1280,6 +1280,7 @@ static inline int break_lease(struct inode *inode, unsigned int mode)
 /* fs/open.c */
 
 extern int do_truncate(struct dentry *, loff_t start);
+extern long do_sys_open(const char __user *filename, int flags, int mode);
 extern struct file *filp_open(const char *, int, int);
 extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
 extern int filp_close(struct file *, fl_owner_t id);
-- 
cgit v1.2.3


From f96cb1f0580324b95b7219466312a376a59a796f Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Tue, 6 Sep 2005 15:18:31 -0700
Subject: [PATCH] IA64: convert kcalloc to kzalloc

This patch converts kcalloc(1, ...) calls to use the new kzalloc() function.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/sn/kernel/io_init.c     | 2 +-
 arch/ia64/sn/kernel/tiocx.c       | 2 +-
 arch/ia64/sn/pci/tioca_provider.c | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/sn/kernel/io_init.c b/arch/ia64/sn/kernel/io_init.c
index 4564ed0b5ff3..906622d9f933 100644
--- a/arch/ia64/sn/kernel/io_init.c
+++ b/arch/ia64/sn/kernel/io_init.c
@@ -431,7 +431,7 @@ void sn_bus_store_sysdata(struct pci_dev *dev)
 {
 	struct sysdata_el *element;
 
-	element = kcalloc(1, sizeof(struct sysdata_el), GFP_KERNEL);
+	element = kzalloc(sizeof(struct sysdata_el), GFP_KERNEL);
 	if (!element) {
 		dev_dbg(dev, "%s: out of memory!\n", __FUNCTION__);
 		return;
diff --git a/arch/ia64/sn/kernel/tiocx.c b/arch/ia64/sn/kernel/tiocx.c
index 254fe15c064b..b45db5133f55 100644
--- a/arch/ia64/sn/kernel/tiocx.c
+++ b/arch/ia64/sn/kernel/tiocx.c
@@ -191,7 +191,7 @@ cx_device_register(nasid_t nasid, int part_num, int mfg_num,
 {
 	struct cx_dev *cx_dev;
 
-	cx_dev = kcalloc(1, sizeof(struct cx_dev), GFP_KERNEL);
+	cx_dev = kzalloc(sizeof(struct cx_dev), GFP_KERNEL);
 	DBG("cx_dev= 0x%p\n", cx_dev);
 	if (cx_dev == NULL)
 		return -ENOMEM;
diff --git a/arch/ia64/sn/pci/tioca_provider.c b/arch/ia64/sn/pci/tioca_provider.c
index ea09c12f0258..19bced34d5f1 100644
--- a/arch/ia64/sn/pci/tioca_provider.c
+++ b/arch/ia64/sn/pci/tioca_provider.c
@@ -148,7 +148,7 @@ tioca_gart_init(struct tioca_kernel *tioca_kern)
 	tioca_kern->ca_pcigart_entries =
 	    tioca_kern->ca_pciap_size / tioca_kern->ca_ap_pagesize;
 	tioca_kern->ca_pcigart_pagemap =
-	    kcalloc(1, tioca_kern->ca_pcigart_entries / 8, GFP_KERNEL);
+	    kzalloc(tioca_kern->ca_pcigart_entries / 8, GFP_KERNEL);
 	if (!tioca_kern->ca_pcigart_pagemap) {
 		free_pages((unsigned long)tioca_kern->ca_gart,
 			   get_order(tioca_kern->ca_gart_size));
@@ -392,7 +392,7 @@ tioca_dma_mapped(struct pci_dev *pdev, uint64_t paddr, size_t req_size)
 	 * allocate a map struct
 	 */
 
-	ca_dmamap = kcalloc(1, sizeof(struct tioca_dmamap), GFP_ATOMIC);
+	ca_dmamap = kzalloc(sizeof(struct tioca_dmamap), GFP_ATOMIC);
 	if (!ca_dmamap)
 		goto map_return;
 
@@ -600,7 +600,7 @@ tioca_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *cont
 	 * Allocate kernel bus soft and copy from prom.
 	 */
 
-	tioca_common = kcalloc(1, sizeof(struct tioca_common), GFP_KERNEL);
+	tioca_common = kzalloc(sizeof(struct tioca_common), GFP_KERNEL);
 	if (!tioca_common)
 		return NULL;
 
@@ -609,7 +609,7 @@ tioca_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *cont
 
 	/* init kernel-private area */
 
-	tioca_kern = kcalloc(1, sizeof(struct tioca_kernel), GFP_KERNEL);
+	tioca_kern = kzalloc(sizeof(struct tioca_kernel), GFP_KERNEL);
 	if (!tioca_kern) {
 		kfree(tioca_common);
 		return NULL;
-- 
cgit v1.2.3


From 1f7ad57b75ab0fba27455c7344a6ab7aa6bd90c5 Mon Sep 17 00:00:00 2001
From: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Date: Tue, 6 Sep 2005 15:19:30 -0700
Subject: [PATCH] Kprobes: prevent possible race conditions ia64 changes

This patch contains the ia64 architecture specific changes to prevent the
possible race conditions.

Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/kernel/jprobes.S     |  1 +
 arch/ia64/kernel/kprobes.c     | 57 ++++++++++++++++++++++++------------------
 arch/ia64/kernel/traps.c       |  5 ++--
 arch/ia64/kernel/vmlinux.lds.S |  1 +
 arch/ia64/lib/flush.S          |  1 +
 arch/ia64/mm/fault.c           |  3 ++-
 6 files changed, 41 insertions(+), 27 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/kernel/jprobes.S b/arch/ia64/kernel/jprobes.S
index b7fa3ccd2b0f..2323377e3695 100644
--- a/arch/ia64/kernel/jprobes.S
+++ b/arch/ia64/kernel/jprobes.S
@@ -49,6 +49,7 @@
 	/*
 	 * void jprobe_break(void)
 	 */
+	.section .kprobes.text, "ax"
 ENTRY(jprobe_break)
 	break.m 0x80300
 END(jprobe_break)
diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c
index 884f5cd27d8a..82a41ac29386 100644
--- a/arch/ia64/kernel/kprobes.c
+++ b/arch/ia64/kernel/kprobes.c
@@ -87,8 +87,10 @@ static enum instruction_type bundle_encoding[32][3] = {
  * is IP relative instruction and update the kprobe
  * inst flag accordingly
  */
-static void update_kprobe_inst_flag(uint template, uint  slot, uint major_opcode,
-	unsigned long kprobe_inst, struct kprobe *p)
+static void __kprobes update_kprobe_inst_flag(uint template, uint  slot,
+					      uint major_opcode,
+					      unsigned long kprobe_inst,
+					      struct kprobe *p)
 {
 	p->ainsn.inst_flag = 0;
 	p->ainsn.target_br_reg = 0;
@@ -126,8 +128,10 @@ static void update_kprobe_inst_flag(uint template, uint  slot, uint major_opcode
  * Returns 0 if supported
  * Returns -EINVAL if unsupported
  */
-static int unsupported_inst(uint template, uint  slot, uint major_opcode,
-	unsigned long kprobe_inst, struct kprobe *p)
+static int __kprobes unsupported_inst(uint template, uint  slot,
+				      uint major_opcode,
+				      unsigned long kprobe_inst,
+				      struct kprobe *p)
 {
 	unsigned long addr = (unsigned long)p->addr;
 
@@ -168,8 +172,9 @@ static int unsupported_inst(uint template, uint  slot, uint major_opcode,
  * on which we are inserting kprobe is cmp instruction
  * with ctype as unc.
  */
-static uint is_cmp_ctype_unc_inst(uint template, uint slot, uint major_opcode,
-unsigned long kprobe_inst)
+static uint __kprobes is_cmp_ctype_unc_inst(uint template, uint slot,
+					    uint major_opcode,
+					    unsigned long kprobe_inst)
 {
 	cmp_inst_t cmp_inst;
 	uint ctype_unc = 0;
@@ -201,8 +206,10 @@ out:
  * In this function we override the bundle with
  * the break instruction at the given slot.
  */
-static void prepare_break_inst(uint template, uint  slot, uint major_opcode,
-	unsigned long kprobe_inst, struct kprobe *p)
+static void __kprobes prepare_break_inst(uint template, uint  slot,
+					 uint major_opcode,
+					 unsigned long kprobe_inst,
+					 struct kprobe *p)
 {
 	unsigned long break_inst = BREAK_INST;
 	bundle_t *bundle = &p->ainsn.insn.bundle;
@@ -271,7 +278,8 @@ static inline int in_ivt_functions(unsigned long addr)
 		&& addr < (unsigned long)__end_ivt_text);
 }
 
-static int valid_kprobe_addr(int template, int slot, unsigned long addr)
+static int __kprobes valid_kprobe_addr(int template, int slot,
+				       unsigned long addr)
 {
 	if ((slot > 2) || ((bundle_encoding[template][1] == L) && slot > 1)) {
 		printk(KERN_WARNING "Attempting to insert unaligned kprobe "
@@ -323,7 +331,7 @@ static void kretprobe_trampoline(void)
  *    - cleanup by marking the instance as unused
  *    - long jump back to the original return address
  */
-int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
+int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct kretprobe_instance *ri = NULL;
 	struct hlist_head *head;
@@ -381,7 +389,8 @@ int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
         return 1;
 }
 
-void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs)
+void __kprobes arch_prepare_kretprobe(struct kretprobe *rp,
+				      struct pt_regs *regs)
 {
 	struct kretprobe_instance *ri;
 
@@ -399,7 +408,7 @@ void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs)
 	}
 }
 
-int arch_prepare_kprobe(struct kprobe *p)
+int __kprobes arch_prepare_kprobe(struct kprobe *p)
 {
 	unsigned long addr = (unsigned long) p->addr;
 	unsigned long *kprobe_addr = (unsigned long *)(addr & ~0xFULL);
@@ -430,7 +439,7 @@ int arch_prepare_kprobe(struct kprobe *p)
 	return 0;
 }
 
-void arch_arm_kprobe(struct kprobe *p)
+void __kprobes arch_arm_kprobe(struct kprobe *p)
 {
 	unsigned long addr = (unsigned long)p->addr;
 	unsigned long arm_addr = addr & ~0xFULL;
@@ -439,7 +448,7 @@ void arch_arm_kprobe(struct kprobe *p)
 	flush_icache_range(arm_addr, arm_addr + sizeof(bundle_t));
 }
 
-void arch_disarm_kprobe(struct kprobe *p)
+void __kprobes arch_disarm_kprobe(struct kprobe *p)
 {
 	unsigned long addr = (unsigned long)p->addr;
 	unsigned long arm_addr = addr & ~0xFULL;
@@ -449,7 +458,7 @@ void arch_disarm_kprobe(struct kprobe *p)
 	flush_icache_range(arm_addr, arm_addr + sizeof(bundle_t));
 }
 
-void arch_remove_kprobe(struct kprobe *p)
+void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
 }
 
@@ -461,7 +470,7 @@ void arch_remove_kprobe(struct kprobe *p)
  * to original stack address, handle the case where we need to fixup the
  * relative IP address and/or fixup branch register.
  */
-static void resume_execution(struct kprobe *p, struct pt_regs *regs)
+static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs)
 {
   	unsigned long bundle_addr = ((unsigned long) (&p->opcode.bundle)) & ~0xFULL;
   	unsigned long resume_addr = (unsigned long)p->addr & ~0xFULL;
@@ -528,7 +537,7 @@ turn_ss_off:
   	ia64_psr(regs)->ss = 0;
 }
 
-static void prepare_ss(struct kprobe *p, struct pt_regs *regs)
+static void __kprobes prepare_ss(struct kprobe *p, struct pt_regs *regs)
 {
 	unsigned long bundle_addr = (unsigned long) &p->opcode.bundle;
 	unsigned long slot = (unsigned long)p->addr & 0xf;
@@ -545,7 +554,7 @@ static void prepare_ss(struct kprobe *p, struct pt_regs *regs)
 	ia64_psr(regs)->ss = 1;
 }
 
-static int pre_kprobes_handler(struct die_args *args)
+static int __kprobes pre_kprobes_handler(struct die_args *args)
 {
 	struct kprobe *p;
 	int ret = 0;
@@ -616,7 +625,7 @@ no_kprobe:
 	return ret;
 }
 
-static int post_kprobes_handler(struct pt_regs *regs)
+static int __kprobes post_kprobes_handler(struct pt_regs *regs)
 {
 	if (!kprobe_running())
 		return 0;
@@ -641,7 +650,7 @@ out:
 	return 1;
 }
 
-static int kprobes_fault_handler(struct pt_regs *regs, int trapnr)
+static int __kprobes kprobes_fault_handler(struct pt_regs *regs, int trapnr)
 {
 	if (!kprobe_running())
 		return 0;
@@ -659,8 +668,8 @@ static int kprobes_fault_handler(struct pt_regs *regs, int trapnr)
 	return 0;
 }
 
-int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
-			     void *data)
+int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
+				       unsigned long val, void *data)
 {
 	struct die_args *args = (struct die_args *)data;
 	switch(val) {
@@ -681,7 +690,7 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
 	return NOTIFY_DONE;
 }
 
-int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct jprobe *jp = container_of(p, struct jprobe, kp);
 	unsigned long addr = ((struct fnptr *)(jp->entry))->ip;
@@ -703,7 +712,7 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 	return 1;
 }
 
-int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	*regs = jprobe_saved_regs;
 	return 1;
diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c
index 4440c8343fa4..f970359e7edf 100644
--- a/arch/ia64/kernel/traps.c
+++ b/arch/ia64/kernel/traps.c
@@ -15,6 +15,7 @@
 #include <linux/vt_kern.h>		/* For unblank_screen() */
 #include <linux/module.h>       /* for EXPORT_SYMBOL */
 #include <linux/hardirq.h>
+#include <linux/kprobes.h>
 
 #include <asm/fpswa.h>
 #include <asm/ia32.h>
@@ -122,7 +123,7 @@ die_if_kernel (char *str, struct pt_regs *regs, long err)
 }
 
 void
-ia64_bad_break (unsigned long break_num, struct pt_regs *regs)
+__kprobes ia64_bad_break (unsigned long break_num, struct pt_regs *regs)
 {
 	siginfo_t siginfo;
 	int sig, code;
@@ -444,7 +445,7 @@ ia64_illegal_op_fault (unsigned long ec, long arg1, long arg2, long arg3,
 	return rv;
 }
 
-void
+void __kprobes
 ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 	    unsigned long iim, unsigned long itir, long arg5, long arg6,
 	    long arg7, struct pt_regs regs)
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index a676e79e0681..30d8564e9603 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -48,6 +48,7 @@ SECTIONS
 	*(.text)
 	SCHED_TEXT
 	LOCK_TEXT
+	KPROBES_TEXT
 	*(.gnu.linkonce.t*)
     }
   .text2 : AT(ADDR(.text2) - LOAD_OFFSET)
diff --git a/arch/ia64/lib/flush.S b/arch/ia64/lib/flush.S
index 3e2cfa2c6d39..2a0d27f2f21b 100644
--- a/arch/ia64/lib/flush.S
+++ b/arch/ia64/lib/flush.S
@@ -20,6 +20,7 @@
 	 *
 	 *	Note: "in0" and "in1" are preserved for debugging purposes.
 	 */
+	.section .kprobes.text,"ax"
 GLOBAL_ENTRY(flush_icache_range)
 
 	.prologue
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index ff62551eb3a1..24614869e866 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -9,6 +9,7 @@
 #include <linux/mm.h>
 #include <linux/smp_lock.h>
 #include <linux/interrupt.h>
+#include <linux/kprobes.h>
 
 #include <asm/pgtable.h>
 #include <asm/processor.h>
@@ -76,7 +77,7 @@ mapped_kernel_page_is_present (unsigned long address)
 	return pte_present(pte);
 }
 
-void
+void __kprobes
 ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *regs)
 {
 	int signal = SIGSEGV, code = SEGV_MAPERR;
-- 
cgit v1.2.3


From 661e5a3d9958dc83d610992da85625c0ada9bb06 Mon Sep 17 00:00:00 2001
From: Keshavamurthy Anil S <anil.s.keshavamurthy@intel.com>
Date: Tue, 6 Sep 2005 15:19:32 -0700
Subject: [PATCH] Kprobes/IA64: fix race when break hits and kprobe not found

This patch addresses a potential race condition for a case where Kprobe has
been removed right after another CPU has taken a break hit.

The way this is addressed here is when the CPU that has taken a break hit
does not find its corresponding kprobe, then we check to see if the
original instruction got replaced with other than break.  If it got
replaced with other than break instruction, then we continue to execute
from the replaced instruction, else if we find that it is still a break,
then we let the kernel handle this, as this might be the break instruction
inserted by other than kprobe(may be kernel debugger).

Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/kernel/kprobes.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

(limited to 'arch/ia64')

diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c
index 82a41ac29386..4b1bd539ec47 100644
--- a/arch/ia64/kernel/kprobes.c
+++ b/arch/ia64/kernel/kprobes.c
@@ -554,6 +554,38 @@ static void __kprobes prepare_ss(struct kprobe *p, struct pt_regs *regs)
 	ia64_psr(regs)->ss = 1;
 }
 
+static int __kprobes is_ia64_break_inst(struct pt_regs *regs)
+{
+	unsigned int slot = ia64_psr(regs)->ri;
+	unsigned int template, major_opcode;
+	unsigned long kprobe_inst;
+	unsigned long *kprobe_addr = (unsigned long *)regs->cr_iip;
+	bundle_t bundle;
+
+	memcpy(&bundle, kprobe_addr, sizeof(bundle_t));
+	template = bundle.quad0.template;
+
+	/* Move to slot 2, if bundle is MLX type and kprobe slot is 1 */
+	if (slot == 1 && bundle_encoding[template][1] == L)
+  		slot++;
+
+	/* Get Kprobe probe instruction at given slot*/
+	get_kprobe_inst(&bundle, slot, &kprobe_inst, &major_opcode);
+
+	/* For break instruction,
+	 * Bits 37:40 Major opcode to be zero
+	 * Bits 27:32 X6 to be zero
+	 * Bits 32:35 X3 to be zero
+	 */
+	if (major_opcode || ((kprobe_inst >> 27) & 0x1FF) ) {
+		/* Not a break instruction */
+		return 0;
+	}
+
+	/* Is a break instruction */
+	return 1;
+}
+
 static int __kprobes pre_kprobes_handler(struct die_args *args)
 {
 	struct kprobe *p;
@@ -601,6 +633,19 @@ static int __kprobes pre_kprobes_handler(struct die_args *args)
 	p = get_kprobe(addr);
 	if (!p) {
 		unlock_kprobes();
+		if (!is_ia64_break_inst(regs)) {
+			/*
+			 * The breakpoint instruction was removed right
+			 * after we hit it.  Another cpu has removed
+			 * either a probepoint or a debugger breakpoint
+			 * at this address.  In either case, no further
+			 * handling of this interrupt is appropriate.
+			 */
+			ret = 1;
+
+		}
+
+		/* Not one of our break, let kernel handle it */
 		goto no_kprobe;
 	}
 
-- 
cgit v1.2.3


From deac66ae454cacf942c051b86d9232af546fb187 Mon Sep 17 00:00:00 2001
From: Keshavamurthy Anil S <anil.s.keshavamurthy@intel.com>
Date: Tue, 6 Sep 2005 15:19:35 -0700
Subject: [PATCH] kprobes: fix bug when probed on task and isr functions

This patch fixes a race condition where in system used to hang or sometime
crash within minutes when kprobes are inserted on ISR routine and a task
routine.

The fix has been stress tested on i386, ia64, pp64 and on x86_64.  To
reproduce the problem insert kprobes on schedule() and do_IRQ() functions
and you should see hang or system crash.

Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Acked-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/kprobes.c   |  3 ++-
 arch/ia64/kernel/kprobes.c   | 22 +++++++++++++++++++---
 arch/ppc64/kernel/kprobes.c  | 11 ++++++-----
 arch/x86_64/kernel/kprobes.c |  3 ++-
 include/asm-ia64/kprobes.h   |  1 +
 include/asm-ppc64/kprobes.h  |  3 +++
 kernel/kprobes.c             | 22 ++++++++++++++++++++++
 7 files changed, 55 insertions(+), 10 deletions(-)

(limited to 'arch/ia64')

diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index e5cec32018a5..6345b430b105 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -177,7 +177,8 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
 		   Disarm the probe we just hit, and ignore it. */
 		p = get_kprobe(addr);
 		if (p) {
-			if (kprobe_status == KPROBE_HIT_SS) {
+			if (kprobe_status == KPROBE_HIT_SS &&
+				*p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
 				regs->eflags &= ~TF_MASK;
 				regs->eflags |= kprobe_saved_eflags;
 				unlock_kprobes();
diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c
index 4b1bd539ec47..471086b808a4 100644
--- a/arch/ia64/kernel/kprobes.c
+++ b/arch/ia64/kernel/kprobes.c
@@ -95,6 +95,17 @@ static void __kprobes update_kprobe_inst_flag(uint template, uint  slot,
 	p->ainsn.inst_flag = 0;
 	p->ainsn.target_br_reg = 0;
 
+	/* Check for Break instruction
+ 	 * Bits 37:40 Major opcode to be zero
+	 * Bits 27:32 X6 to be zero
+	 * Bits 32:35 X3 to be zero
+	 */
+	if ((!major_opcode) && (!((kprobe_inst >> 27) & 0x1FF)) ) {
+		/* is a break instruction */
+	 	p->ainsn.inst_flag |= INST_FLAG_BREAK_INST;
+		return;
+	}
+
 	if (bundle_encoding[template][slot] == B) {
 		switch (major_opcode) {
 		  case INDIRECT_CALL_OPCODE:
@@ -542,8 +553,11 @@ static void __kprobes prepare_ss(struct kprobe *p, struct pt_regs *regs)
 	unsigned long bundle_addr = (unsigned long) &p->opcode.bundle;
 	unsigned long slot = (unsigned long)p->addr & 0xf;
 
-	/* Update instruction pointer (IIP) and slot number (IPSR.ri) */
-	regs->cr_iip = bundle_addr & ~0xFULL;
+	/* single step inline if break instruction */
+	if (p->ainsn.inst_flag == INST_FLAG_BREAK_INST)
+		regs->cr_iip = (unsigned long)p->addr & ~0xFULL;
+	else
+		regs->cr_iip = bundle_addr & ~0xFULL;
 
 	if (slot > 2)
 		slot = 0;
@@ -599,7 +613,9 @@ static int __kprobes pre_kprobes_handler(struct die_args *args)
 	if (kprobe_running()) {
 		p = get_kprobe(addr);
 		if (p) {
-			if (kprobe_status == KPROBE_HIT_SS) {
+			if ( (kprobe_status == KPROBE_HIT_SS) &&
+	 		     (p->ainsn.inst_flag == INST_FLAG_BREAK_INST)) {
+  				ia64_psr(regs)->ss = 0;
 				unlock_kprobes();
 				goto no_kprobe;
 			}
diff --git a/arch/ppc64/kernel/kprobes.c b/arch/ppc64/kernel/kprobes.c
index 591e4b67b5a5..7e80d49c589a 100644
--- a/arch/ppc64/kernel/kprobes.c
+++ b/arch/ppc64/kernel/kprobes.c
@@ -102,7 +102,7 @@ static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
 	regs->msr |= MSR_SE;
 
 	/* single step inline if it is a trap variant */
-	if (IS_TW(insn) || IS_TD(insn) || IS_TWI(insn) || IS_TDI(insn))
+	if (is_trap(insn))
 		regs->nip = (unsigned long)p->addr;
 	else
 		regs->nip = (unsigned long)p->ainsn.insn;
@@ -152,7 +152,9 @@ static inline int kprobe_handler(struct pt_regs *regs)
 		   Disarm the probe we just hit, and ignore it. */
 		p = get_kprobe(addr);
 		if (p) {
-			if (kprobe_status == KPROBE_HIT_SS) {
+			kprobe_opcode_t insn = *p->ainsn.insn;
+			if (kprobe_status == KPROBE_HIT_SS &&
+					is_trap(insn)) {
 				regs->msr &= ~MSR_SE;
 				regs->msr |= kprobe_saved_msr;
 				unlock_kprobes();
@@ -192,8 +194,7 @@ static inline int kprobe_handler(struct pt_regs *regs)
 			 * trap variant, it could belong to someone else
 			 */
 			kprobe_opcode_t cur_insn = *addr;
-			if (IS_TW(cur_insn) || IS_TD(cur_insn) ||
-					IS_TWI(cur_insn) || IS_TDI(cur_insn))
+			if (is_trap(cur_insn))
 		       		goto no_kprobe;
 			/*
 			 * The breakpoint instruction was removed right
@@ -403,7 +404,7 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 	default:
 		break;
 	}
-	preempt_enable();
+	preempt_enable_no_resched();
 	return ret;
 }
 
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index 2d7658fbbb28..df08c43276a0 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -311,7 +311,8 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 		   Disarm the probe we just hit, and ignore it. */
 		p = get_kprobe(addr);
 		if (p) {
-			if (kprobe_status == KPROBE_HIT_SS) {
+			if (kprobe_status == KPROBE_HIT_SS &&
+				*p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
 				regs->eflags &= ~TF_MASK;
 				regs->eflags |= kprobe_saved_rflags;
 				unlock_kprobes();
diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h
index bf36a32e37e4..573a3574a24f 100644
--- a/include/asm-ia64/kprobes.h
+++ b/include/asm-ia64/kprobes.h
@@ -92,6 +92,7 @@ struct arch_specific_insn {
 	kprobe_opcode_t insn;
  #define INST_FLAG_FIX_RELATIVE_IP_ADDR		1
  #define INST_FLAG_FIX_BRANCH_REG		2
+ #define INST_FLAG_BREAK_INST			4
  	unsigned long inst_flag;
  	unsigned short target_br_reg;
 };
diff --git a/include/asm-ppc64/kprobes.h b/include/asm-ppc64/kprobes.h
index 0802919c3235..d9129d2b038e 100644
--- a/include/asm-ppc64/kprobes.h
+++ b/include/asm-ppc64/kprobes.h
@@ -42,6 +42,9 @@ typedef unsigned int kprobe_opcode_t;
 
 #define JPROBE_ENTRY(pentry)	(kprobe_opcode_t *)((func_descr_t *)pentry)
 
+#define is_trap(instr)	(IS_TW(instr) || IS_TD(instr) || \
+			IS_TWI(instr) || IS_TDI(instr))
+
 #define ARCH_SUPPORTS_KRETPROBES
 void kretprobe_trampoline(void);
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3b7653f2e7ae..f3ea492ab44d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -155,14 +155,36 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot)
 /* Locks kprobe: irqs must be disabled */
 void __kprobes lock_kprobes(void)
 {
+	unsigned long flags = 0;
+
+	/* Avoiding local interrupts to happen right after we take the kprobe_lock
+	 * and before we get a chance to update kprobe_cpu, this to prevent
+	 * deadlock when we have a kprobe on ISR routine and a kprobe on task
+	 * routine
+	 */
+	local_irq_save(flags);
+
 	spin_lock(&kprobe_lock);
 	kprobe_cpu = smp_processor_id();
+
+ 	local_irq_restore(flags);
 }
 
 void __kprobes unlock_kprobes(void)
 {
+	unsigned long flags = 0;
+
+	/* Avoiding local interrupts to happen right after we update
+	 * kprobe_cpu and before we get a a chance to release kprobe_lock,
+	 * this to prevent deadlock when we have a kprobe on ISR routine and
+	 * a kprobe on task routine
+	 */
+	local_irq_save(flags);
+
 	kprobe_cpu = NR_CPUS;
 	spin_unlock(&kprobe_lock);
+
+ 	local_irq_restore(flags);
 }
 
 /* You have to be holding the kprobe_lock */
-- 
cgit v1.2.3


From e72225d160a2529d6ce6d5898a267f7dae02aa6e Mon Sep 17 00:00:00 2001
From: "viro@ZenIV.linux.org.uk" <viro@ZenIV.linux.org.uk>
Date: Wed, 7 Sep 2005 23:23:50 +0100
Subject: [PATCH] bogus #if (simserial)

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/hp/sim/simserial.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/ia64')

diff --git a/arch/ia64/hp/sim/simserial.c b/arch/ia64/hp/sim/simserial.c
index 7dcb8582ae0d..b42ec37be51c 100644
--- a/arch/ia64/hp/sim/simserial.c
+++ b/arch/ia64/hp/sim/simserial.c
@@ -130,7 +130,7 @@ static void rs_stop(struct tty_struct *tty)
 
 static void rs_start(struct tty_struct *tty)
 {
-#if SIMSERIAL_DEBUG
+#ifdef SIMSERIAL_DEBUG
 	printk("rs_start: tty->stopped=%d tty->hw_stopped=%d tty->flow_stopped=%d\n",
 		tty->stopped, tty->hw_stopped, tty->flow_stopped);
 #endif
-- 
cgit v1.2.3