From 10083072bfabc40bc47306e512c158c57cf55c2e Mon Sep 17 00:00:00 2001
From: Mark Maule <maule@sgi.com>
Date: Fri, 14 Apr 2006 16:03:49 -0500
Subject: [PATCH] PCI: per-platform IA64_{FIRST,LAST}_DEVICE_VECTOR definitions

Abstract IA64_FIRST_DEVICE_VECTOR/IA64_LAST_DEVICE_VECTOR since SN platforms
use a subset of the IA64 range.  Implement this by making the above macros
global variables which the platform can override in it setup code.

Also add a reserve_irq_vector() routine used by SN to mark a vector's as
in-use when that weren't allocated through assign_irq_vector().

Signed-off-by: Mark Maule <maule@sgi.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/ia64/sn/kernel/irq.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/ia64/sn')

diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c
index c265e02f5036..db187f5cdae1 100644
--- a/arch/ia64/sn/kernel/irq.c
+++ b/arch/ia64/sn/kernel/irq.c
@@ -202,6 +202,9 @@ void sn_irq_init(void)
 	int i;
 	irq_desc_t *base_desc = irq_desc;
 
+	ia64_first_device_vector = IA64_SN2_FIRST_DEVICE_VECTOR;
+	ia64_last_device_vector = IA64_SN2_LAST_DEVICE_VECTOR;
+
 	for (i = 0; i < NR_IRQS; i++) {
 		if (base_desc[i].handler == &no_irq_type) {
 			base_desc[i].handler = &irq_type_sn;
@@ -285,6 +288,7 @@ void sn_irq_fixup(struct pci_dev *pci_dev, struct sn_irq_info *sn_irq_info)
 	/* link it into the sn_irq[irq] list */
 	spin_lock(&sn_irq_info_lock);
 	list_add_rcu(&sn_irq_info->list, sn_irq_lh[sn_irq_info->irq_irq]);
+	reserve_irq_vector(sn_irq_info->irq_irq);
 	spin_unlock(&sn_irq_info_lock);
 
 	register_intr_pda(sn_irq_info);
@@ -310,8 +314,11 @@ void sn_irq_unfixup(struct pci_dev *pci_dev)
 	spin_lock(&sn_irq_info_lock);
 	list_del_rcu(&sn_irq_info->list);
 	spin_unlock(&sn_irq_info_lock);
+	if (list_empty(sn_irq_lh[sn_irq_info->irq_irq]))
+		free_irq_vector(sn_irq_info->irq_irq);
 	call_rcu(&sn_irq_info->rcu, sn_irq_info_free);
 	pci_dev_put(pci_dev);
+
 }
 
 static inline void
-- 
cgit v1.2.3


From 83821d3f558dc651e555d62182ed0c95651f41a6 Mon Sep 17 00:00:00 2001
From: Mark Maule <maule@sgi.com>
Date: Fri, 14 Apr 2006 16:03:54 -0500
Subject: [PATCH] PCI: altix: msi support

MSI callouts for altix.  Involves a fair amount of code reorg in sn irq.c
code as well as adding some extensions to the altix PCI provider abstaction.

Signed-off-by: Mark Maule <maule@sgi.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/ia64/sn/kernel/io_init.c              |   9 +-
 arch/ia64/sn/kernel/irq.c                  | 135 ++++++++++---------
 arch/ia64/sn/pci/pci_dma.c                 |  10 +-
 arch/ia64/sn/pci/pcibr/pcibr_dma.c         |  62 ++++++---
 arch/ia64/sn/pci/tioca_provider.c          |   8 +-
 arch/ia64/sn/pci/tioce_provider.c          |  65 ++++++----
 drivers/pci/msi-altix.c                    | 200 ++++++++++++++++++++++++++++-
 include/asm-ia64/sn/intr.h                 |   8 ++
 include/asm-ia64/sn/pcibr_provider.h       |   5 +-
 include/asm-ia64/sn/pcibus_provider_defs.h |  17 ++-
 include/asm-ia64/sn/tiocp.h                |   3 +-
 11 files changed, 401 insertions(+), 121 deletions(-)

(limited to 'arch/ia64/sn')

diff --git a/arch/ia64/sn/kernel/io_init.c b/arch/ia64/sn/kernel/io_init.c
index 5101ac462643..dc09a6a28a37 100644
--- a/arch/ia64/sn/kernel/io_init.c
+++ b/arch/ia64/sn/kernel/io_init.c
@@ -58,7 +58,7 @@ static int max_pcibus_number = 255;	/* Default highest pci bus number */
  */
 
 static dma_addr_t
-sn_default_pci_map(struct pci_dev *pdev, unsigned long paddr, size_t size)
+sn_default_pci_map(struct pci_dev *pdev, unsigned long paddr, size_t size, int type)
 {
 	return 0;
 }
@@ -457,13 +457,6 @@ void sn_pci_fixup_slot(struct pci_dev *dev)
 		pcidev_info->pdi_sn_irq_info = NULL;
 		kfree(sn_irq_info);
 	}
-
-	/*
-	 * MSI currently not supported on altix.  Remove this when
-	 * the MSI abstraction patches are integrated into the kernel
-	 * (sometime after 2.6.16 releases)
-	 */
-	dev->no_msi = 1;
 }
 
 /*
diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c
index db187f5cdae1..dc8e2b696713 100644
--- a/arch/ia64/sn/kernel/irq.c
+++ b/arch/ia64/sn/kernel/irq.c
@@ -26,11 +26,11 @@ static void unregister_intr_pda(struct sn_irq_info *sn_irq_info);
 
 int sn_force_interrupt_flag = 1;
 extern int sn_ioif_inited;
-static struct list_head **sn_irq_lh;
+struct list_head **sn_irq_lh;
 static spinlock_t sn_irq_info_lock = SPIN_LOCK_UNLOCKED; /* non-IRQ lock */
 
-static inline u64 sn_intr_alloc(nasid_t local_nasid, int local_widget,
-				     u64 sn_irq_info,
+u64 sn_intr_alloc(nasid_t local_nasid, int local_widget,
+				     struct sn_irq_info *sn_irq_info,
 				     int req_irq, nasid_t req_nasid,
 				     int req_slice)
 {
@@ -40,12 +40,13 @@ static inline u64 sn_intr_alloc(nasid_t local_nasid, int local_widget,
 
 	SAL_CALL_NOLOCK(ret_stuff, (u64) SN_SAL_IOIF_INTERRUPT,
 			(u64) SAL_INTR_ALLOC, (u64) local_nasid,
-			(u64) local_widget, (u64) sn_irq_info, (u64) req_irq,
+			(u64) local_widget, __pa(sn_irq_info), (u64) req_irq,
 			(u64) req_nasid, (u64) req_slice);
+
 	return ret_stuff.status;
 }
 
-static inline void sn_intr_free(nasid_t local_nasid, int local_widget,
+void sn_intr_free(nasid_t local_nasid, int local_widget,
 				struct sn_irq_info *sn_irq_info)
 {
 	struct ia64_sal_retval ret_stuff;
@@ -112,73 +113,91 @@ static void sn_end_irq(unsigned int irq)
 
 static void sn_irq_info_free(struct rcu_head *head);
 
-static void sn_set_affinity_irq(unsigned int irq, cpumask_t mask)
+struct sn_irq_info *sn_retarget_vector(struct sn_irq_info *sn_irq_info,
+				       nasid_t nasid, int slice)
 {
-	struct sn_irq_info *sn_irq_info, *sn_irq_info_safe;
-	int cpuid, cpuphys;
+	int vector;
+	int cpuphys;
+	int64_t bridge;
+	int local_widget, status;
+	nasid_t local_nasid;
+	struct sn_irq_info *new_irq_info;
+	struct sn_pcibus_provider *pci_provider;
 
-	cpuid = first_cpu(mask);
-	cpuphys = cpu_physical_id(cpuid);
+	new_irq_info = kmalloc(sizeof(struct sn_irq_info), GFP_ATOMIC);
+	if (new_irq_info == NULL)
+		return NULL;
 
-	list_for_each_entry_safe(sn_irq_info, sn_irq_info_safe,
-				 sn_irq_lh[irq], list) {
-		u64 bridge;
-		int local_widget, status;
-		nasid_t local_nasid;
-		struct sn_irq_info *new_irq_info;
-		struct sn_pcibus_provider *pci_provider;
-
-		new_irq_info = kmalloc(sizeof(struct sn_irq_info), GFP_ATOMIC);
-		if (new_irq_info == NULL)
-			break;
-		memcpy(new_irq_info, sn_irq_info, sizeof(struct sn_irq_info));
-
-		bridge = (u64) new_irq_info->irq_bridge;
-		if (!bridge) {
-			kfree(new_irq_info);
-			break; /* irq is not a device interrupt */
-		}
+	memcpy(new_irq_info, sn_irq_info, sizeof(struct sn_irq_info));
 
-		local_nasid = NASID_GET(bridge);
+	bridge = (u64) new_irq_info->irq_bridge;
+	if (!bridge) {
+		kfree(new_irq_info);
+		return NULL; /* irq is not a device interrupt */
+	}
 
-		if (local_nasid & 1)
-			local_widget = TIO_SWIN_WIDGETNUM(bridge);
-		else
-			local_widget = SWIN_WIDGETNUM(bridge);
+	local_nasid = NASID_GET(bridge);
 
-		/* Free the old PROM new_irq_info structure */
-		sn_intr_free(local_nasid, local_widget, new_irq_info);
-		/* Update kernels new_irq_info with new target info */
-		unregister_intr_pda(new_irq_info);
+	if (local_nasid & 1)
+		local_widget = TIO_SWIN_WIDGETNUM(bridge);
+	else
+		local_widget = SWIN_WIDGETNUM(bridge);
 
-		/* allocate a new PROM new_irq_info struct */
-		status = sn_intr_alloc(local_nasid, local_widget,
-				       __pa(new_irq_info), irq,
-				       cpuid_to_nasid(cpuid),
-				       cpuid_to_slice(cpuid));
+	vector = sn_irq_info->irq_irq;
+	/* Free the old PROM new_irq_info structure */
+	sn_intr_free(local_nasid, local_widget, new_irq_info);
+	/* Update kernels new_irq_info with new target info */
+	unregister_intr_pda(new_irq_info);
 
-		/* SAL call failed */
-		if (status) {
-			kfree(new_irq_info);
-			break;
-		}
+	/* allocate a new PROM new_irq_info struct */
+	status = sn_intr_alloc(local_nasid, local_widget,
+			       new_irq_info, vector,
+			       nasid, slice);
+
+	/* SAL call failed */
+	if (status) {
+		kfree(new_irq_info);
+		return NULL;
+	}
 
-		new_irq_info->irq_cpuid = cpuid;
-		register_intr_pda(new_irq_info);
+	cpuphys = nasid_slice_to_cpuid(nasid, slice);
+	new_irq_info->irq_cpuid = cpuphys;
+	register_intr_pda(new_irq_info);
 
-		pci_provider = sn_pci_provider[new_irq_info->irq_bridge_type];
-		if (pci_provider && pci_provider->target_interrupt)
-			(pci_provider->target_interrupt)(new_irq_info);
+	pci_provider = sn_pci_provider[new_irq_info->irq_bridge_type];
+
+	/*
+	 * If this represents a line interrupt, target it.  If it's
+	 * an msi (irq_int_bit < 0), it's already targeted.
+	 */
+	if (new_irq_info->irq_int_bit >= 0 &&
+	    pci_provider && pci_provider->target_interrupt)
+		(pci_provider->target_interrupt)(new_irq_info);
 
-		spin_lock(&sn_irq_info_lock);
-		list_replace_rcu(&sn_irq_info->list, &new_irq_info->list);
-		spin_unlock(&sn_irq_info_lock);
-		call_rcu(&sn_irq_info->rcu, sn_irq_info_free);
+	spin_lock(&sn_irq_info_lock);
+	list_replace_rcu(&sn_irq_info->list, &new_irq_info->list);
+	spin_unlock(&sn_irq_info_lock);
+	call_rcu(&sn_irq_info->rcu, sn_irq_info_free);
 
 #ifdef CONFIG_SMP
-		set_irq_affinity_info((irq & 0xff), cpuphys, 0);
+	set_irq_affinity_info((vector & 0xff), cpuphys, 0);
 #endif
-	}
+
+	return new_irq_info;
+}
+
+static void sn_set_affinity_irq(unsigned int irq, cpumask_t mask)
+{
+	struct sn_irq_info *sn_irq_info, *sn_irq_info_safe;
+	nasid_t nasid;
+	int slice;
+
+	nasid = cpuid_to_nasid(first_cpu(mask));
+	slice = cpuid_to_slice(first_cpu(mask));
+
+	list_for_each_entry_safe(sn_irq_info, sn_irq_info_safe,
+				 sn_irq_lh[irq], list)
+		(void)sn_retarget_vector(sn_irq_info, nasid, slice);
 }
 
 struct hw_interrupt_type irq_type_sn = {
diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c
index b4b84c269210..7a291a271511 100644
--- a/arch/ia64/sn/pci/pci_dma.c
+++ b/arch/ia64/sn/pci/pci_dma.c
@@ -11,7 +11,7 @@
 
 #include <linux/module.h>
 #include <asm/dma.h>
-#include <asm/sn/pcibr_provider.h>
+#include <asm/sn/intr.h>
 #include <asm/sn/pcibus_provider_defs.h>
 #include <asm/sn/pcidev.h>
 #include <asm/sn/sn_sal.h>
@@ -113,7 +113,8 @@ void *sn_dma_alloc_coherent(struct device *dev, size_t size,
 	 * resources.
 	 */
 
-	*dma_handle = provider->dma_map_consistent(pdev, phys_addr, size);
+	*dma_handle = provider->dma_map_consistent(pdev, phys_addr, size,
+						   SN_DMA_ADDR_PHYS);
 	if (!*dma_handle) {
 		printk(KERN_ERR "%s: out of ATEs\n", __FUNCTION__);
 		free_pages((unsigned long)cpuaddr, get_order(size));
@@ -176,7 +177,7 @@ dma_addr_t sn_dma_map_single(struct device *dev, void *cpu_addr, size_t size,
 	BUG_ON(dev->bus != &pci_bus_type);
 
 	phys_addr = __pa(cpu_addr);
-	dma_addr = provider->dma_map(pdev, phys_addr, size);
+	dma_addr = provider->dma_map(pdev, phys_addr, size, SN_DMA_ADDR_PHYS);
 	if (!dma_addr) {
 		printk(KERN_ERR "%s: out of ATEs\n", __FUNCTION__);
 		return 0;
@@ -260,7 +261,8 @@ int sn_dma_map_sg(struct device *dev, struct scatterlist *sg, int nhwentries,
 	for (i = 0; i < nhwentries; i++, sg++) {
 		phys_addr = SG_ENT_PHYS_ADDRESS(sg);
 		sg->dma_address = provider->dma_map(pdev,
-						    phys_addr, sg->length);
+						    phys_addr, sg->length,
+						    SN_DMA_ADDR_PHYS);
 
 		if (!sg->dma_address) {
 			printk(KERN_ERR "%s: out of ATEs\n", __FUNCTION__);
diff --git a/arch/ia64/sn/pci/pcibr/pcibr_dma.c b/arch/ia64/sn/pci/pcibr/pcibr_dma.c
index 9f86bb6519aa..a86c7b945962 100644
--- a/arch/ia64/sn/pci/pcibr/pcibr_dma.c
+++ b/arch/ia64/sn/pci/pcibr/pcibr_dma.c
@@ -41,7 +41,7 @@ extern int sn_ioif_inited;
 
 static dma_addr_t
 pcibr_dmamap_ate32(struct pcidev_info *info,
-		   u64 paddr, size_t req_size, u64 flags)
+		   u64 paddr, size_t req_size, u64 flags, int dma_flags)
 {
 
 	struct pcidev_info *pcidev_info = info->pdi_host_pcidev_info;
@@ -81,9 +81,12 @@ pcibr_dmamap_ate32(struct pcidev_info *info,
 	if (IS_PCIX(pcibus_info))
 		ate_flags &= ~(PCI32_ATE_PREF);
 
-	xio_addr =
-	    IS_PIC_SOFT(pcibus_info) ? PHYS_TO_DMA(paddr) :
-	    PHYS_TO_TIODMA(paddr);
+	if (SN_DMA_ADDRTYPE(dma_flags == SN_DMA_ADDR_PHYS))
+		xio_addr = IS_PIC_SOFT(pcibus_info) ? PHYS_TO_DMA(paddr) :
+	    					      PHYS_TO_TIODMA(paddr);
+	else
+		xio_addr = paddr;
+
 	offset = IOPGOFF(xio_addr);
 	ate = ate_flags | (xio_addr - offset);
 
@@ -91,6 +94,13 @@ pcibr_dmamap_ate32(struct pcidev_info *info,
 	if (IS_PIC_SOFT(pcibus_info)) {
 		ate |= (pcibus_info->pbi_hub_xid << PIC_ATE_TARGETID_SHFT);
 	}
+
+	/*
+	 * If we're mapping for MSI, set the MSI bit in the ATE
+	 */
+	if (dma_flags & SN_DMA_MSI)
+		ate |= PCI32_ATE_MSI;
+
 	ate_write(pcibus_info, ate_index, ate_count, ate);
 
 	/*
@@ -105,20 +115,27 @@ pcibr_dmamap_ate32(struct pcidev_info *info,
 	if (pcibus_info->pbi_devreg[internal_device] & PCIBR_DEV_SWAP_DIR)
 		ATE_SWAP_ON(pci_addr);
 
+
 	return pci_addr;
 }
 
 static dma_addr_t
 pcibr_dmatrans_direct64(struct pcidev_info * info, u64 paddr,
-			u64 dma_attributes)
+			u64 dma_attributes, int dma_flags)
 {
 	struct pcibus_info *pcibus_info = (struct pcibus_info *)
 	    ((info->pdi_host_pcidev_info)->pdi_pcibus_info);
 	u64 pci_addr;
 
 	/* Translate to Crosstalk View of Physical Address */
-	pci_addr = (IS_PIC_SOFT(pcibus_info) ? PHYS_TO_DMA(paddr) :
-		    PHYS_TO_TIODMA(paddr)) | dma_attributes;
+	if (SN_DMA_ADDRTYPE(dma_flags) == SN_DMA_ADDR_PHYS)
+		pci_addr = IS_PIC_SOFT(pcibus_info) ?
+				PHYS_TO_DMA(paddr) :
+		    		PHYS_TO_TIODMA(paddr) | dma_attributes;
+	else
+		pci_addr = IS_PIC_SOFT(pcibus_info) ?
+				paddr :
+				paddr | dma_attributes;
 
 	/* Handle Bus mode */
 	if (IS_PCIX(pcibus_info))
@@ -130,7 +147,9 @@ pcibr_dmatrans_direct64(struct pcidev_info * info, u64 paddr,
 		    ((u64) pcibus_info->
 		     pbi_hub_xid << PIC_PCI64_ATTR_TARG_SHFT);
 	} else
-		pci_addr |= TIOCP_PCI64_CMDTYPE_MEM;
+		pci_addr |= (dma_flags & SN_DMA_MSI) ?
+				TIOCP_PCI64_CMDTYPE_MSI :
+				TIOCP_PCI64_CMDTYPE_MEM;
 
 	/* If PCI mode, func zero uses VCHAN0, every other func uses VCHAN1 */
 	if (!IS_PCIX(pcibus_info) && PCI_FUNC(info->pdi_linux_pcidev->devfn))
@@ -141,7 +160,7 @@ pcibr_dmatrans_direct64(struct pcidev_info * info, u64 paddr,
 
 static dma_addr_t
 pcibr_dmatrans_direct32(struct pcidev_info * info,
-			u64 paddr, size_t req_size, u64 flags)
+			u64 paddr, size_t req_size, u64 flags, int dma_flags)
 {
 	struct pcidev_info *pcidev_info = info->pdi_host_pcidev_info;
 	struct pcibus_info *pcibus_info = (struct pcibus_info *)pcidev_info->
@@ -156,8 +175,14 @@ pcibr_dmatrans_direct32(struct pcidev_info * info,
 		return 0;
 	}
 
-	xio_addr = IS_PIC_SOFT(pcibus_info) ? PHYS_TO_DMA(paddr) :
-	    PHYS_TO_TIODMA(paddr);
+	if (dma_flags & SN_DMA_MSI)
+		return 0;
+
+	if (SN_DMA_ADDRTYPE(dma_flags) == SN_DMA_ADDR_PHYS)
+		xio_addr = IS_PIC_SOFT(pcibus_info) ? PHYS_TO_DMA(paddr) :
+	    					      PHYS_TO_TIODMA(paddr);
+	else
+		xio_addr = paddr;
 
 	xio_base = pcibus_info->pbi_dir_xbase;
 	offset = xio_addr - xio_base;
@@ -327,7 +352,7 @@ void sn_dma_flush(u64 addr)
  */
 
 dma_addr_t
-pcibr_dma_map(struct pci_dev * hwdev, unsigned long phys_addr, size_t size)
+pcibr_dma_map(struct pci_dev * hwdev, unsigned long phys_addr, size_t size, int dma_flags)
 {
 	dma_addr_t dma_handle;
 	struct pcidev_info *pcidev_info = SN_PCIDEV_INFO(hwdev);
@@ -344,11 +369,11 @@ pcibr_dma_map(struct pci_dev * hwdev, unsigned long phys_addr, size_t size)
 		 */
 
 		dma_handle = pcibr_dmatrans_direct64(pcidev_info, phys_addr,
-						     PCI64_ATTR_PREF);
+						     PCI64_ATTR_PREF, dma_flags);
 	} else {
 		/* Handle 32-63 bit cards via direct mapping */
 		dma_handle = pcibr_dmatrans_direct32(pcidev_info, phys_addr,
-						     size, 0);
+						     size, 0, dma_flags);
 		if (!dma_handle) {
 			/*
 			 * It is a 32 bit card and we cannot do direct mapping,
@@ -356,7 +381,8 @@ pcibr_dma_map(struct pci_dev * hwdev, unsigned long phys_addr, size_t size)
 			 */
 
 			dma_handle = pcibr_dmamap_ate32(pcidev_info, phys_addr,
-							size, PCI32_ATE_PREF);
+							size, PCI32_ATE_PREF,
+							dma_flags);
 		}
 	}
 
@@ -365,18 +391,18 @@ pcibr_dma_map(struct pci_dev * hwdev, unsigned long phys_addr, size_t size)
 
 dma_addr_t
 pcibr_dma_map_consistent(struct pci_dev * hwdev, unsigned long phys_addr,
-			 size_t size)
+			 size_t size, int dma_flags)
 {
 	dma_addr_t dma_handle;
 	struct pcidev_info *pcidev_info = SN_PCIDEV_INFO(hwdev);
 
 	if (hwdev->dev.coherent_dma_mask == ~0UL) {
 		dma_handle = pcibr_dmatrans_direct64(pcidev_info, phys_addr,
-					    PCI64_ATTR_BAR);
+					    PCI64_ATTR_BAR, dma_flags);
 	} else {
 		dma_handle = (dma_addr_t) pcibr_dmamap_ate32(pcidev_info,
 						    phys_addr, size,
-						    PCI32_ATE_BAR);
+						    PCI32_ATE_BAR, dma_flags);
 	}
 
 	return dma_handle;
diff --git a/arch/ia64/sn/pci/tioca_provider.c b/arch/ia64/sn/pci/tioca_provider.c
index be0176912968..20de72791b97 100644
--- a/arch/ia64/sn/pci/tioca_provider.c
+++ b/arch/ia64/sn/pci/tioca_provider.c
@@ -515,10 +515,16 @@ tioca_dma_unmap(struct pci_dev *pdev, dma_addr_t bus_addr, int dir)
  * use the GART mapped mode.
  */
 static u64
-tioca_dma_map(struct pci_dev *pdev, u64 paddr, size_t byte_count)
+tioca_dma_map(struct pci_dev *pdev, u64 paddr, size_t byte_count, int dma_flags)
 {
 	u64 mapaddr;
 
+	/*
+	 * Not supported for now ...
+	 */
+	if (dma_flags & SN_DMA_MSI)
+		return 0;
+
 	/*
 	 * If card is 64 or 48 bit addresable, use a direct mapping.  32
 	 * bit direct is so restrictive w.r.t. where the memory resides that
diff --git a/arch/ia64/sn/pci/tioce_provider.c b/arch/ia64/sn/pci/tioce_provider.c
index 833295624e5d..4cac7bdc7c7f 100644
--- a/arch/ia64/sn/pci/tioce_provider.c
+++ b/arch/ia64/sn/pci/tioce_provider.c
@@ -170,7 +170,8 @@ tioce_mmr_war_post(struct tioce_kernel *kern, void *mmr_addr)
 	(ATE_PAGE((start)+(len)-1, pagesize) - ATE_PAGE(start, pagesize) + 1)
 
 #define ATE_VALID(ate)	((ate) & (1UL << 63))
-#define ATE_MAKE(addr, ps) (((addr) & ~ATE_PAGEMASK(ps)) | (1UL << 63))
+#define ATE_MAKE(addr, ps, msi) \
+	(((addr) & ~ATE_PAGEMASK(ps)) | (1UL << 63) | ((msi)?(1UL << 62):0))
 
 /*
  * Flavors of ate-based mapping supported by tioce_alloc_map()
@@ -196,15 +197,17 @@ tioce_mmr_war_post(struct tioce_kernel *kern, void *mmr_addr)
  *
  * 63    - must be 1 to indicate d64 mode to CE hardware
  * 62    - barrier bit ... controlled with tioce_dma_barrier()
- * 61    - 0 since this is not an MSI transaction
+ * 61    - msi bit ... specified through dma_flags
  * 60:54 - reserved, MBZ
  */
 static u64
-tioce_dma_d64(unsigned long ct_addr)
+tioce_dma_d64(unsigned long ct_addr, int dma_flags)
 {
 	u64 bus_addr;
 
 	bus_addr = ct_addr | (1UL << 63);
+	if (dma_flags & SN_DMA_MSI)
+		bus_addr |= (1UL << 61);
 
 	return bus_addr;
 }
@@ -261,7 +264,7 @@ pcidev_to_tioce(struct pci_dev *pdev, struct tioce **base,
  */
 static u64
 tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port,
-		u64 ct_addr, int len)
+		u64 ct_addr, int len, int dma_flags)
 {
 	int i;
 	int j;
@@ -270,6 +273,7 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port,
 	int entries;
 	int nates;
 	u64 pagesize;
+	int msi_capable, msi_wanted;
 	u64 *ate_shadow;
 	u64 *ate_reg;
 	u64 addr;
@@ -291,6 +295,7 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port,
 		ate_reg = ce_mmr->ce_ure_ate3240;
 		pagesize = ce_kern->ce_ate3240_pagesize;
 		bus_base = TIOCE_M32_MIN;
+		msi_capable = 1;
 		break;
 	case TIOCE_ATE_M40:
 		first = 0;
@@ -299,6 +304,7 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port,
 		ate_reg = ce_mmr->ce_ure_ate40;
 		pagesize = MB(64);
 		bus_base = TIOCE_M40_MIN;
+		msi_capable = 0;
 		break;
 	case TIOCE_ATE_M40S:
 		/*
@@ -311,11 +317,16 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port,
 		ate_reg = ce_mmr->ce_ure_ate3240;
 		pagesize = GB(16);
 		bus_base = TIOCE_M40S_MIN;
+		msi_capable = 0;
 		break;
 	default:
 		return 0;
 	}
 
+	msi_wanted = dma_flags & SN_DMA_MSI;
+	if (msi_wanted && !msi_capable)
+		return 0;
+
 	nates = ATE_NPAGES(ct_addr, len, pagesize);
 	if (nates > entries)
 		return 0;
@@ -344,7 +355,7 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port,
 	for (j = 0; j < nates; j++) {
 		u64 ate;
 
-		ate = ATE_MAKE(addr, pagesize);
+		ate = ATE_MAKE(addr, pagesize, msi_wanted);
 		ate_shadow[i + j] = ate;
 		tioce_mmr_storei(ce_kern, &ate_reg[i + j], ate);
 		addr += pagesize;
@@ -371,7 +382,7 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port,
  * Map @paddr into 32-bit bus space of the CE associated with @pcidev_info.
  */
 static u64
-tioce_dma_d32(struct pci_dev *pdev, u64 ct_addr)
+tioce_dma_d32(struct pci_dev *pdev, u64 ct_addr, int dma_flags)
 {
 	int dma_ok;
 	int port;
@@ -381,6 +392,9 @@ tioce_dma_d32(struct pci_dev *pdev, u64 ct_addr)
 	u64 ct_lower;
 	dma_addr_t bus_addr;
 
+	if (dma_flags & SN_DMA_MSI)
+		return 0;
+
 	ct_upper = ct_addr & ~0x3fffffffUL;
 	ct_lower = ct_addr & 0x3fffffffUL;
 
@@ -507,7 +521,7 @@ tioce_dma_unmap(struct pci_dev *pdev, dma_addr_t bus_addr, int dir)
  */
 static u64
 tioce_do_dma_map(struct pci_dev *pdev, u64 paddr, size_t byte_count,
-		 int barrier)
+		 int barrier, int dma_flags)
 {
 	unsigned long flags;
 	u64 ct_addr;
@@ -523,15 +537,18 @@ tioce_do_dma_map(struct pci_dev *pdev, u64 paddr, size_t byte_count,
 	if (dma_mask < 0x7fffffffUL)
 		return 0;
 
-	ct_addr = PHYS_TO_TIODMA(paddr);
+	if (SN_DMA_ADDRTYPE(dma_flags) == SN_DMA_ADDR_PHYS)
+		ct_addr = PHYS_TO_TIODMA(paddr);
+	else
+		ct_addr = paddr;
 
 	/*
 	 * If the device can generate 64 bit addresses, create a D64 map.
-	 * Since this should never fail, bypass the rest of the checks.
 	 */
 	if (dma_mask == ~0UL) {
-		mapaddr = tioce_dma_d64(ct_addr);
-		goto dma_map_done;
+		mapaddr = tioce_dma_d64(ct_addr, dma_flags);
+		if (mapaddr)
+			goto dma_map_done;
 	}
 
 	pcidev_to_tioce(pdev, NULL, &ce_kern, &port);
@@ -574,18 +591,22 @@ tioce_do_dma_map(struct pci_dev *pdev, u64 paddr, size_t byte_count,
 
 		if (byte_count > MB(64)) {
 			mapaddr = tioce_alloc_map(ce_kern, TIOCE_ATE_M40S,
-						  port, ct_addr, byte_count);
+						  port, ct_addr, byte_count,
+						  dma_flags);
 			if (!mapaddr)
 				mapaddr =
 				    tioce_alloc_map(ce_kern, TIOCE_ATE_M40, -1,
-						    ct_addr, byte_count);
+						    ct_addr, byte_count,
+						    dma_flags);
 		} else {
 			mapaddr = tioce_alloc_map(ce_kern, TIOCE_ATE_M40, -1,
-						  ct_addr, byte_count);
+						  ct_addr, byte_count,
+						  dma_flags);
 			if (!mapaddr)
 				mapaddr =
 				    tioce_alloc_map(ce_kern, TIOCE_ATE_M40S,
-						    port, ct_addr, byte_count);
+						    port, ct_addr, byte_count,
+						    dma_flags);
 		}
 	}
 
@@ -593,7 +614,7 @@ tioce_do_dma_map(struct pci_dev *pdev, u64 paddr, size_t byte_count,
 	 * 32-bit direct is the next mode to try
 	 */
 	if (!mapaddr && dma_mask >= 0xffffffffUL)
-		mapaddr = tioce_dma_d32(pdev, ct_addr);
+		mapaddr = tioce_dma_d32(pdev, ct_addr, dma_flags);
 
 	/*
 	 * Last resort, try 32-bit ATE-based map.
@@ -601,7 +622,7 @@ tioce_do_dma_map(struct pci_dev *pdev, u64 paddr, size_t byte_count,
 	if (!mapaddr)
 		mapaddr =
 		    tioce_alloc_map(ce_kern, TIOCE_ATE_M32, -1, ct_addr,
-				    byte_count);
+				    byte_count, dma_flags);
 
 	spin_unlock_irqrestore(&ce_kern->ce_lock, flags);
 
@@ -622,9 +643,9 @@ dma_map_done:
  * in the address.
  */
 static u64
-tioce_dma(struct pci_dev *pdev, u64 paddr, size_t byte_count)
+tioce_dma(struct pci_dev *pdev, u64 paddr, size_t byte_count, int dma_flags)
 {
-	return tioce_do_dma_map(pdev, paddr, byte_count, 0);
+	return tioce_do_dma_map(pdev, paddr, byte_count, 0, dma_flags);
 }
 
 /**
@@ -636,9 +657,9 @@ tioce_dma(struct pci_dev *pdev, u64 paddr, size_t byte_count)
  * Simply call tioce_do_dma_map() to create a map with the barrier bit set
  * in the address.
  */ static u64
-tioce_dma_consistent(struct pci_dev *pdev, u64 paddr, size_t byte_count)
+tioce_dma_consistent(struct pci_dev *pdev, u64 paddr, size_t byte_count, int dma_flags)
 {
-	return tioce_do_dma_map(pdev, paddr, byte_count, 1);
+	return tioce_do_dma_map(pdev, paddr, byte_count, 1, dma_flags);
 }
 
 /**
@@ -696,7 +717,7 @@ tioce_reserve_m32(struct tioce_kernel *ce_kern, u64 base, u64 limit)
 	while (ate_index <= last_ate) {
 		u64 ate;
 
-		ate = ATE_MAKE(0xdeadbeef, ps);
+		ate = ATE_MAKE(0xdeadbeef, ps, 0);
 		ce_kern->ce_ate3240_shadow[ate_index] = ate;
 		tioce_mmr_storei(ce_kern, &ce_mmr->ce_ure_ate3240[ate_index],
 				 ate);
diff --git a/drivers/pci/msi-altix.c b/drivers/pci/msi-altix.c
index 9bd240602c1e..bed4183a5e39 100644
--- a/drivers/pci/msi-altix.c
+++ b/drivers/pci/msi-altix.c
@@ -6,13 +6,205 @@
  * Copyright (C) 2006 Silicon Graphics, Inc.  All Rights Reserved.
  */
 
-#include <asm/errno.h>
+#include <linux/types.h>
+#include <linux/pci.h>
+#include <linux/cpumask.h>
+
+#include <asm/sn/addrs.h>
+#include <asm/sn/intr.h>
+#include <asm/sn/pcibus_provider_defs.h>
+#include <asm/sn/pcidev.h>
+#include <asm/sn/nodepda.h>
+
+#include "msi.h"
+
+struct sn_msi_info {
+	u64 pci_addr;
+	struct sn_irq_info *sn_irq_info;
+};
+
+static struct sn_msi_info *sn_msi_info;
+
+static void
+sn_msi_teardown(unsigned int vector)
+{
+	nasid_t nasid;
+	int widget;
+	struct pci_dev *pdev;
+	struct pcidev_info *sn_pdev;
+	struct sn_irq_info *sn_irq_info;
+	struct pcibus_bussoft *bussoft;
+	struct sn_pcibus_provider *provider;
+
+	sn_irq_info = sn_msi_info[vector].sn_irq_info;
+	if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0)
+		return;
+
+	sn_pdev = (struct pcidev_info *)sn_irq_info->irq_pciioinfo;
+	pdev = sn_pdev->pdi_linux_pcidev;
+	provider = SN_PCIDEV_BUSPROVIDER(pdev);
+
+	(*provider->dma_unmap)(pdev,
+			       sn_msi_info[vector].pci_addr,
+			       PCI_DMA_FROMDEVICE);
+	sn_msi_info[vector].pci_addr = 0;
+
+	bussoft = SN_PCIDEV_BUSSOFT(pdev);
+	nasid = NASID_GET(bussoft->bs_base);
+	widget = (nasid & 1) ?
+			TIO_SWIN_WIDGETNUM(bussoft->bs_base) :
+			SWIN_WIDGETNUM(bussoft->bs_base);
+
+	sn_intr_free(nasid, widget, sn_irq_info);
+	sn_msi_info[vector].sn_irq_info = NULL;
+
+	return;
+}
 
 int
-sn_msi_init(void)
+sn_msi_setup(struct pci_dev *pdev, unsigned int vector,
+	     u32 *addr_hi, u32 *addr_lo, u32 *data)
 {
+	int widget;
+	int status;
+	nasid_t nasid;
+	u64 bus_addr;
+	struct sn_irq_info *sn_irq_info;
+	struct pcibus_bussoft *bussoft = SN_PCIDEV_BUSSOFT(pdev);
+	struct sn_pcibus_provider *provider = SN_PCIDEV_BUSPROVIDER(pdev);
+
+	if (bussoft == NULL)
+		return -EINVAL;
+
+	if (provider == NULL || provider->dma_map_consistent == NULL)
+		return -EINVAL;
+
+	/*
+	 * Set up the vector plumbing.  Let the prom (via sn_intr_alloc)
+	 * decide which cpu to direct this msi at by default.
+	 */
+
+	nasid = NASID_GET(bussoft->bs_base);
+	widget = (nasid & 1) ?
+			TIO_SWIN_WIDGETNUM(bussoft->bs_base) :
+			SWIN_WIDGETNUM(bussoft->bs_base);
+
+	sn_irq_info = kzalloc(sizeof(struct sn_irq_info), GFP_KERNEL);
+	if (! sn_irq_info)
+		return -ENOMEM;
+
+	status = sn_intr_alloc(nasid, widget, sn_irq_info, vector, -1, -1);
+	if (status) {
+		kfree(sn_irq_info);
+		return -ENOMEM;
+	}
+
+	sn_irq_info->irq_int_bit = -1;		/* mark this as an MSI irq */
+	sn_irq_fixup(pdev, sn_irq_info);
+
+	/* Prom probably should fill these in, but doesn't ... */
+	sn_irq_info->irq_bridge_type = bussoft->bs_asic_type;
+	sn_irq_info->irq_bridge = (void *)bussoft->bs_base;
+
 	/*
-	 * return error until MSI is supported on altix platforms
+	 * Map the xio address into bus space
 	 */
-	return -EINVAL;
+	bus_addr = (*provider->dma_map_consistent)(pdev,
+					sn_irq_info->irq_xtalkaddr,
+					sizeof(sn_irq_info->irq_xtalkaddr),
+					SN_DMA_MSI|SN_DMA_ADDR_XIO);
+	if (! bus_addr) {
+		sn_intr_free(nasid, widget, sn_irq_info);
+		kfree(sn_irq_info);
+		return -ENOMEM;
+	}
+
+	sn_msi_info[vector].sn_irq_info = sn_irq_info;
+	sn_msi_info[vector].pci_addr = bus_addr;
+
+	*addr_hi = (u32)(bus_addr >> 32);
+	*addr_lo = (u32)(bus_addr & 0x00000000ffffffff);
+
+	/*
+	 * In the SN platform, bit 16 is a "send vector" bit which
+	 * must be present in order to move the vector through the system.
+	 */
+	*data = 0x100 + (unsigned int)vector;
+
+#ifdef CONFIG_SMP
+	set_irq_affinity_info((vector & 0xff), sn_irq_info->irq_cpuid, 0);
+#endif
+
+	return 0;
+}
+
+static void
+sn_msi_target(unsigned int vector, unsigned int cpu,
+	      u32 *addr_hi, u32 *addr_lo)
+{
+	int slice;
+	nasid_t nasid;
+	u64 bus_addr;
+	struct pci_dev *pdev;
+	struct pcidev_info *sn_pdev;
+	struct sn_irq_info *sn_irq_info;
+	struct sn_irq_info *new_irq_info;
+	struct sn_pcibus_provider *provider;
+
+	sn_irq_info = sn_msi_info[vector].sn_irq_info;
+	if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0)
+		return;
+
+	/*
+	 * Release XIO resources for the old MSI PCI address
+	 */
+
+        sn_pdev = (struct pcidev_info *)sn_irq_info->irq_pciioinfo;
+	pdev = sn_pdev->pdi_linux_pcidev;
+	provider = SN_PCIDEV_BUSPROVIDER(pdev);
+
+	bus_addr = (u64)(*addr_hi) << 32 | (u64)(*addr_lo);
+	(*provider->dma_unmap)(pdev, bus_addr, PCI_DMA_FROMDEVICE);
+	sn_msi_info[vector].pci_addr = 0;
+
+	nasid = cpuid_to_nasid(cpu);
+	slice = cpuid_to_slice(cpu);
+
+	new_irq_info = sn_retarget_vector(sn_irq_info, nasid, slice);
+	sn_msi_info[vector].sn_irq_info = new_irq_info;
+	if (new_irq_info == NULL)
+		return;
+
+	/*
+	 * Map the xio address into bus space
+	 */
+
+	bus_addr = (*provider->dma_map_consistent)(pdev,
+					new_irq_info->irq_xtalkaddr,
+					sizeof(new_irq_info->irq_xtalkaddr),
+					SN_DMA_MSI|SN_DMA_ADDR_XIO);
+
+	sn_msi_info[vector].pci_addr = bus_addr;
+	*addr_hi = (u32)(bus_addr >> 32);
+	*addr_lo = (u32)(bus_addr & 0x00000000ffffffff);
+}
+
+struct msi_ops sn_msi_ops = {
+	.setup = sn_msi_setup,
+	.teardown = sn_msi_teardown,
+#ifdef CONFIG_SMP
+	.target = sn_msi_target,
+#endif
+};
+
+int
+sn_msi_init(void)
+{
+	sn_msi_info =
+		kzalloc(sizeof(struct sn_msi_info) * NR_VECTORS, GFP_KERNEL);
+	if (! sn_msi_info)
+		return -ENOMEM;
+
+	msi_register(&sn_msi_ops);
+	return 0;
 }
diff --git a/include/asm-ia64/sn/intr.h b/include/asm-ia64/sn/intr.h
index 60a51a406eec..12b54ddb06be 100644
--- a/include/asm-ia64/sn/intr.h
+++ b/include/asm-ia64/sn/intr.h
@@ -10,6 +10,7 @@
 #define _ASM_IA64_SN_INTR_H
 
 #include <linux/rcupdate.h>
+#include <asm/sn/types.h>
 
 #define SGI_UART_VECTOR		0xe9
 
@@ -40,6 +41,7 @@ struct sn_irq_info {
 	int		irq_cpuid;	/* kernel logical cpuid	     */
 	int		irq_irq;	/* the IRQ number */
 	int		irq_int_bit;	/* Bridge interrupt pin */
+					/* <0 means MSI */
 	u64	irq_xtalkaddr;	/* xtalkaddr IRQ is sent to  */
 	int		irq_bridge_type;/* pciio asic type (pciio.h) */
 	void	       *irq_bridge;	/* bridge generating irq     */
@@ -53,6 +55,12 @@ struct sn_irq_info {
 };
 
 extern void sn_send_IPI_phys(int, long, int, int);
+extern u64 sn_intr_alloc(nasid_t, int,
+			      struct sn_irq_info *,
+			      int, nasid_t, int);
+extern void sn_intr_free(nasid_t, int, struct sn_irq_info *);
+extern struct sn_irq_info *sn_retarget_vector(struct sn_irq_info *, nasid_t, int);
+extern struct list_head **sn_irq_lh;
 
 #define CPU_VECTOR_TO_IRQ(cpuid,vector) (vector)
 
diff --git a/include/asm-ia64/sn/pcibr_provider.h b/include/asm-ia64/sn/pcibr_provider.h
index 51260ab70d91..e3b0c3fe5eed 100644
--- a/include/asm-ia64/sn/pcibr_provider.h
+++ b/include/asm-ia64/sn/pcibr_provider.h
@@ -55,6 +55,7 @@
 #define PCI32_ATE_V                     (0x1 << 0)
 #define PCI32_ATE_CO                    (0x1 << 1)
 #define PCI32_ATE_PREC                  (0x1 << 2)
+#define PCI32_ATE_MSI                   (0x1 << 2)
 #define PCI32_ATE_PREF                  (0x1 << 3)
 #define PCI32_ATE_BAR                   (0x1 << 4)
 #define PCI32_ATE_ADDR_SHFT             12
@@ -117,8 +118,8 @@ struct pcibus_info {
 
 extern int  pcibr_init_provider(void);
 extern void *pcibr_bus_fixup(struct pcibus_bussoft *, struct pci_controller *);
-extern dma_addr_t pcibr_dma_map(struct pci_dev *, unsigned long, size_t);
-extern dma_addr_t pcibr_dma_map_consistent(struct pci_dev *, unsigned long, size_t);
+extern dma_addr_t pcibr_dma_map(struct pci_dev *, unsigned long, size_t, int type);
+extern dma_addr_t pcibr_dma_map_consistent(struct pci_dev *, unsigned long, size_t, int type);
 extern void pcibr_dma_unmap(struct pci_dev *, dma_addr_t, int);
 
 /*
diff --git a/include/asm-ia64/sn/pcibus_provider_defs.h b/include/asm-ia64/sn/pcibus_provider_defs.h
index ce3f6c328241..8f7c83d0f6d3 100644
--- a/include/asm-ia64/sn/pcibus_provider_defs.h
+++ b/include/asm-ia64/sn/pcibus_provider_defs.h
@@ -3,7 +3,7 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  *
- * Copyright (C) 1992 - 1997, 2000-2004 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 1992 - 1997, 2000-2005 Silicon Graphics, Inc. All rights reserved.
  */
 #ifndef _ASM_IA64_SN_PCI_PCIBUS_PROVIDER_H
 #define _ASM_IA64_SN_PCI_PCIBUS_PROVIDER_H
@@ -45,13 +45,24 @@ struct pci_controller;
  */
 
 struct sn_pcibus_provider {
-	dma_addr_t	(*dma_map)(struct pci_dev *, unsigned long, size_t);
-	dma_addr_t	(*dma_map_consistent)(struct pci_dev *, unsigned long, size_t);
+	dma_addr_t	(*dma_map)(struct pci_dev *, unsigned long, size_t, int flags);
+	dma_addr_t	(*dma_map_consistent)(struct pci_dev *, unsigned long, size_t, int flags);
 	void		(*dma_unmap)(struct pci_dev *, dma_addr_t, int);
 	void *		(*bus_fixup)(struct pcibus_bussoft *, struct pci_controller *);
  	void		(*force_interrupt)(struct sn_irq_info *);
  	void		(*target_interrupt)(struct sn_irq_info *);
 };
 
+/*
+ * Flags used by the map interfaces
+ * bits 3:0 specifies format of passed in address
+ * bit  4   specifies that address is to be used for MSI
+ */
+
+#define SN_DMA_ADDRTYPE(x)	((x) & 0xf)
+#define     SN_DMA_ADDR_PHYS	1	/* address is an xio address. */
+#define     SN_DMA_ADDR_XIO	2	/* address is phys memory */
+#define SN_DMA_MSI		0x10	/* Bus address is to be used for MSI */
+
 extern struct sn_pcibus_provider *sn_pci_provider[];
 #endif				/* _ASM_IA64_SN_PCI_PCIBUS_PROVIDER_H */
diff --git a/include/asm-ia64/sn/tiocp.h b/include/asm-ia64/sn/tiocp.h
index f47c08ab483c..e8ad0bb5b6c5 100644
--- a/include/asm-ia64/sn/tiocp.h
+++ b/include/asm-ia64/sn/tiocp.h
@@ -3,13 +3,14 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  *
- * Copyright (C) 2003-2004 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2003-2005 Silicon Graphics, Inc. All rights reserved.
  */
 #ifndef _ASM_IA64_SN_PCI_TIOCP_H
 #define _ASM_IA64_SN_PCI_TIOCP_H
 
 #define TIOCP_HOST_INTR_ADDR            0x003FFFFFFFFFFFFFUL
 #define TIOCP_PCI64_CMDTYPE_MEM         (0x1ull << 60)
+#define TIOCP_PCI64_CMDTYPE_MSI         (0x3ull << 60)
 
 
 /*****************************************************************************
-- 
cgit v1.2.3


From a1d7057727bc9e20a0a417812f538fe6b8a02c03 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Wed, 31 May 2006 08:58:08 -0500
Subject: [IA64-SGI] SN topology fix for large systems

There is an SN bug in sn_hwperf.c that affects systems with 1024n or 1024p.
The bug manifests itself 2 ways: IO interrupts are not always
targeted to the nearest node, and 2) the "cat /proc/sgi_sn/sn_topology"
commands fails with "cannot allocate memory".

The code is using the wrong macros for validating node numbers.

Signed-off-by: Jack Steiner <steiner@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/kernel/sn2/sn_hwperf.c | 50 ++++++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 23 deletions(-)

(limited to 'arch/ia64/sn')

diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
index 739c948dc504..9a8a29339d2d 100644
--- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c
+++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
@@ -51,6 +51,8 @@ static nasid_t sn_hwperf_master_nasid = INVALID_NASID;
 static int sn_hwperf_init(void);
 static DECLARE_MUTEX(sn_hwperf_init_mutex);
 
+#define cnode_possible(n)	((n) < num_cnodes)
+
 static int sn_hwperf_enum_objects(int *nobj, struct sn_hwperf_object_info **ret)
 {
 	int e;
@@ -127,14 +129,14 @@ static int sn_hwperf_geoid_to_cnode(char *location)
 		}
 	}
 
-	return node_possible(cnode) ? cnode : -1;
+	return cnode_possible(cnode) ? cnode : -1;
 }
 
 static int sn_hwperf_obj_to_cnode(struct sn_hwperf_object_info * obj)
 {
 	if (!SN_HWPERF_IS_NODE(obj) && !SN_HWPERF_IS_IONODE(obj))
 		BUG();
-	if (!obj->sn_hwp_this_part)
+	if (SN_HWPERF_FOREIGN(obj))
 		return -1;
 	return sn_hwperf_geoid_to_cnode(obj->location);
 }
@@ -199,12 +201,12 @@ static void print_pci_topology(struct seq_file *s)
 
 static inline int sn_hwperf_has_cpus(cnodeid_t node)
 {
-	return node_online(node) && nr_cpus_node(node);
+	return node < MAX_NUMNODES && node_online(node) && nr_cpus_node(node);
 }
 
 static inline int sn_hwperf_has_mem(cnodeid_t node)
 {
-	return node_online(node) && NODE_DATA(node)->node_present_pages;
+	return node < MAX_NUMNODES && node_online(node) && NODE_DATA(node)->node_present_pages;
 }
 
 static struct sn_hwperf_object_info *
@@ -237,7 +239,7 @@ static int sn_hwperf_get_nearest_node_objdata(struct sn_hwperf_object_info *objb
 	int found_mem = 0;
 	int found_cpu = 0;
 
-	if (!node_possible(node))
+	if (!cnode_possible(node))
 		return -EINVAL;
 
 	if (sn_hwperf_has_cpus(node)) {
@@ -442,7 +444,7 @@ static int sn_topology_show(struct seq_file *s, void *d)
 	seq_printf(s, "%s %d %s %s asic %s", slabname, ordinal, obj->location,
 		obj->sn_hwp_this_part ? "local" : "shared", obj->name);
 
-	if (!SN_HWPERF_IS_NODE(obj) && !SN_HWPERF_IS_IONODE(obj))
+	if (ordinal < 0 || (!SN_HWPERF_IS_NODE(obj) && !SN_HWPERF_IS_IONODE(obj)))
 		seq_putc(s, '\n');
 	else {
 		cnodeid_t near_mem = -1;
@@ -468,22 +470,24 @@ static int sn_topology_show(struct seq_file *s, void *d)
 		/*
 		 * CPUs on this node, if any
 		 */
-		cpumask = node_to_cpumask(ordinal);
-		for_each_online_cpu(i) {
-			if (cpu_isset(i, cpumask)) {
-				slice = 'a' + cpuid_to_slice(i);
-				c = cpu_data(i);
-				seq_printf(s, "cpu %d %s%c local"
-					" freq %luMHz, arch ia64",
-					i, obj->location, slice,
-					c->proc_freq / 1000000);
-				for_each_online_cpu(j) {
-					seq_printf(s, j ? ":%d" : ", dist %d",
-						node_distance(
-						    cpu_to_node(i),
-						    cpu_to_node(j)));
+		if (!SN_HWPERF_IS_IONODE(obj)) {
+			cpumask = node_to_cpumask(ordinal);
+			for_each_online_cpu(i) {
+				if (cpu_isset(i, cpumask)) {
+					slice = 'a' + cpuid_to_slice(i);
+					c = cpu_data(i);
+					seq_printf(s, "cpu %d %s%c local"
+						" freq %luMHz, arch ia64",
+						i, obj->location, slice,
+						c->proc_freq / 1000000);
+					for_each_online_cpu(j) {
+						seq_printf(s, j ? ":%d" : ", dist %d",
+							node_distance(
+						    	cpu_to_node(i),
+						    	cpu_to_node(j)));
+					}
+					seq_putc(s, '\n');
 				}
-				seq_putc(s, '\n');
 			}
 		}
 	}
@@ -523,7 +527,7 @@ static int sn_topology_show(struct seq_file *s, void *d)
 			if (obj->sn_hwp_this_part && p->sn_hwp_this_part)
 				/* both ends local to this partition */
 				seq_puts(s, " local");
-			else if (!obj->sn_hwp_this_part && !p->sn_hwp_this_part)
+			else if (SN_HWPERF_FOREIGN(p))
 				/* both ends of the link in foreign partiton */
 				seq_puts(s, " foreign");
 			else
@@ -776,7 +780,7 @@ sn_hwperf_ioctl(struct inode *in, struct file *fp, u32 op, u64 arg)
 
 	case SN_HWPERF_GET_NODE_NASID:
 		if (a.sz != sizeof(u64) ||
-		   (node = a.arg) < 0 || !node_possible(node)) {
+		   (node = a.arg) < 0 || !cnode_possible(node)) {
 			r = -EINVAL;
 			goto error;
 		}
-- 
cgit v1.2.3


From f640f94ec4f39e8a4d91d58354d3e09b28769edf Mon Sep 17 00:00:00 2001
From: Mike Habeck <habeck@sgi.com>
Date: Mon, 19 Jun 2006 15:38:10 -0500
Subject: [IA64-SGI] fix SGI Altix tioce_bus_fixup() bug

The following patch fixes a bug in the SGI Altix tioce_bus_fixup()
code.  ce_dre_comp_err_addr needs to be zero'd out not ~0ULL.  As
a result completion errors weren't being captured.

Signed-off-by: Mike Habeck <habeck@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/pci/tioce_provider.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/ia64/sn')

diff --git a/arch/ia64/sn/pci/tioce_provider.c b/arch/ia64/sn/pci/tioce_provider.c
index 833295624e5d..85f3b3d4c606 100644
--- a/arch/ia64/sn/pci/tioce_provider.c
+++ b/arch/ia64/sn/pci/tioce_provider.c
@@ -3,7 +3,7 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  *
- * Copyright (C) 2003-2005 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (C) 2003-2006 Silicon Graphics, Inc.  All Rights Reserved.
  */
 
 #include <linux/types.h>
@@ -1002,7 +1002,7 @@ tioce_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *cont
 	tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_adm_int_status_alias, ~0ULL);
 	tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_adm_error_summary_alias,
 		       ~0ULL);
-	tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_dre_comp_err_addr, ~0ULL);
+	tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_dre_comp_err_addr, 0ULL);
 
 	if (request_irq(SGI_PCIASIC_ERROR,
 			tioce_error_intr_handler,
-- 
cgit v1.2.3


From 762834e8bf46bf41ce9034d062a7c1f8563175f3 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Fri, 23 Jun 2006 02:03:19 -0700
Subject: [PATCH] Unify pxm_to_node() and node_to_pxm()

Consolidate the various arch-specific implementations of pxm_to_node() and
node_to_pxm() into a single generic version.

Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig               |  6 ++++++
 arch/i386/kernel/srat.c         | 19 ++--------------
 arch/ia64/hp/common/sba_iommu.c |  2 +-
 arch/ia64/kernel/acpi.c         | 24 +++++++--------------
 arch/ia64/pci/pci.c             |  2 +-
 arch/ia64/sn/kernel/setup.c     |  4 ++--
 arch/x86_64/mm/srat.c           | 33 +---------------------------
 drivers/acpi/Kconfig            |  2 +-
 drivers/acpi/numa.c             | 48 +++++++++++++++++++++++++++++++++++++++++
 include/acpi/acpi_numa.h        | 23 ++++++++++++++++++++
 include/asm-x86_64/numa.h       |  1 -
 include/linux/acpi.h            |  9 ++++++++
 12 files changed, 102 insertions(+), 71 deletions(-)
 create mode 100644 include/acpi/acpi_numa.h

(limited to 'arch/ia64/sn')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 8dfa3054f10f..15d23da2455f 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -173,6 +173,12 @@ config ACPI_SRAT
 	bool
 	default y
 	depends on NUMA && (X86_SUMMIT || X86_GENERICARCH)
+	select ACPI_NUMA
+
+config HAVE_ARCH_PARSE_SRAT
+       bool
+       default y
+       depends on ACPI_SRAT
 
 config X86_SUMMIT_NUMA
 	bool
diff --git a/arch/i386/kernel/srat.c b/arch/i386/kernel/srat.c
index 52b3ed5d2cb5..989c85255dbe 100644
--- a/arch/i386/kernel/srat.c
+++ b/arch/i386/kernel/srat.c
@@ -39,7 +39,6 @@
 #define NODE_ARRAY_OFFSET(x)	((x) % 8)	/* 8 bits/char */
 #define BMAP_SET(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
 #define BMAP_TEST(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
-#define MAX_PXM_DOMAINS		256	/* 1 byte and no promises about values */
 /* bitmap length; _PXM is at most 255 */
 #define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) 
 static u8 pxm_bitmap[PXM_BITMAP_LEN];	/* bitmap of proximity domains */
@@ -213,19 +212,11 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
 		node_end_pfn[nid] = memory_chunk->end_pfn;
 }
 
-static u8 pxm_to_nid_map[MAX_PXM_DOMAINS];/* _PXM to logical node ID map */
-
-int pxm_to_node(int pxm)
-{
-	return pxm_to_nid_map[pxm];
-}
-
 /* Parse the ACPI Static Resource Affinity Table */
 static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
 {
 	u8 *start, *end, *p;
 	int i, j, nid;
-	u8 nid_to_pxm_map[MAX_NUMNODES];/* logical node ID to _PXM map */
 
 	start = (u8 *)(&(sratp->reserved) + 1);	/* skip header */
 	p = start;
@@ -235,10 +226,6 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
 	memset(node_memory_chunk, 0, sizeof(node_memory_chunk));
 	memset(zholes_size, 0, sizeof(zholes_size));
 
-	/* -1 in these maps means not available */
-	memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map));
-	memset(nid_to_pxm_map, -1, sizeof(nid_to_pxm_map));
-
 	num_memory_chunks = 0;
 	while (p < end) {
 		switch (*p) {
@@ -278,9 +265,7 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
 	nodes_clear(node_online_map);
 	for (i = 0; i < MAX_PXM_DOMAINS; i++) {
 		if (BMAP_TEST(pxm_bitmap, i)) {
-			nid = num_online_nodes();
-			pxm_to_nid_map[i] = nid;
-			nid_to_pxm_map[nid] = i;
+			int nid = acpi_map_pxm_to_node(i);
 			node_set_online(nid);
 		}
 	}
@@ -288,7 +273,7 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
 
 	/* set cnode id in memory chunk structure */
 	for (i = 0; i < num_memory_chunks; i++)
-		node_memory_chunk[i].nid = pxm_to_nid_map[node_memory_chunk[i].pxm];
+		node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
 
 	printk("pxm bitmap: ");
 	for (i = 0; i < sizeof(pxm_bitmap); i++) {
diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index bdccd0b1eb60..3ce443e6c016 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -1958,7 +1958,7 @@ sba_map_ioc_to_node(struct ioc *ioc, acpi_handle handle)
 	if (pxm < 0)
 		return;
 
-	node = pxm_to_nid_map[pxm];
+	node = pxm_to_node(pxm);
 
 	if (node >= MAX_NUMNODES || !node_online(node))
 		return;
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index 58c93a30348c..d1c52cf67882 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -415,9 +415,6 @@ static int __initdata srat_num_cpus;	/* number of cpus */
 static u32 __devinitdata pxm_flag[PXM_FLAG_LEN];
 #define pxm_bit_set(bit)	(set_bit(bit,(void *)pxm_flag))
 #define pxm_bit_test(bit)	(test_bit(bit,(void *)pxm_flag))
-/* maps to convert between proximity domain and logical node ID */
-int __devinitdata pxm_to_nid_map[MAX_PXM_DOMAINS];
-int __initdata nid_to_pxm_map[MAX_NUMNODES];
 static struct acpi_table_slit __initdata *slit_table;
 
 static int get_processor_proximity_domain(struct acpi_table_processor_affinity *pa)
@@ -533,22 +530,17 @@ void __init acpi_numa_arch_fixup(void)
 	 * MCD - This can probably be dropped now.  No need for pxm ID to node ID
 	 * mapping with sparse node numbering iff MAX_PXM_DOMAINS <= MAX_NUMNODES.
 	 */
-	/* calculate total number of nodes in system from PXM bitmap */
-	memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map));
-	memset(nid_to_pxm_map, -1, sizeof(nid_to_pxm_map));
 	nodes_clear(node_online_map);
 	for (i = 0; i < MAX_PXM_DOMAINS; i++) {
 		if (pxm_bit_test(i)) {
-			int nid = num_online_nodes();
-			pxm_to_nid_map[i] = nid;
-			nid_to_pxm_map[nid] = i;
+			int nid = acpi_map_pxm_to_node(i);
 			node_set_online(nid);
 		}
 	}
 
 	/* set logical node id in memory chunk structure */
 	for (i = 0; i < num_node_memblks; i++)
-		node_memblk[i].nid = pxm_to_nid_map[node_memblk[i].nid];
+		node_memblk[i].nid = pxm_to_node(node_memblk[i].nid);
 
 	/* assign memory bank numbers for each chunk on each node */
 	for_each_online_node(i) {
@@ -562,7 +554,7 @@ void __init acpi_numa_arch_fixup(void)
 
 	/* set logical node id in cpu structure */
 	for (i = 0; i < srat_num_cpus; i++)
-		node_cpuid[i].nid = pxm_to_nid_map[node_cpuid[i].nid];
+		node_cpuid[i].nid = pxm_to_node(node_cpuid[i].nid);
 
 	printk(KERN_INFO "Number of logical nodes in system = %d\n",
 	       num_online_nodes());
@@ -575,11 +567,11 @@ void __init acpi_numa_arch_fixup(void)
 	for (i = 0; i < slit_table->localities; i++) {
 		if (!pxm_bit_test(i))
 			continue;
-		node_from = pxm_to_nid_map[i];
+		node_from = pxm_to_node(i);
 		for (j = 0; j < slit_table->localities; j++) {
 			if (!pxm_bit_test(j))
 				continue;
-			node_to = pxm_to_nid_map[j];
+			node_to = pxm_to_node(j);
 			node_distance(node_from, node_to) =
 			    slit_table->entry[i * slit_table->localities + j];
 		}
@@ -785,9 +777,9 @@ int acpi_map_cpu2node(acpi_handle handle, int cpu, long physid)
 
 	/*
 	 * Assuming that the container driver would have set the proximity
-	 * domain and would have initialized pxm_to_nid_map[pxm_id] && pxm_flag
+	 * domain and would have initialized pxm_to_node(pxm_id) && pxm_flag
 	 */
-	node_cpuid[cpu].nid = (pxm_id < 0) ? 0 : pxm_to_nid_map[pxm_id];
+	node_cpuid[cpu].nid = (pxm_id < 0) ? 0 : pxm_to_node(pxm_id);
 
 	node_cpuid[cpu].phys_id = physid;
 #endif
@@ -966,7 +958,7 @@ acpi_map_iosapic(acpi_handle handle, u32 depth, void *context, void **ret)
 	if (pxm < 0)
 		return AE_OK;
 
-	node = pxm_to_nid_map[pxm];
+	node = pxm_to_node(pxm);
 
 	if (node >= MAX_NUMNODES || !node_online(node) ||
 	    cpus_empty(node_to_cpumask(node)))
diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c
index ab829a22f8a4..cf7751b99d1c 100644
--- a/arch/ia64/pci/pci.c
+++ b/arch/ia64/pci/pci.c
@@ -352,7 +352,7 @@ pci_acpi_scan_root(struct acpi_device *device, int domain, int bus)
 	pxm = acpi_get_pxm(controller->acpi_handle);
 #ifdef CONFIG_NUMA
 	if (pxm >= 0)
-		controller->node = pxm_to_nid_map[pxm];
+		controller->node = pxm_to_node(pxm);
 #endif
 
 	acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_window,
diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c
index 30988dfbddff..93577abae36d 100644
--- a/arch/ia64/sn/kernel/setup.c
+++ b/arch/ia64/sn/kernel/setup.c
@@ -139,7 +139,7 @@ static int __init pxm_to_nasid(int pxm)
 	int i;
 	int nid;
 
-	nid = pxm_to_nid_map[pxm];
+	nid = pxm_to_node(pxm);
 	for (i = 0; i < num_node_memblks; i++) {
 		if (node_memblk[i].nid == nid) {
 			return NASID_GET(node_memblk[i].start_paddr);
@@ -704,7 +704,7 @@ void __init build_cnode_tables(void)
 	 * cnode == node for all C & M bricks.
 	 */
 	for_each_online_node(node) {
-		nasid = pxm_to_nasid(nid_to_pxm_map[node]);
+		nasid = pxm_to_nasid(node_to_pxm(node));
 		sn_cnodeid_to_nasid[node] = nasid;
 		physical_node_map[nasid] = node;
 	}
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 474df22c6ed2..502fce65e96a 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -30,7 +30,6 @@
 static struct acpi_table_slit *acpi_slit;
 
 static nodemask_t nodes_parsed __initdata;
-static nodemask_t nodes_found __initdata;
 static struct bootnode nodes[MAX_NUMNODES] __initdata;
 static struct bootnode nodes_add[MAX_NUMNODES] __initdata;
 static int found_add_area __initdata;
@@ -38,33 +37,14 @@ int hotadd_percent __initdata = 0;
 #ifndef RESERVE_HOTADD
 #define hotadd_percent 0	/* Ignore all settings */
 #endif
-static u8 pxm2node[256] = { [0 ... 255] = 0xff };
 
 /* Too small nodes confuse the VM badly. Usually they result
    from BIOS bugs. */
 #define NODE_MIN_SIZE (4*1024*1024)
 
-static int node_to_pxm(int n);
-
-int pxm_to_node(int pxm)
-{
-	if ((unsigned)pxm >= 256)
-		return -1;
-	/* Extend 0xff to (int)-1 */
-	return (signed char)pxm2node[pxm];
-}
-
 static __init int setup_node(int pxm)
 {
-	unsigned node = pxm2node[pxm];
-	if (node == 0xff) {
-		if (nodes_weight(nodes_found) >= MAX_NUMNODES)
-			return -1;
-		node = first_unset_node(nodes_found); 
-		node_set(node, nodes_found);
-		pxm2node[pxm] = node;
-	}
-	return pxm2node[pxm];
+	return acpi_map_pxm_to_node(pxm);
 }
 
 static __init int conflicting_nodes(unsigned long start, unsigned long end)
@@ -440,17 +420,6 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 	return 0;
 }
 
-static int node_to_pxm(int n)
-{
-       int i;
-       if (pxm2node[n] == n)
-               return n;
-       for (i = 0; i < 256; i++)
-               if (pxm2node[i] == n)
-                       return i;
-       return 0;
-}
-
 void __init srat_reserve_add_area(int nodeid)
 {
 	if (found_add_area && nodes_add[nodeid].end) {
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index c24652d31bf9..230c53852231 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -162,7 +162,7 @@ config ACPI_THERMAL
 config ACPI_NUMA
 	bool "NUMA support"
 	depends on NUMA
-	depends on (IA64 || X86_64)
+	depends on (X86 || IA64)
 	default y if IA64_GENERIC || IA64_SGI_SN2
 
 config ACPI_ASUS
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 64b98e82feb7..e2c1a16078c9 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -36,12 +36,60 @@
 #define _COMPONENT	ACPI_NUMA
 ACPI_MODULE_NAME("numa")
 
+static nodemask_t nodes_found_map = NODE_MASK_NONE;
+#define PXM_INVAL	-1
+#define NID_INVAL	-1
+
+/* maps to convert between proximity domain and logical node ID */
+int __cpuinitdata pxm_to_node_map[MAX_PXM_DOMAINS]
+				= { [0 ... MAX_PXM_DOMAINS - 1] = NID_INVAL };
+int __cpuinitdata node_to_pxm_map[MAX_NUMNODES]
+				= { [0 ... MAX_NUMNODES - 1] = PXM_INVAL };
+
 extern int __init acpi_table_parse_madt_family(enum acpi_table_id id,
 					       unsigned long madt_size,
 					       int entry_id,
 					       acpi_madt_entry_handler handler,
 					       unsigned int max_entries);
 
+int __cpuinit pxm_to_node(int pxm)
+{
+	if (pxm < 0)
+		return NID_INVAL;
+	return pxm_to_node_map[pxm];
+}
+
+int __cpuinit node_to_pxm(int node)
+{
+	if (node < 0)
+		return PXM_INVAL;
+	return node_to_pxm_map[node];
+}
+
+int __cpuinit acpi_map_pxm_to_node(int pxm)
+{
+	int node = pxm_to_node_map[pxm];
+
+	if (node < 0){
+		if (nodes_weight(nodes_found_map) >= MAX_NUMNODES)
+			return NID_INVAL;
+		node = first_unset_node(nodes_found_map);
+		pxm_to_node_map[pxm] = node;
+		node_to_pxm_map[node] = pxm;
+		node_set(node, nodes_found_map);
+	}
+
+	return node;
+}
+
+void __cpuinit acpi_unmap_pxm_to_node(int node)
+{
+	int pxm = node_to_pxm_map[node];
+	pxm_to_node_map[pxm] = NID_INVAL;
+	node_to_pxm_map[node] = PXM_INVAL;
+	node_clear(node, nodes_found_map);
+}
+
 void __init acpi_table_print_srat_entry(acpi_table_entry_header * header)
 {
 
diff --git a/include/acpi/acpi_numa.h b/include/acpi/acpi_numa.h
new file mode 100644
index 000000000000..1049f2a0a6db
--- /dev/null
+++ b/include/acpi/acpi_numa.h
@@ -0,0 +1,23 @@
+#ifndef __ACPI_NUMA_H
+#define __ACPI_NUMA_H
+
+#ifdef CONFIG_ACPI_NUMA
+#include <linux/kernel.h>
+
+/* Proximity bitmap length */
+#if MAX_NUMNODES > 256
+#define MAX_PXM_DOMAINS MAX_NUMNODES
+#else
+#define MAX_PXM_DOMAINS (256) /* Old pxm spec is defined 8 bit */
+#endif
+
+extern int __cpuinitdata pxm_to_node_map[MAX_PXM_DOMAINS];
+extern int __cpuinitdata node_to_pxm_map[MAX_NUMNODES];
+
+extern int __cpuinit pxm_to_node(int);
+extern int __cpuinit node_to_pxm(int);
+extern int __cpuinit acpi_map_pxm_to_node(int);
+extern void __cpuinit acpi_unmap_pxm_to_node(int);
+
+#endif				/* CONFIG_ACPI_NUMA */
+#endif				/* __ACP_NUMA_H */
diff --git a/include/asm-x86_64/numa.h b/include/asm-x86_64/numa.h
index 1cc92fe02503..933ff11ece15 100644
--- a/include/asm-x86_64/numa.h
+++ b/include/asm-x86_64/numa.h
@@ -8,7 +8,6 @@ struct bootnode {
 };
 
 extern int compute_hash_shift(struct bootnode *nodes, int numnodes);
-extern int pxm_to_node(int nid);
 
 #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 1cf0b91d05bd..90d6df1551ed 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -37,6 +37,7 @@
 #include <acpi/acpi.h>
 #include <acpi/acpi_bus.h>
 #include <acpi/acpi_drivers.h>
+#include <acpi/acpi_numa.h>
 #include <asm/acpi.h>
 
 
@@ -407,10 +408,18 @@ void acpi_table_print_madt_entry (acpi_table_entry_header *madt);
 void acpi_table_print_srat_entry (acpi_table_entry_header *srat);
 
 /* the following four functions are architecture-dependent */
+#ifdef CONFIG_HAVE_ARCH_PARSE_SRAT
+#define NR_NODE_MEMBLKS MAX_NUMNODES
+#define acpi_numa_slit_init(slit) do {} while (0)
+#define acpi_numa_processor_affinity_init(pa) do {} while (0)
+#define acpi_numa_memory_affinity_init(ma) do {} while (0)
+#define acpi_numa_arch_fixup() do {} while (0)
+#else
 void acpi_numa_slit_init (struct acpi_table_slit *slit);
 void acpi_numa_processor_affinity_init (struct acpi_table_processor_affinity *pa);
 void acpi_numa_memory_affinity_init (struct acpi_table_memory_affinity *ma);
 void acpi_numa_arch_fixup(void);
+#endif
 
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
 /* Arch dependent functions for cpu hotplug support */
-- 
cgit v1.2.3


From 929f97276bcf7f4a95272ed08a85339b98ba210d Mon Sep 17 00:00:00 2001
From: Dean Nelson <dcn@sgi.com>
Date: Fri, 23 Jun 2006 02:03:21 -0700
Subject: [PATCH] change gen_pool allocator to not touch managed memory

Modify the gen_pool allocator (lib/genalloc.c) to utilize a bitmap scheme
instead of the buddy scheme.  The purpose of this change is to eliminate
the touching of the actual memory being allocated.

Since the change modifies the interface, a change to the uncached allocator
(arch/ia64/kernel/uncached.c) is also required.

Both Andrey Volkov and Jes Sorenson have expressed a desire that the
gen_pool allocator not write to the memory being managed. See the
following:

  http://marc.theaimsgroup.com/?l=linux-kernel&m=113518602713125&w=2
  http://marc.theaimsgroup.com/?l=linux-kernel&m=113533568827916&w=2

Signed-off-by: Dean Nelson <dcn@sgi.com>
Cc: Andrey Volkov <avolkov@varma-el.com>
Acked-by: Jes Sorensen <jes@trained-monkey.org>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/kernel/uncached.c     | 200 +++++++++++++++---------------
 arch/ia64/sn/kernel/sn2/cache.c |  15 ++-
 include/linux/genalloc.h        |  35 +++---
 lib/genalloc.c                  | 263 ++++++++++++++++++----------------------
 4 files changed, 252 insertions(+), 261 deletions(-)

(limited to 'arch/ia64/sn')

diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c
index fcd2bad0286f..5f03b9e524dd 100644
--- a/arch/ia64/kernel/uncached.c
+++ b/arch/ia64/kernel/uncached.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2001-2005 Silicon Graphics, Inc.  All rights reserved.
+ * Copyright (C) 2001-2006 Silicon Graphics, Inc.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of version 2 of the GNU General Public License
@@ -29,15 +29,8 @@
 #include <asm/tlbflush.h>
 #include <asm/sn/arch.h>
 
-#define DEBUG	0
 
-#if DEBUG
-#define dprintk			printk
-#else
-#define dprintk(x...)		do { } while (0)
-#endif
-
-void __init efi_memmap_walk_uc (efi_freemem_callback_t callback);
+extern void __init efi_memmap_walk_uc(efi_freemem_callback_t, void *);
 
 #define MAX_UNCACHED_GRANULES	5
 static int allocated_granules;
@@ -60,6 +53,7 @@ static void uncached_ipi_visibility(void *data)
 static void uncached_ipi_mc_drain(void *data)
 {
 	int status;
+
 	status = ia64_pal_mc_drain();
 	if (status)
 		printk(KERN_WARNING "ia64_pal_mc_drain() failed with %i on "
@@ -67,30 +61,35 @@ static void uncached_ipi_mc_drain(void *data)
 }
 
 
-static unsigned long
-uncached_get_new_chunk(struct gen_pool *poolp)
+/*
+ * Add a new chunk of uncached memory pages to the specified pool.
+ *
+ * @pool: pool to add new chunk of uncached memory to
+ * @nid: node id of node to allocate memory from, or -1
+ *
+ * This is accomplished by first allocating a granule of cached memory pages
+ * and then converting them to uncached memory pages.
+ */
+static int uncached_add_chunk(struct gen_pool *pool, int nid)
 {
 	struct page *page;
-	void *tmp;
 	int status, i;
-	unsigned long addr, node;
+	unsigned long c_addr, uc_addr;
 
 	if (allocated_granules >= MAX_UNCACHED_GRANULES)
-		return 0;
+		return -1;
+
+	/* attempt to allocate a granule's worth of cached memory pages */
 
-	node = poolp->private;
-	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO,
+	page = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO,
 				IA64_GRANULE_SHIFT-PAGE_SHIFT);
+	if (!page)
+		return -1;
 
-	dprintk(KERN_INFO "get_new_chunk page %p, addr %lx\n",
-		page, (unsigned long)(page-vmem_map) << PAGE_SHIFT);
+	/* convert the memory pages from cached to uncached */
 
-	/*
-	 * Do magic if no mem on local node! XXX
-	 */
-	if (!page)
-		return 0;
-	tmp = page_address(page);
+	c_addr = (unsigned long)page_address(page);
+	uc_addr = c_addr - PAGE_OFFSET + __IA64_UNCACHED_OFFSET;
 
 	/*
 	 * There's a small race here where it's possible for someone to
@@ -100,76 +99,90 @@ uncached_get_new_chunk(struct gen_pool *poolp)
 	for (i = 0; i < (IA64_GRANULE_SIZE / PAGE_SIZE); i++)
 		SetPageUncached(&page[i]);
 
-	flush_tlb_kernel_range(tmp, tmp + IA64_GRANULE_SIZE);
+	flush_tlb_kernel_range(uc_addr, uc_adddr + IA64_GRANULE_SIZE);
 
 	status = ia64_pal_prefetch_visibility(PAL_VISIBILITY_PHYSICAL);
-
-	dprintk(KERN_INFO "pal_prefetch_visibility() returns %i on cpu %i\n",
-		status, raw_smp_processor_id());
-
 	if (!status) {
 		status = smp_call_function(uncached_ipi_visibility, NULL, 0, 1);
 		if (status)
-			printk(KERN_WARNING "smp_call_function failed for "
-			       "uncached_ipi_visibility! (%i)\n", status);
+			goto failed;
 	}
 
+	preempt_disable();
+
 	if (ia64_platform_is("sn2"))
-		sn_flush_all_caches((unsigned long)tmp, IA64_GRANULE_SIZE);
+		sn_flush_all_caches(uc_addr, IA64_GRANULE_SIZE);
 	else
-		flush_icache_range((unsigned long)tmp,
-				   (unsigned long)tmp+IA64_GRANULE_SIZE);
+		flush_icache_range(uc_addr, uc_addr + IA64_GRANULE_SIZE);
+
+	/* flush the just introduced uncached translation from the TLB */
+	local_flush_tlb_all();
+
+	preempt_enable();
 
 	ia64_pal_mc_drain();
 	status = smp_call_function(uncached_ipi_mc_drain, NULL, 0, 1);
 	if (status)
-		printk(KERN_WARNING "smp_call_function failed for "
-		       "uncached_ipi_mc_drain! (%i)\n", status);
+		goto failed;
 
-	addr = (unsigned long)tmp - PAGE_OFFSET + __IA64_UNCACHED_OFFSET;
+	/*
+	 * The chunk of memory pages has been converted to uncached so now we
+	 * can add it to the pool.
+	 */
+	status = gen_pool_add(pool, uc_addr, IA64_GRANULE_SIZE, nid);
+	if (status)
+		goto failed;
 
 	allocated_granules++;
-	return addr;
+	return 0;
+
+	/* failed to convert or add the chunk so give it back to the kernel */
+failed:
+	for (i = 0; i < (IA64_GRANULE_SIZE / PAGE_SIZE); i++)
+		ClearPageUncached(&page[i]);
+
+	free_pages(c_addr, IA64_GRANULE_SHIFT-PAGE_SHIFT);
+	return -1;
 }
 
 
 /*
  * uncached_alloc_page
  *
+ * @starting_nid: node id of node to start with, or -1
+ *
  * Allocate 1 uncached page. Allocates on the requested node. If no
  * uncached pages are available on the requested node, roundrobin starting
- * with higher nodes.
+ * with the next higher node.
  */
-unsigned long
-uncached_alloc_page(int nid)
+unsigned long uncached_alloc_page(int starting_nid)
 {
-	unsigned long maddr;
+	unsigned long uc_addr;
+	struct gen_pool *pool;
+	int nid;
 
-	maddr = gen_pool_alloc(uncached_pool[nid], PAGE_SIZE);
+	if (unlikely(starting_nid >= MAX_NUMNODES))
+		return 0;
 
-	dprintk(KERN_DEBUG "uncached_alloc_page returns %lx on node %i\n",
-		maddr, nid);
+	if (starting_nid < 0)
+		starting_nid = numa_node_id();
+	nid = starting_nid;
 
-	/*
-	 * If no memory is availble on our local node, try the
-	 * remaining nodes in the system.
-	 */
-	if (!maddr) {
-		int i;
-
-		for (i = MAX_NUMNODES - 1; i >= 0; i--) {
-			if (i == nid || !node_online(i))
-				continue;
-			maddr = gen_pool_alloc(uncached_pool[i], PAGE_SIZE);
-			dprintk(KERN_DEBUG "uncached_alloc_page alternate search "
-				"returns %lx on node %i\n", maddr, i);
-			if (maddr) {
-				break;
-			}
-		}
-	}
+	do {
+		if (!node_online(nid))
+			continue;
+		pool = uncached_pool[nid];
+		if (pool == NULL)
+			continue;
+		do {
+			uc_addr = gen_pool_alloc(pool, PAGE_SIZE);
+			if (uc_addr != 0)
+				return uc_addr;
+		} while (uncached_add_chunk(pool, nid) == 0);
+
+	} while ((nid = (nid + 1) % MAX_NUMNODES) != starting_nid);
 
-	return maddr;
+	return 0;
 }
 EXPORT_SYMBOL(uncached_alloc_page);
 
@@ -177,21 +190,22 @@ EXPORT_SYMBOL(uncached_alloc_page);
 /*
  * uncached_free_page
  *
+ * @uc_addr: uncached address of page to free
+ *
  * Free a single uncached page.
  */
-void
-uncached_free_page(unsigned long maddr)
+void uncached_free_page(unsigned long uc_addr)
 {
-	int node;
-
-	node = paddr_to_nid(maddr - __IA64_UNCACHED_OFFSET);
+	int nid = paddr_to_nid(uc_addr - __IA64_UNCACHED_OFFSET);
+	struct gen_pool *pool = uncached_pool[nid];
 
-	dprintk(KERN_DEBUG "uncached_free_page(%lx) on node %i\n", maddr, node);
+	if (unlikely(pool == NULL))
+		return;
 
-	if ((maddr & (0XFUL << 60)) != __IA64_UNCACHED_OFFSET)
-		panic("uncached_free_page invalid address %lx\n", maddr);
+	if ((uc_addr & (0XFUL << 60)) != __IA64_UNCACHED_OFFSET)
+		panic("uncached_free_page invalid address %lx\n", uc_addr);
 
-	gen_pool_free(uncached_pool[node], maddr, PAGE_SIZE);
+	gen_pool_free(pool, uc_addr, PAGE_SIZE);
 }
 EXPORT_SYMBOL(uncached_free_page);
 
@@ -199,43 +213,39 @@ EXPORT_SYMBOL(uncached_free_page);
 /*
  * uncached_build_memmap,
  *
+ * @uc_start: uncached starting address of a chunk of uncached memory
+ * @uc_end: uncached ending address of a chunk of uncached memory
+ * @arg: ignored, (NULL argument passed in on call to efi_memmap_walk_uc())
+ *
  * Called at boot time to build a map of pages that can be used for
  * memory special operations.
  */
-static int __init
-uncached_build_memmap(unsigned long start, unsigned long end, void *arg)
+static int __init uncached_build_memmap(unsigned long uc_start,
+					unsigned long uc_end, void *arg)
 {
-	long length = end - start;
-	int node;
-
-	dprintk(KERN_ERR "uncached_build_memmap(%lx %lx)\n", start, end);
+	int nid = paddr_to_nid(uc_start - __IA64_UNCACHED_OFFSET);
+	struct gen_pool *pool = uncached_pool[nid];
+	size_t size = uc_end - uc_start;
 
 	touch_softlockup_watchdog();
-	memset((char *)start, 0, length);
 
-	node = paddr_to_nid(start - __IA64_UNCACHED_OFFSET);
-
-	for (; start < end ; start += PAGE_SIZE) {
-		dprintk(KERN_INFO "sticking %lx into the pool!\n", start);
-		gen_pool_free(uncached_pool[node], start, PAGE_SIZE);
+	if (pool != NULL) {
+		memset((char *)uc_start, 0, size);
+		(void) gen_pool_add(pool, uc_start, size, nid);
 	}
-
 	return 0;
 }
 
 
-static int __init uncached_init(void) {
-	int i;
+static int __init uncached_init(void)
+{
+	int nid;
 
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		if (!node_online(i))
-			continue;
-		uncached_pool[i] = gen_pool_create(0, IA64_GRANULE_SHIFT,
-						   &uncached_get_new_chunk, i);
+	for_each_online_node(nid) {
+		uncached_pool[nid] = gen_pool_create(PAGE_SHIFT, nid);
 	}
 
-	efi_memmap_walk_uc(uncached_build_memmap);
-
+	efi_memmap_walk_uc(uncached_build_memmap, NULL);
 	return 0;
 }
 
diff --git a/arch/ia64/sn/kernel/sn2/cache.c b/arch/ia64/sn/kernel/sn2/cache.c
index bc3cfa17cd0f..2862cb33026d 100644
--- a/arch/ia64/sn/kernel/sn2/cache.c
+++ b/arch/ia64/sn/kernel/sn2/cache.c
@@ -3,11 +3,12 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  * 
- * Copyright (C) 2001-2003 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2001-2003, 2006 Silicon Graphics, Inc. All rights reserved.
  *
  */
 #include <linux/module.h>
 #include <asm/pgalloc.h>
+#include <asm/sn/arch.h>
 
 /**
  * sn_flush_all_caches - flush a range of address from all caches (incl. L4)
@@ -17,18 +18,24 @@
  * Flush a range of addresses from all caches including L4. 
  * All addresses fully or partially contained within 
  * @flush_addr to @flush_addr + @bytes are flushed
- * from the all caches.
+ * from all caches.
  */
 void
 sn_flush_all_caches(long flush_addr, long bytes)
 {
-	flush_icache_range(flush_addr, flush_addr+bytes);
+	unsigned long addr = flush_addr;
+
+	/* SHub1 requires a cached address */
+	if (is_shub1() && (addr & RGN_BITS) == RGN_BASE(RGN_UNCACHED))
+		addr = (addr - RGN_BASE(RGN_UNCACHED)) + RGN_BASE(RGN_KERNEL);
+
+	flush_icache_range(addr, addr + bytes);
 	/*
 	 * The last call may have returned before the caches
 	 * were actually flushed, so we call it again to make
 	 * sure.
 	 */
-	flush_icache_range(flush_addr, flush_addr+bytes);
+	flush_icache_range(addr, addr + bytes);
 	mb();
 }
 EXPORT_SYMBOL(sn_flush_all_caches);
diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 7fd0576a4454..690c42803d2e 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -4,37 +4,32 @@
  * Uses for this includes on-device special memory, uncached memory
  * etc.
  *
- * This code is based on the buddy allocator found in the sym53c8xx_2
- * driver, adapted for general purpose use.
- *
  * This source code is licensed under the GNU General Public License,
  * Version 2.  See the file COPYING for more details.
  */
 
-#include <linux/spinlock.h>
 
-#define ALLOC_MIN_SHIFT		5 /* 32 bytes minimum */
 /*
- *  Link between free memory chunks of a given size.
+ *  General purpose special memory pool descriptor.
  */
-struct gen_pool_link {
-	struct gen_pool_link *next;
+struct gen_pool {
+	rwlock_t lock;
+	struct list_head chunks;	/* list of chunks in this pool */
+	int min_alloc_order;		/* minimum allocation order */
 };
 
 /*
- *  Memory pool descriptor.
+ *  General purpose special memory pool chunk descriptor.
  */
-struct gen_pool {
+struct gen_pool_chunk {
 	spinlock_t lock;
-	unsigned long (*get_new_chunk)(struct gen_pool *);
-	struct gen_pool *next;
-	struct gen_pool_link *h;
-	unsigned long private;
-	int max_chunk_shift;
+	struct list_head next_chunk;	/* next chunk in pool */
+	unsigned long start_addr;	/* starting address of memory chunk */
+	unsigned long end_addr;		/* ending address of memory chunk */
+	unsigned long bits[0];		/* bitmap for allocating memory chunk */
 };
 
-unsigned long gen_pool_alloc(struct gen_pool *poolp, int size);
-void gen_pool_free(struct gen_pool *mp, unsigned long ptr, int size);
-struct gen_pool *gen_pool_create(int nr_chunks, int max_chunk_shift,
-				 unsigned long (*fp)(struct gen_pool *),
-				 unsigned long data);
+extern struct gen_pool *gen_pool_create(int, int);
+extern int gen_pool_add(struct gen_pool *, unsigned long, size_t, int);
+extern unsigned long gen_pool_alloc(struct gen_pool *, size_t);
+extern void gen_pool_free(struct gen_pool *, unsigned long, size_t);
diff --git a/lib/genalloc.c b/lib/genalloc.c
index 9ce0a6a3b85a..71338b48e889 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -4,10 +4,6 @@
  * Uses for this includes on-device special memory, uncached memory
  * etc.
  *
- * This code is based on the buddy allocator found in the sym53c8xx_2
- * driver Copyright (C) 1999-2001  Gerard Roudier <groudier@free.fr>,
- * and adapted for general purpose use.
- *
  * Copyright 2005 (C) Jes Sorensen <jes@trained-monkey.org>
  *
  * This source code is licensed under the GNU General Public License,
@@ -15,172 +11,155 @@
  */
 
 #include <linux/module.h>
-#include <linux/stddef.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
 #include <linux/genalloc.h>
 
-#include <asm/page.h>
-
 
-struct gen_pool *gen_pool_create(int nr_chunks, int max_chunk_shift,
-				 unsigned long (*fp)(struct gen_pool *),
-				 unsigned long data)
+/*
+ * Create a new special memory pool.
+ *
+ * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents
+ * @nid: node id of the node the pool structure should be allocated on, or -1
+ */
+struct gen_pool *gen_pool_create(int min_alloc_order, int nid)
 {
-	struct gen_pool *poolp;
-	unsigned long tmp;
-	int i;
-
-	/*
-	 * This is really an arbitrary limit, +10 is enough for
-	 * IA64_GRANULE_SHIFT, aka 16MB. If anyone needs a large limit
-	 * this can be increased without problems.
-	 */
-	if ((max_chunk_shift > (PAGE_SHIFT + 10)) ||
-	    ((max_chunk_shift < ALLOC_MIN_SHIFT) && max_chunk_shift))
-		return NULL;
-
-	if (!max_chunk_shift)
-		max_chunk_shift = PAGE_SHIFT;
-
-	poolp = kmalloc(sizeof(struct gen_pool), GFP_KERNEL);
-	if (!poolp)
-		return NULL;
-	memset(poolp, 0, sizeof(struct gen_pool));
-	poolp->h = kmalloc(sizeof(struct gen_pool_link) *
-			   (max_chunk_shift - ALLOC_MIN_SHIFT + 1),
-			   GFP_KERNEL);
-	if (!poolp->h) {
-		printk(KERN_WARNING "gen_pool_alloc() failed to allocate\n");
-		kfree(poolp);
-		return NULL;
-	}
-	memset(poolp->h, 0, sizeof(struct gen_pool_link) *
-	       (max_chunk_shift - ALLOC_MIN_SHIFT + 1));
-
-	spin_lock_init(&poolp->lock);
-	poolp->get_new_chunk = fp;
-	poolp->max_chunk_shift = max_chunk_shift;
-	poolp->private = data;
-
-	for (i = 0; i < nr_chunks; i++) {
-		tmp = poolp->get_new_chunk(poolp);
-		printk(KERN_INFO "allocated %lx\n", tmp);
-		if (!tmp)
-			break;
-		gen_pool_free(poolp, tmp, (1 << poolp->max_chunk_shift));
-	}
+	struct gen_pool *pool;
 
-	return poolp;
+	pool = kmalloc_node(sizeof(struct gen_pool), GFP_KERNEL, nid);
+	if (pool != NULL) {
+		rwlock_init(&pool->lock);
+		INIT_LIST_HEAD(&pool->chunks);
+		pool->min_alloc_order = min_alloc_order;
+	}
+	return pool;
 }
 EXPORT_SYMBOL(gen_pool_create);
 
 
 /*
- *  Simple power of two buddy-like generic allocator.
- *  Provides naturally aligned memory chunks.
+ * Add a new chunk of memory to the specified pool.
+ *
+ * @pool: pool to add new memory chunk to
+ * @addr: starting address of memory chunk to add to pool
+ * @size: size in bytes of the memory chunk to add to pool
+ * @nid: node id of the node the chunk structure and bitmap should be
+ *       allocated on, or -1
  */
-unsigned long gen_pool_alloc(struct gen_pool *poolp, int size)
+int gen_pool_add(struct gen_pool *pool, unsigned long addr, size_t size,
+		 int nid)
 {
-	int j, i, s, max_chunk_size;
-	unsigned long a, flags;
-	struct gen_pool_link *h = poolp->h;
+	struct gen_pool_chunk *chunk;
+	int nbits = size >> pool->min_alloc_order;
+	int nbytes = sizeof(struct gen_pool_chunk) +
+				(nbits + BITS_PER_BYTE - 1) / BITS_PER_BYTE;
 
-	max_chunk_size = 1 << poolp->max_chunk_shift;
+	chunk = kmalloc_node(nbytes, GFP_KERNEL, nid);
+	if (unlikely(chunk == NULL))
+		return -1;
 
-	if (size > max_chunk_size)
-		return 0;
+	memset(chunk, 0, nbytes);
+	spin_lock_init(&chunk->lock);
+	chunk->start_addr = addr;
+	chunk->end_addr = addr + size;
 
-	size = max(size, 1 << ALLOC_MIN_SHIFT);
-	i = fls(size - 1);
-	s = 1 << i;
-	j = i -= ALLOC_MIN_SHIFT;
-
-	spin_lock_irqsave(&poolp->lock, flags);
-	while (!h[j].next) {
-		if (s == max_chunk_size) {
-			struct gen_pool_link *ptr;
-			spin_unlock_irqrestore(&poolp->lock, flags);
-			ptr = (struct gen_pool_link *)poolp->get_new_chunk(poolp);
-			spin_lock_irqsave(&poolp->lock, flags);
-			h[j].next = ptr;
-			if (h[j].next)
-				h[j].next->next = NULL;
-			break;
-		}
-		j++;
-		s <<= 1;
-	}
-	a = (unsigned long) h[j].next;
-	if (a) {
-		h[j].next = h[j].next->next;
-		/*
-		 * This should be split into a seperate function doing
-		 * the chunk split in order to support custom
-		 * handling memory not physically accessible by host
-		 */
-		while (j > i) {
-			j -= 1;
-			s >>= 1;
-			h[j].next = (struct gen_pool_link *) (a + s);
-			h[j].next->next = NULL;
-		}
-	}
-	spin_unlock_irqrestore(&poolp->lock, flags);
-	return a;
+	write_lock(&pool->lock);
+	list_add(&chunk->next_chunk, &pool->chunks);
+	write_unlock(&pool->lock);
+
+	return 0;
 }
-EXPORT_SYMBOL(gen_pool_alloc);
+EXPORT_SYMBOL(gen_pool_add);
 
 
 /*
- *  Counter-part of the generic allocator.
+ * Allocate the requested number of bytes from the specified pool.
+ * Uses a first-fit algorithm.
+ *
+ * @pool: pool to allocate from
+ * @size: number of bytes to allocate from the pool
  */
-void gen_pool_free(struct gen_pool *poolp, unsigned long ptr, int size)
+unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size)
 {
-	struct gen_pool_link *q;
-	struct gen_pool_link *h = poolp->h;
-	unsigned long a, b, flags;
-	int i, s, max_chunk_size;
-
-	max_chunk_size = 1 << poolp->max_chunk_shift;
+	struct list_head *_chunk;
+	struct gen_pool_chunk *chunk;
+	unsigned long addr, flags;
+	int order = pool->min_alloc_order;
+	int nbits, bit, start_bit, end_bit;
 
-	if (size > max_chunk_size)
-		return;
-
-	size = max(size, 1 << ALLOC_MIN_SHIFT);
-	i = fls(size - 1);
-	s = 1 << i;
-	i -= ALLOC_MIN_SHIFT;
-
-	a = ptr;
+	if (size == 0)
+		return 0;
 
-	spin_lock_irqsave(&poolp->lock, flags);
-	while (1) {
-		if (s == max_chunk_size) {
-			((struct gen_pool_link *)a)->next = h[i].next;
-			h[i].next = (struct gen_pool_link *)a;
-			break;
+	nbits = (size + (1UL << order) - 1) >> order;
+
+	read_lock(&pool->lock);
+	list_for_each(_chunk, &pool->chunks) {
+		chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
+
+		end_bit = (chunk->end_addr - chunk->start_addr) >> order;
+		end_bit -= nbits + 1;
+
+		spin_lock_irqsave(&chunk->lock, flags);
+		bit = -1;
+		while (bit + 1 < end_bit) {
+			bit = find_next_zero_bit(chunk->bits, end_bit, bit + 1);
+			if (bit >= end_bit)
+				break;
+
+			start_bit = bit;
+			if (nbits > 1) {
+				bit = find_next_bit(chunk->bits, bit + nbits,
+							bit + 1);
+				if (bit - start_bit < nbits)
+					continue;
+			}
+
+			addr = chunk->start_addr +
+					    ((unsigned long)start_bit << order);
+			while (nbits--)
+				__set_bit(start_bit++, &chunk->bits);
+			spin_unlock_irqrestore(&chunk->lock, flags);
+			read_unlock(&pool->lock);
+			return addr;
 		}
-		b = a ^ s;
-		q = &h[i];
+		spin_unlock_irqrestore(&chunk->lock, flags);
+	}
+	read_unlock(&pool->lock);
+	return 0;
+}
+EXPORT_SYMBOL(gen_pool_alloc);
 
-		while (q->next && q->next != (struct gen_pool_link *)b)
-			q = q->next;
 
-		if (!q->next) {
-			((struct gen_pool_link *)a)->next = h[i].next;
-			h[i].next = (struct gen_pool_link *)a;
+/*
+ * Free the specified memory back to the specified pool.
+ *
+ * @pool: pool to free to
+ * @addr: starting address of memory to free back to pool
+ * @size: size in bytes of memory to free
+ */
+void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size)
+{
+	struct list_head *_chunk;
+	struct gen_pool_chunk *chunk;
+	unsigned long flags;
+	int order = pool->min_alloc_order;
+	int bit, nbits;
+
+	nbits = (size + (1UL << order) - 1) >> order;
+
+	read_lock(&pool->lock);
+	list_for_each(_chunk, &pool->chunks) {
+		chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
+
+		if (addr >= chunk->start_addr && addr < chunk->end_addr) {
+			BUG_ON(addr + size > chunk->end_addr);
+			spin_lock_irqsave(&chunk->lock, flags);
+			bit = (addr - chunk->start_addr) >> order;
+			while (nbits--)
+				__clear_bit(bit++, &chunk->bits);
+			spin_unlock_irqrestore(&chunk->lock, flags);
 			break;
 		}
-		q->next = q->next->next;
-		a = a & b;
-		s <<= 1;
-		i++;
 	}
-	spin_unlock_irqrestore(&poolp->lock, flags);
+	BUG_ON(nbits > 0);
+	read_unlock(&pool->lock);
 }
 EXPORT_SYMBOL(gen_pool_free);
-- 
cgit v1.2.3