Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6 into for-linus-1

* 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6: (9356 commits) [media] rc: update for bitop name changes fs: simplify iget & friends fs: pull inode->i_lock up out of writeback_single_inode fs: rename inode_lock to inode_hash_lock fs: move i_wb_list out from under inode_lock fs: move i_sb_list out from under inode_lock fs: remove inode_lock from iput_final and prune_icache fs: Lock the inode LRU list separately fs: factor inode disposal fs: protect inode->i_state with inode->i_lock lib, arch: add filter argument to show_mem and fix private implementations SLUB: Write to per cpu data when allocating it slub: Fix debugobjects with lockless fastpath autofs4: Do not potentially dereference NULL pointer returned by fget() in autofs_dev_ioctl_setpipefd() autofs4 - remove autofs4_lock autofs4 - fix d_manage() return on rcu-walk autofs4 - fix autofs4_expire_indirect() traversal autofs4 - fix dentry leak in autofs4_expire_direct() autofs4 - reinstate last used update on access vfs - check non-mountpoint dentry might block in __follow_mount_rcu() ... NOTE! This merge commit was created to fix compilation error. The block tree was merged upstream and removed the 'elv_queue_empty()' function which the new 'mtdswap' driver is using. So a simple merge of the mtd tree with upstream does not compile. And the mtd tree has already be published, so re-basing it is not an option. To fix this unfortunate situation, I had to merge upstream into the mtd-2.6.git tree without committing, put the fixup patch on top of this, and then commit this. The result is that we do not have commits which do not compile. In other words, this merge commit "merges" 3 things: the MTD tree, the upstream tree, and the fixup patch.
author: Artem Bityutskiy <Artem.Bityutskiy@nokia.com> 2011-03-25 18:41:20 +0300
committer: Artem Bityutskiy <Artem.Bityutskiy@nokia.com> 2011-03-25 18:41:20 +0300
commit: 7bf7e370d5919112c223a269462cd0b546903829 (patch)
tree: 03ccc715239df14ae168277dbccc9d9cf4d8a2c8 /arch/x86/mm
parent: 68b1a1e786f29c900fa1c516a402e24f0ece622a (diff)
parent: d39dd11c3e6a7af5c20bfac40594db36cf270f42 (diff)
download: linux-7bf7e370d5919112c223a269462cd0b546903829.tar.xz
17 files changed, 1264 insertions, 1247 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 09df2f9a3d69..3e608edf9958 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_MMIOTRACE_TEST)	+= testmmiotrace.o
 obj-$(CONFIG_NUMA)		+= numa.o numa_$(BITS).o
 obj-$(CONFIG_AMD_NUMA)		+= amdtopology_64.o
 obj-$(CONFIG_ACPI_NUMA)		+= srat_$(BITS).o
+obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o
 
 obj-$(CONFIG_HAVE_MEMBLOCK)		+= memblock.o
 
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index f21962c435ed..0919c26820d4 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -26,9 +26,7 @@
 #include <asm/apic.h>
 #include <asm/amd_nb.h>
 
-static struct bootnode __initdata nodes[8];
 static unsigned char __initdata nodeids[8];
-static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
 
 static __init int find_northbridge(void)
 {
@@ -51,7 +49,7 @@ static __init int find_northbridge(void)
 		return num;
 	}
 
-	return -1;
+	return -ENOENT;
 }
 
 static __init void early_get_boot_cpu_id(void)
@@ -69,17 +67,18 @@ static __init void early_get_boot_cpu_id(void)
 #endif
 }
 
-int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
+int __init amd_numa_init(void)
 {
-	unsigned long start = PFN_PHYS(start_pfn);
-	unsigned long end = PFN_PHYS(end_pfn);
+	unsigned long start = PFN_PHYS(0);
+	unsigned long end = PFN_PHYS(max_pfn);
 	unsigned numnodes;
 	unsigned long prevbase;
-	int i, nb, found = 0;
+	int i, j, nb;
 	u32 nodeid, reg;
+	unsigned int bits, cores, apicid_base;
 
 	if (!early_pci_allowed())
-		return -1;
+		return -EINVAL;
 
 	nb = find_northbridge();
 	if (nb < 0)
@@ -90,7 +89,7 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
 	reg = read_pci_config(0, nb, 0, 0x60);
 	numnodes = ((reg >> 4) & 0xF) + 1;
 	if (numnodes <= 1)
-		return -1;
+		return -ENOENT;
 
 	pr_info("Number of physical nodes %d\n", numnodes);
 
@@ -121,9 +120,9 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
 		if ((base >> 8) & 3 || (limit >> 8) & 3) {
 			pr_err("Node %d using interleaving mode %lx/%lx\n",
 			       nodeid, (base >> 8) & 3, (limit >> 8) & 3);
-			return -1;
+			return -EINVAL;
 		}
-		if (node_isset(nodeid, nodes_parsed)) {
+		if (node_isset(nodeid, numa_nodes_parsed)) {
 			pr_info("Node %d already present, skipping\n",
 				nodeid);
 			continue;
@@ -160,117 +159,28 @@ int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
 		if (prevbase > base) {
 			pr_err("Node map not sorted %lx,%lx\n",
 			       prevbase, base);
-			return -1;
+			return -EINVAL;
 		}
 
 		pr_info("Node %d MemBase %016lx Limit %016lx\n",
 			nodeid, base, limit);
 
-		found++;
-
-		nodes[nodeid].start = base;
-		nodes[nodeid].end = limit;
-
 		prevbase = base;
-
-		node_set(nodeid, nodes_parsed);
-	}
-
-	if (!found)
-		return -1;
-	return 0;
-}
-
-#ifdef CONFIG_NUMA_EMU
-static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
-	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
-};
-
-void __init amd_get_nodes(struct bootnode *physnodes)
-{
-	int i;
-
-	for_each_node_mask(i, nodes_parsed) {
-		physnodes[i].start = nodes[i].start;
-		physnodes[i].end = nodes[i].end;
+		numa_add_memblk(nodeid, base, limit);
+		node_set(nodeid, numa_nodes_parsed);
 	}
-}
-
-static int __init find_node_by_addr(unsigned long addr)
-{
-	int ret = NUMA_NO_NODE;
-	int i;
-
-	for (i = 0; i < 8; i++)
-		if (addr >= nodes[i].start && addr < nodes[i].end) {
-			ret = i;
-			break;
-		}
-	return ret;
-}
 
-/*
- * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be
- * setup to represent the physical topology but reflect the emulated
- * environment.  For each emulated node, the real node which it appears on is
- * found and a fake pxm to nid mapping is created which mirrors the actual
- * locality.  node_distance() then represents the correct distances between
- * emulated nodes by using the fake acpi mappings to pxms.
- */
-void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
-{
-	unsigned int bits;
-	unsigned int cores;
-	unsigned int apicid_base = 0;
-	int i;
+	if (!nodes_weight(numa_nodes_parsed))
+		return -ENOENT;
 
+	/*
+	 * We seem to have valid NUMA configuration.  Map apicids to nodes
+	 * using the coreid bits from early_identify_cpu.
+	 */
 	bits = boot_cpu_data.x86_coreid_bits;
 	cores = 1 << bits;
-	early_get_boot_cpu_id();
-	if (boot_cpu_physical_apicid > 0)
-		apicid_base = boot_cpu_physical_apicid;
-
-	for (i = 0; i < nr_nodes; i++) {
-		int index;
-		int nid;
-		int j;
-
-		nid = find_node_by_addr(nodes[i].start);
-		if (nid == NUMA_NO_NODE)
-			continue;
-
-		index = nodeids[nid] << bits;
-		if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE)
-			for (j = apicid_base; j < cores + apicid_base; j++)
-				fake_apicid_to_node[index + j] = i;
-#ifdef CONFIG_ACPI_NUMA
-		__acpi_map_pxm_to_node(nid, i);
-#endif
-	}
-	memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
-}
-#endif /* CONFIG_NUMA_EMU */
-
-int __init amd_scan_nodes(void)
-{
-	unsigned int bits;
-	unsigned int cores;
-	unsigned int apicid_base;
-	int i;
-
-	BUG_ON(nodes_empty(nodes_parsed));
-	node_possible_map = nodes_parsed;
-	memnode_shift = compute_hash_shift(nodes, 8, NULL);
-	if (memnode_shift < 0) {
-		pr_err("No NUMA node hash function found. Contact maintainer\n");
-		return -1;
-	}
-	pr_info("Using node hash shift of %d\n", memnode_shift);
-
-	/* use the coreid bits from early_identify_cpu */
-	bits = boot_cpu_data.x86_coreid_bits;
-	cores = (1<<bits);
 	apicid_base = 0;
+
 	/* get the APIC ID of the BSP early for systems with apicid lifting */
 	early_get_boot_cpu_id();
 	if (boot_cpu_physical_apicid > 0) {
@@ -278,17 +188,9 @@ int __init amd_scan_nodes(void)
 		apicid_base = boot_cpu_physical_apicid;
 	}
 
-	for_each_node_mask(i, node_possible_map) {
-		int j;
-
-		memblock_x86_register_active_regions(i,
-				nodes[i].start >> PAGE_SHIFT,
-				nodes[i].end >> PAGE_SHIFT);
+	for_each_node_mask(i, numa_nodes_parsed)
 		for (j = apicid_base; j < cores + apicid_base; j++)
-			apicid_to_node[(i << bits) + j] = i;
-		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-	}
+			set_apicid_to_node((i << bits) + j, i);
 
-	numa_init_array();
 	return 0;
 }
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7d90ceb882a4..20e3f8702d1e 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -229,15 +229,14 @@ void vmalloc_sync_all(void)
 	for (address = VMALLOC_START & PMD_MASK;
 	     address >= TASK_SIZE && address < FIXADDR_TOP;
 	     address += PMD_SIZE) {
-
-		unsigned long flags;
 		struct page *page;
 
-		spin_lock_irqsave(&pgd_lock, flags);
+		spin_lock(&pgd_lock);
 		list_for_each_entry(page, &pgd_list, lru) {
 			spinlock_t *pgt_lock;
 			pmd_t *ret;
 
+			/* the pgt_lock only for Xen */
 			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
 
 			spin_lock(pgt_lock);
@@ -247,7 +246,7 @@ void vmalloc_sync_all(void)
 			if (!ret)
 				break;
 		}
-		spin_unlock_irqrestore(&pgd_lock, flags);
+		spin_unlock(&pgd_lock);
 	}
 }
 
@@ -828,6 +827,13 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	       unsigned long address, unsigned int fault)
 {
 	if (fault & VM_FAULT_OOM) {
+		/* Kernel mode? Handle exceptions or die: */
+		if (!(error_code & PF_USER)) {
+			up_read(&current->mm->mmap_sem);
+			no_context(regs, error_code, address);
+			return;
+		}
+
 		out_of_memory(regs, error_code, address);
 	} else {
 		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 069ce7c37c01..d4203988504a 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -326,7 +326,7 @@ try_again:
 	if (mm->free_area_cache < len)
 		goto fail;
 
-	/* either no address requested or cant fit in requested address hole */
+	/* either no address requested or can't fit in requested address hole */
 	addr = (mm->free_area_cache - len) & huge_page_mask(h);
 	do {
 		/*
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 947f42abe820..286d289b039b 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -18,9 +18,9 @@
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 
-unsigned long __initdata e820_table_start;
-unsigned long __meminitdata e820_table_end;
-unsigned long __meminitdata e820_table_top;
+unsigned long __initdata pgt_buf_start;
+unsigned long __meminitdata pgt_buf_end;
+unsigned long __meminitdata pgt_buf_top;
 
 int after_bootmem;
 
@@ -33,7 +33,7 @@ int direct_gbpages
 static void __init find_early_table_space(unsigned long end, int use_pse,
 					  int use_gbpages)
 {
-	unsigned long puds, pmds, ptes, tables, start;
+	unsigned long puds, pmds, ptes, tables, start = 0, good_end = end;
 	phys_addr_t base;
 
 	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
@@ -65,29 +65,20 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
 #ifdef CONFIG_X86_32
 	/* for fixmap */
 	tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-#endif
 
-	/*
-	 * RED-PEN putting page tables only on node 0 could
-	 * cause a hotspot and fill up ZONE_DMA. The page tables
-	 * need roughly 0.5KB per GB.
-	 */
-#ifdef CONFIG_X86_32
-	start = 0x7000;
-#else
-	start = 0x8000;
+	good_end = max_pfn_mapped << PAGE_SHIFT;
 #endif
-	base = memblock_find_in_range(start, max_pfn_mapped<<PAGE_SHIFT,
-					tables, PAGE_SIZE);
+
+	base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
 	if (base == MEMBLOCK_ERROR)
 		panic("Cannot find space for the kernel page tables");
 
-	e820_table_start = base >> PAGE_SHIFT;
-	e820_table_end = e820_table_start;
-	e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
+	pgt_buf_start = base >> PAGE_SHIFT;
+	pgt_buf_end = pgt_buf_start;
+	pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
 
 	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
-		end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT);
+		end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
 }
 
 struct map_range {
@@ -279,30 +270,11 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	load_cr3(swapper_pg_dir);
 #endif
 
-#ifdef CONFIG_X86_64
-	if (!after_bootmem && !start) {
-		pud_t *pud;
-		pmd_t *pmd;
-
-		mmu_cr4_features = read_cr4();
-
-		/*
-		 * _brk_end cannot change anymore, but it and _end may be
-		 * located on different 2M pages. cleanup_highmap(), however,
-		 * can only consider _end when it runs, so destroy any
-		 * mappings beyond _brk_end here.
-		 */
-		pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
-		pmd = pmd_offset(pud, _brk_end - 1);
-		while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
-			pmd_clear(pmd);
-	}
-#endif
 	__flush_tlb_all();
 
-	if (!after_bootmem && e820_table_end > e820_table_start)
-		memblock_x86_reserve_range(e820_table_start << PAGE_SHIFT,
-				 e820_table_end << PAGE_SHIFT, "PGTABLE");
+	if (!after_bootmem && pgt_buf_end > pgt_buf_start)
+		memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT,
+				 pgt_buf_end << PAGE_SHIFT, "PGTABLE");
 
 	if (!after_bootmem)
 		early_memtest(start, end);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index c821074b7f0b..80088f994193 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -62,10 +62,10 @@ bool __read_mostly __vmalloc_start_set = false;
 
 static __init void *alloc_low_page(void)
 {
-	unsigned long pfn = e820_table_end++;
+	unsigned long pfn = pgt_buf_end++;
 	void *adr;
 
-	if (pfn >= e820_table_top)
+	if (pfn >= pgt_buf_top)
 		panic("alloc_low_page: ran out of memory");
 
 	adr = __va(pfn * PAGE_SIZE);
@@ -163,8 +163,8 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
 	if (pmd_idx_kmap_begin != pmd_idx_kmap_end
 	    && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
 	    && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
-	    && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start
-		|| (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) {
+	    && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
+		|| (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
 		pte_t *newpte;
 		int i;
 
@@ -644,8 +644,7 @@ void __init find_low_pfn_range(void)
 }
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
-				int acpi, int k8)
+void __init initmem_init(void)
 {
 #ifdef CONFIG_HIGHMEM
 	highstart_pfn = highend_pfn = max_pfn;
@@ -918,7 +917,7 @@ static void mark_nxdata_nx(void)
 {
 	/*
 	 * When this called, init has already been executed and released,
-	 * so everything past _etext sould be NX.
+	 * so everything past _etext should be NX.
 	 */
 	unsigned long start = PFN_ALIGN(_etext);
 	/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 71a59296af80..794233587287 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -51,6 +51,8 @@
 #include <asm/numa.h>
 #include <asm/cacheflush.h>
 #include <asm/init.h>
+#include <asm/uv/uv.h>
+#include <asm/setup.h>
 
 static int __init parse_direct_gbpages_off(char *arg)
 {
@@ -105,18 +107,18 @@ void sync_global_pgds(unsigned long start, unsigned long end)
 
 	for (address = start; address <= end; address += PGDIR_SIZE) {
 		const pgd_t *pgd_ref = pgd_offset_k(address);
-		unsigned long flags;
 		struct page *page;
 
 		if (pgd_none(*pgd_ref))
 			continue;
 
-		spin_lock_irqsave(&pgd_lock, flags);
+		spin_lock(&pgd_lock);
 		list_for_each_entry(page, &pgd_list, lru) {
 			pgd_t *pgd;
 			spinlock_t *pgt_lock;
 
 			pgd = (pgd_t *)page_address(page) + pgd_index(address);
+			/* the pgt_lock only for Xen */
 			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
 			spin_lock(pgt_lock);
 
@@ -128,7 +130,7 @@ void sync_global_pgds(unsigned long start, unsigned long end)
 
 			spin_unlock(pgt_lock);
 		}
-		spin_unlock_irqrestore(&pgd_lock, flags);
+		spin_unlock(&pgd_lock);
 	}
 }
 
@@ -293,18 +295,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
  * to the compile time generated pmds. This results in invalid pmds up
  * to the point where we hit the physaddr 0 mapping.
  *
- * We limit the mappings to the region from _text to _end.  _end is
- * rounded up to the 2MB boundary. This catches the invalid pmds as
+ * We limit the mappings to the region from _text to _brk_end.  _brk_end
+ * is rounded up to the 2MB boundary. This catches the invalid pmds as
  * well, as they are located before _text:
  */
 void __init cleanup_highmap(void)
 {
 	unsigned long vaddr = __START_KERNEL_map;
-	unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
+	unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
+	unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
 	pmd_t *pmd = level2_kernel_pgt;
-	pmd_t *last_pmd = pmd + PTRS_PER_PMD;
 
-	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
+	for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
 		if (pmd_none(*pmd))
 			continue;
 		if (vaddr < (unsigned long) _text || vaddr > end)
@@ -314,7 +316,7 @@ void __init cleanup_highmap(void)
 
 static __ref void *alloc_low_page(unsigned long *phys)
 {
-	unsigned long pfn = e820_table_end++;
+	unsigned long pfn = pgt_buf_end++;
 	void *adr;
 
 	if (after_bootmem) {
@@ -324,7 +326,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
 		return adr;
 	}
 
-	if (pfn >= e820_table_top)
+	if (pfn >= pgt_buf_top)
 		panic("alloc_low_page: ran out of memory");
 
 	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -333,12 +335,28 @@ static __ref void *alloc_low_page(unsigned long *phys)
 	return adr;
 }
 
+static __ref void *map_low_page(void *virt)
+{
+	void *adr;
+	unsigned long phys, left;
+
+	if (after_bootmem)
+		return virt;
+
+	phys = __pa(virt);
+	left = phys & (PAGE_SIZE - 1);
+	adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
+	adr = (void *)(((unsigned long)adr) | left);
+
+	return adr;
+}
+
 static __ref void unmap_low_page(void *adr)
 {
 	if (after_bootmem)
 		return;
 
-	early_iounmap(adr, PAGE_SIZE);
+	early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
 }
 
 static unsigned long __meminit
@@ -386,15 +404,6 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
 }
 
 static unsigned long __meminit
-phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
-		pgprot_t prot)
-{
-	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
-
-	return phys_pte_init(pte, address, end, prot);
-}
-
-static unsigned long __meminit
 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 	      unsigned long page_size_mask, pgprot_t prot)
 {
@@ -420,8 +429,10 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 		if (pmd_val(*pmd)) {
 			if (!pmd_large(*pmd)) {
 				spin_lock(&init_mm.page_table_lock);
-				last_map_addr = phys_pte_update(pmd, address,
+				pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
+				last_map_addr = phys_pte_init(pte, address,
 								end, prot);
+				unmap_low_page(pte);
 				spin_unlock(&init_mm.page_table_lock);
 				continue;
 			}
@@ -468,18 +479,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
 }
 
 static unsigned long __meminit
-phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
-		unsigned long page_size_mask, pgprot_t prot)
-{
-	pmd_t *pmd = pmd_offset(pud, 0);
-	unsigned long last_map_addr;
-
-	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
-	__flush_tlb_all();
-	return last_map_addr;
-}
-
-static unsigned long __meminit
 phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 			 unsigned long page_size_mask)
 {
@@ -504,8 +503,11 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 
 		if (pud_val(*pud)) {
 			if (!pud_large(*pud)) {
-				last_map_addr = phys_pmd_update(pud, addr, end,
+				pmd = map_low_page(pmd_offset(pud, 0));
+				last_map_addr = phys_pmd_init(pmd, addr, end,
 							 page_size_mask, prot);
+				unmap_low_page(pmd);
+				__flush_tlb_all();
 				continue;
 			}
 			/*
@@ -553,17 +555,6 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
 	return last_map_addr;
 }
 
-static unsigned long __meminit
-phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
-		 unsigned long page_size_mask)
-{
-	pud_t *pud;
-
-	pud = (pud_t *)pgd_page_vaddr(*pgd);
-
-	return phys_pud_init(pud, addr, end, page_size_mask);
-}
-
 unsigned long __meminit
 kernel_physical_mapping_init(unsigned long start,
 			     unsigned long end,
@@ -587,8 +578,10 @@ kernel_physical_mapping_init(unsigned long start,
 			next = end;
 
 		if (pgd_val(*pgd)) {
-			last_map_addr = phys_pud_update(pgd, __pa(start),
+			pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
+			last_map_addr = phys_pud_init(pud, __pa(start),
 						 __pa(end), page_size_mask);
+			unmap_low_page(pud);
 			continue;
 		}
 
@@ -612,10 +605,9 @@ kernel_physical_mapping_init(unsigned long start,
 }
 
 #ifndef CONFIG_NUMA
-void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
-				int acpi, int k8)
+void __init initmem_init(void)
 {
-	memblock_x86_register_active_regions(0, start_pfn, end_pfn);
+	memblock_x86_register_active_regions(0, 0, max_pfn);
 }
 #endif
 
@@ -870,18 +862,18 @@ static struct vm_area_struct gate_vma = {
 	.vm_flags	= VM_READ | VM_EXEC
 };
 
-struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
 {
 #ifdef CONFIG_IA32_EMULATION
-	if (test_tsk_thread_flag(tsk, TIF_IA32))
+	if (!mm || mm->context.ia32_compat)
 		return NULL;
 #endif
 	return &gate_vma;
 }
 
-int in_gate_area(struct task_struct *task, unsigned long addr)
+int in_gate_area(struct mm_struct *mm, unsigned long addr)
 {
-	struct vm_area_struct *vma = get_gate_vma(task);
+	struct vm_area_struct *vma = get_gate_vma(mm);
 
 	if (!vma)
 		return 0;
@@ -890,11 +882,11 @@ int in_gate_area(struct task_struct *task, unsigned long addr)
 }
 
 /*
- * Use this when you have no reliable task/vma, typically from interrupt
- * context. It is less reliable than using the task's vma and may give
- * false positives:
+ * Use this when you have no reliable mm, typically from interrupt
+ * context. It is less reliable than using a task's mm and may give
+ * false positives.
  */
-int in_gate_area_no_task(unsigned long addr)
+int in_gate_area_no_mm(unsigned long addr)
 {
 	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
 }
@@ -908,6 +900,19 @@ const char *arch_vma_name(struct vm_area_struct *vma)
 	return NULL;
 }
 
+#ifdef CONFIG_X86_UV
+#define MIN_MEMORY_BLOCK_SIZE   (1 << SECTION_SIZE_BITS)
+
+unsigned long memory_block_size_bytes(void)
+{
+	if (is_uv_system()) {
+		printk(KERN_INFO "UV: memory block size 2GB\n");
+		return 2UL * 1024 * 1024 * 1024;
+	}
+	return MIN_MEMORY_BLOCK_SIZE;
+}
+#endif
+
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 /*
  * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index ebf6d7887a38..9559d360fde7 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -26,12 +26,50 @@ static __init int numa_setup(char *opt)
 early_param("numa", numa_setup);
 
 /*
- * Which logical CPUs are on which nodes
+ * apicid, cpu, node mappings
  */
+s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
+	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
+
 cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
 EXPORT_SYMBOL(node_to_cpumask_map);
 
 /*
+ * Map cpu index to node index
+ */
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+
+void __cpuinit numa_set_node(int cpu, int node)
+{
+	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
+
+	/* early setting, no percpu area yet */
+	if (cpu_to_node_map) {
+		cpu_to_node_map[cpu] = node;
+		return;
+	}
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
+		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
+		dump_stack();
+		return;
+	}
+#endif
+	per_cpu(x86_cpu_to_node_map, cpu) = node;
+
+	if (node != NUMA_NO_NODE)
+		set_cpu_numa_node(cpu, node);
+}
+
+void __cpuinit numa_clear_node(int cpu)
+{
+	numa_set_node(cpu, NUMA_NO_NODE);
+}
+
+/*
  * Allocate node_to_cpumask_map based on number of available nodes
  * Requires node_possible_map to be valid.
  *
@@ -57,7 +95,174 @@ void __init setup_node_to_cpumask_map(void)
 	pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
 }
 
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+/*
+ * There are unfortunately some poorly designed mainboards around that
+ * only connect memory to a single CPU. This breaks the 1:1 cpu->node
+ * mapping. To avoid this fill in the mapping for all possible CPUs,
+ * as the number of CPUs is not known yet. We round robin the existing
+ * nodes.
+ */
+void __init numa_init_array(void)
+{
+	int rr, i;
+
+	rr = first_node(node_online_map);
+	for (i = 0; i < nr_cpu_ids; i++) {
+		if (early_cpu_to_node(i) != NUMA_NO_NODE)
+			continue;
+		numa_set_node(i, rr);
+		rr = next_node(rr, node_online_map);
+		if (rr == MAX_NUMNODES)
+			rr = first_node(node_online_map);
+	}
+}
+
+static __init int find_near_online_node(int node)
+{
+	int n, val;
+	int min_val = INT_MAX;
+	int best_node = -1;
+
+	for_each_online_node(n) {
+		val = node_distance(node, n);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	return best_node;
+}
+
+/*
+ * Setup early cpu_to_node.
+ *
+ * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
+ * and apicid_to_node[] tables have valid entries for a CPU.
+ * This means we skip cpu_to_node[] initialisation for NUMA
+ * emulation and faking node case (when running a kernel compiled
+ * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
+ * is already initialized in a round robin manner at numa_init_array,
+ * prior to this call, and this initialization is good enough
+ * for the fake NUMA cases.
+ *
+ * Called before the per_cpu areas are setup.
+ */
+void __init init_cpu_to_node(void)
+{
+	int cpu;
+	u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
+
+	BUG_ON(cpu_to_apicid == NULL);
+
+	for_each_possible_cpu(cpu) {
+		int node = numa_cpu_node(cpu);
+
+		if (node == NUMA_NO_NODE)
+			continue;
+		if (!node_online(node))
+			node = find_near_online_node(node);
+		numa_set_node(cpu, node);
+	}
+}
+
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+
+# ifndef CONFIG_NUMA_EMU
+void __cpuinit numa_add_cpu(int cpu)
+{
+	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+# endif	/* !CONFIG_NUMA_EMU */
+
+#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
+
+int __cpu_to_node(int cpu)
+{
+	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
+		printk(KERN_WARNING
+			"cpu_to_node(%d): usage too early!\n", cpu);
+		dump_stack();
+		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+	}
+	return per_cpu(x86_cpu_to_node_map, cpu);
+}
+EXPORT_SYMBOL(__cpu_to_node);
+
+/*
+ * Same function as cpu_to_node() but used if called before the
+ * per_cpu areas are setup.
+ */
+int early_cpu_to_node(int cpu)
+{
+	if (early_per_cpu_ptr(x86_cpu_to_node_map))
+		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+
+	if (!cpu_possible(cpu)) {
+		printk(KERN_WARNING
+			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
+		dump_stack();
+		return NUMA_NO_NODE;
+	}
+	return per_cpu(x86_cpu_to_node_map, cpu);
+}
+
+struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
+{
+	int node = early_cpu_to_node(cpu);
+	struct cpumask *mask;
+	char buf[64];
+
+	if (node == NUMA_NO_NODE) {
+		/* early_cpu_to_node() already emits a warning and trace */
+		return NULL;
+	}
+	mask = node_to_cpumask_map[node];
+	if (!mask) {
+		pr_err("node_to_cpumask_map[%i] NULL\n", node);
+		dump_stack();
+		return NULL;
+	}
+
+	cpulist_scnprintf(buf, sizeof(buf), mask);
+	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+		enable ? "numa_add_cpu" : "numa_remove_cpu",
+		cpu, node, buf);
+	return mask;
+}
+
+# ifndef CONFIG_NUMA_EMU
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+	struct cpumask *mask;
+
+	mask = debug_cpumask_set_cpu(cpu, enable);
+	if (!mask)
+		return;
+
+	if (enable)
+		cpumask_set_cpu(cpu, mask);
+	else
+		cpumask_clear_cpu(cpu, mask);
+}
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, 1);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, 0);
+}
+# endif	/* !CONFIG_NUMA_EMU */
+
 /*
  * Returns a pointer to the bitmask of CPUs on Node 'node'.
  */
@@ -80,4 +285,5 @@ const struct cpumask *cpumask_of_node(int node)
 	return node_to_cpumask_map[node];
 }
 EXPORT_SYMBOL(cpumask_of_node);
-#endif
+
+#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 84a3e4c9f277..bde3906420df 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -110,6 +110,12 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 
 static unsigned long kva_start_pfn;
 static unsigned long kva_pages;
+
+int __cpuinit numa_cpu_node(int cpu)
+{
+	return apic->x86_32_numa_cpu_node(cpu);
+}
+
 /*
  * FLAT - support for basic PC memory model with discontig enabled, essentially
  *        a single node with all available processors in it with a flat
@@ -346,8 +352,7 @@ static void init_remap_allocator(int nid)
 		(ulong) node_remap_end_vaddr[nid]);
 }
 
-void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
-				int acpi, int k8)
+void __init initmem_init(void)
 {
 	int nid;
 	long kva_target_pfn;
@@ -361,6 +366,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
 	 */
 
 	get_memcfg_numa();
+	numa_init_array();
 
 	kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
 
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 95ea1551eebc..e8c00cc72033 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -13,31 +13,30 @@
 #include <linux/module.h>
 #include <linux/nodemask.h>
 #include <linux/sched.h>
+#include <linux/acpi.h>
 
 #include <asm/e820.h>
 #include <asm/proto.h>
 #include <asm/dma.h>
-#include <asm/numa.h>
 #include <asm/acpi.h>
 #include <asm/amd_nb.h>
 
+#include "numa_internal.h"
+
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
-struct memnode memnode;
+nodemask_t numa_nodes_parsed __initdata;
 
-s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
-	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
-};
+struct memnode memnode;
 
 static unsigned long __initdata nodemap_addr;
 static unsigned long __initdata nodemap_size;
 
-/*
- * Map cpu index to node index
- */
-DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+static struct numa_meminfo numa_meminfo __initdata;
+
+static int numa_distance_cnt;
+static u8 *numa_distance;
 
 /*
  * Given a shift value, try to populate memnodemap[]
@@ -46,16 +45,15 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
  * 0 if memnodmap[] too small (of shift too small)
  * -1 if node overlap or lost ram (shift too big)
  */
-static int __init populate_memnodemap(const struct bootnode *nodes,
-				      int numnodes, int shift, int *nodeids)
+static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift)
 {
 	unsigned long addr, end;
 	int i, res = -1;
 
 	memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
-	for (i = 0; i < numnodes; i++) {
-		addr = nodes[i].start;
-		end = nodes[i].end;
+	for (i = 0; i < mi->nr_blks; i++) {
+		addr = mi->blk[i].start;
+		end = mi->blk[i].end;
 		if (addr >= end)
 			continue;
 		if ((end >> shift) >= memnodemapsize)
@@ -63,12 +61,7 @@ static int __init populate_memnodemap(const struct bootnode *nodes,
 		do {
 			if (memnodemap[addr >> shift] != NUMA_NO_NODE)
 				return -1;
-
-			if (!nodeids)
-				memnodemap[addr >> shift] = i;
-			else
-				memnodemap[addr >> shift] = nodeids[i];
-
+			memnodemap[addr >> shift] = mi->blk[i].nid;
 			addr += (1UL << shift);
 		} while (addr < end);
 		res = 1;
@@ -86,7 +79,7 @@ static int __init allocate_cachealigned_memnodemap(void)
 
 	addr = 0x8000;
 	nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
-	nodemap_addr = memblock_find_in_range(addr, max_pfn<<PAGE_SHIFT,
+	nodemap_addr = memblock_find_in_range(addr, get_max_mapped(),
 				      nodemap_size, L1_CACHE_BYTES);
 	if (nodemap_addr == MEMBLOCK_ERROR) {
 		printk(KERN_ERR
@@ -106,16 +99,15 @@ static int __init allocate_cachealigned_memnodemap(void)
  * The LSB of all start and end addresses in the node map is the value of the
  * maximum possible shift.
  */
-static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
-					 int numnodes)
+static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi)
 {
 	int i, nodes_used = 0;
 	unsigned long start, end;
 	unsigned long bitfield = 0, memtop = 0;
 
-	for (i = 0; i < numnodes; i++) {
-		start = nodes[i].start;
-		end = nodes[i].end;
+	for (i = 0; i < mi->nr_blks; i++) {
+		start = mi->blk[i].start;
+		end = mi->blk[i].end;
 		if (start >= end)
 			continue;
 		bitfield |= start;
@@ -131,18 +123,17 @@ static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
 	return i;
 }
 
-int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
-			      int *nodeids)
+static int __init compute_hash_shift(const struct numa_meminfo *mi)
 {
 	int shift;
 
-	shift = extract_lsb_from_nodes(nodes, numnodes);
+	shift = extract_lsb_from_nodes(mi);
 	if (allocate_cachealigned_memnodemap())
 		return -1;
 	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
 		shift);
 
-	if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
+	if (populate_memnodemap(mi, shift) != 1) {
 		printk(KERN_INFO "Your memory is not aligned you need to "
 		       "rebuild your kernel with a bigger NODEMAPSIZE "
 		       "shift=%d\n", shift);
@@ -188,6 +179,63 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
 	return NULL;
 }
 
+static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
+				     struct numa_meminfo *mi)
+{
+	/* ignore zero length blks */
+	if (start == end)
+		return 0;
+
+	/* whine about and ignore invalid blks */
+	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
+		pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
+			   nid, start, end);
+		return 0;
+	}
+
+	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
+		pr_err("NUMA: too many memblk ranges\n");
+		return -EINVAL;
+	}
+
+	mi->blk[mi->nr_blks].start = start;
+	mi->blk[mi->nr_blks].end = end;
+	mi->blk[mi->nr_blks].nid = nid;
+	mi->nr_blks++;
+	return 0;
+}
+
+/**
+ * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
+ * @idx: Index of memblk to remove
+ * @mi: numa_meminfo to remove memblk from
+ *
+ * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
+ * decrementing @mi->nr_blks.
+ */
+void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
+{
+	mi->nr_blks--;
+	memmove(&mi->blk[idx], &mi->blk[idx + 1],
+		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
+}
+
+/**
+ * numa_add_memblk - Add one numa_memblk to numa_meminfo
+ * @nid: NUMA node ID of the new memblk
+ * @start: Start address of the new memblk
+ * @end: End address of the new memblk
+ *
+ * Add a new memblk to the default numa_meminfo.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_add_memblk(int nid, u64 start, u64 end)
+{
+	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
+}
+
 /* Initialize bootmem allocator for a node */
 void __init
 setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
@@ -234,696 +282,386 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 	node_set_online(nodeid);
 }
 
-/*
- * There are unfortunately some poorly designed mainboards around that
- * only connect memory to a single CPU. This breaks the 1:1 cpu->node
- * mapping. To avoid this fill in the mapping for all possible CPUs,
- * as the number of CPUs is not known yet. We round robin the existing
- * nodes.
+/**
+ * numa_cleanup_meminfo - Cleanup a numa_meminfo
+ * @mi: numa_meminfo to clean up
+ *
+ * Sanitize @mi by merging and removing unncessary memblks.  Also check for
+ * conflicts and clear unused memblks.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
  */
-void __init numa_init_array(void)
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
 {
-	int rr, i;
+	const u64 low = 0;
+	const u64 high = (u64)max_pfn << PAGE_SHIFT;
+	int i, j, k;
 
-	rr = first_node(node_online_map);
-	for (i = 0; i < nr_cpu_ids; i++) {
-		if (early_cpu_to_node(i) != NUMA_NO_NODE)
-			continue;
-		numa_set_node(i, rr);
-		rr = next_node(rr, node_online_map);
-		if (rr == MAX_NUMNODES)
-			rr = first_node(node_online_map);
-	}
-}
-
-#ifdef CONFIG_NUMA_EMU
-/* Numa emulation */
-static struct bootnode nodes[MAX_NUMNODES] __initdata;
-static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
-static char *cmdline __initdata;
+	for (i = 0; i < mi->nr_blks; i++) {
+		struct numa_memblk *bi = &mi->blk[i];
 
-void __init numa_emu_cmdline(char *str)
-{
-	cmdline = str;
-}
+		/* make sure all blocks are inside the limits */
+		bi->start = max(bi->start, low);
+		bi->end = min(bi->end, high);
 
-static int __init setup_physnodes(unsigned long start, unsigned long end,
-					int acpi, int amd)
-{
-	int ret = 0;
-	int i;
-
-	memset(physnodes, 0, sizeof(physnodes));
-#ifdef CONFIG_ACPI_NUMA
-	if (acpi)
-		acpi_get_nodes(physnodes, start, end);
-#endif
-#ifdef CONFIG_AMD_NUMA
-	if (amd)
-		amd_get_nodes(physnodes);
-#endif
-	/*
-	 * Basic sanity checking on the physical node map: there may be errors
-	 * if the SRAT or AMD code incorrectly reported the topology or the mem=
-	 * kernel parameter is used.
-	 */
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		if (physnodes[i].start == physnodes[i].end)
-			continue;
-		if (physnodes[i].start > end) {
-			physnodes[i].end = physnodes[i].start;
-			continue;
-		}
-		if (physnodes[i].end < start) {
-			physnodes[i].start = physnodes[i].end;
+		/* and there's no empty block */
+		if (bi->start == bi->end) {
+			numa_remove_memblk_from(i--, mi);
 			continue;
 		}
-		if (physnodes[i].start < start)
-			physnodes[i].start = start;
-		if (physnodes[i].end > end)
-			physnodes[i].end = end;
-		ret++;
-	}
-
-	/*
-	 * If no physical topology was detected, a single node is faked to cover
-	 * the entire address space.
-	 */
-	if (!ret) {
-		physnodes[ret].start = start;
-		physnodes[ret].end = end;
-		ret = 1;
-	}
-	return ret;
-}
-
-static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
-{
-	int i;
-
-	BUG_ON(acpi && amd);
-#ifdef CONFIG_ACPI_NUMA
-	if (acpi)
-		acpi_fake_nodes(nodes, nr_nodes);
-#endif
-#ifdef CONFIG_AMD_NUMA
-	if (amd)
-		amd_fake_nodes(nodes, nr_nodes);
-#endif
-	if (!acpi && !amd)
-		for (i = 0; i < nr_cpu_ids; i++)
-			numa_set_node(i, 0);
-}
-
-/*
- * Setups up nid to range from addr to addr + size.  If the end
- * boundary is greater than max_addr, then max_addr is used instead.
- * The return value is 0 if there is additional memory left for
- * allocation past addr and -1 otherwise.  addr is adjusted to be at
- * the end of the node.
- */
-static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
-{
-	int ret = 0;
-	nodes[nid].start = *addr;
-	*addr += size;
-	if (*addr >= max_addr) {
-		*addr = max_addr;
-		ret = -1;
-	}
-	nodes[nid].end = *addr;
-	node_set(nid, node_possible_map);
-	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
-	       nodes[nid].start, nodes[nid].end,
-	       (nodes[nid].end - nodes[nid].start) >> 20);
-	return ret;
-}
-
-/*
- * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
- * to max_addr.  The return value is the number of nodes allocated.
- */
-static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
-{
-	nodemask_t physnode_mask = NODE_MASK_NONE;
-	u64 size;
-	int big;
-	int ret = 0;
-	int i;
-
-	if (nr_nodes <= 0)
-		return -1;
-	if (nr_nodes > MAX_NUMNODES) {
-		pr_info("numa=fake=%d too large, reducing to %d\n",
-			nr_nodes, MAX_NUMNODES);
-		nr_nodes = MAX_NUMNODES;
-	}
-
-	size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
-	/*
-	 * Calculate the number of big nodes that can be allocated as a result
-	 * of consolidating the remainder.
-	 */
-	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
-		FAKE_NODE_MIN_SIZE;
-
-	size &= FAKE_NODE_MIN_HASH_MASK;
-	if (!size) {
-		pr_err("Not enough memory for each node.  "
-			"NUMA emulation disabled.\n");
-		return -1;
-	}
 
-	for (i = 0; i < MAX_NUMNODES; i++)
-		if (physnodes[i].start != physnodes[i].end)
-			node_set(i, physnode_mask);
-
-	/*
-	 * Continue to fill physical nodes with fake nodes until there is no
-	 * memory left on any of them.
-	 */
-	while (nodes_weight(physnode_mask)) {
-		for_each_node_mask(i, physnode_mask) {
-			u64 end = physnodes[i].start + size;
-			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
-
-			if (ret < big)
-				end += FAKE_NODE_MIN_SIZE;
+		for (j = i + 1; j < mi->nr_blks; j++) {
+			struct numa_memblk *bj = &mi->blk[j];
+			unsigned long start, end;
 
 			/*
-			 * Continue to add memory to this fake node if its
-			 * non-reserved memory is less than the per-node size.
+			 * See whether there are overlapping blocks.  Whine
+			 * about but allow overlaps of the same nid.  They
+			 * will be merged below.
 			 */
-			while (end - physnodes[i].start -
-				memblock_x86_hole_size(physnodes[i].start, end) < size) {
-				end += FAKE_NODE_MIN_SIZE;
-				if (end > physnodes[i].end) {
-					end = physnodes[i].end;
-					break;
+			if (bi->end > bj->start && bi->start < bj->end) {
+				if (bi->nid != bj->nid) {
+					pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
+					       bi->nid, bi->start, bi->end,
+					       bj->nid, bj->start, bj->end);
+					return -EINVAL;
 				}
+				pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
+					   bi->nid, bi->start, bi->end,
+					   bj->start, bj->end);
 			}
 
 			/*
-			 * If there won't be at least FAKE_NODE_MIN_SIZE of
-			 * non-reserved memory in ZONE_DMA32 for the next node,
-			 * this one must extend to the boundary.
+			 * Join together blocks on the same node, holes
+			 * between which don't overlap with memory on other
+			 * nodes.
 			 */
-			if (end < dma32_end && dma32_end - end -
-			    memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
-				end = dma32_end;
-
-			/*
-			 * If there won't be enough non-reserved memory for the
-			 * next node, this one must extend to the end of the
-			 * physical node.
-			 */
-			if (physnodes[i].end - end -
-			    memblock_x86_hole_size(end, physnodes[i].end) < size)
-				end = physnodes[i].end;
-
-			/*
-			 * Avoid allocating more nodes than requested, which can
-			 * happen as a result of rounding down each node's size
-			 * to FAKE_NODE_MIN_SIZE.
-			 */
-			if (nodes_weight(physnode_mask) + ret >= nr_nodes)
-				end = physnodes[i].end;
-
-			if (setup_node_range(ret++, &physnodes[i].start,
-						end - physnodes[i].start,
-						physnodes[i].end) < 0)
-				node_clear(i, physnode_mask);
+			if (bi->nid != bj->nid)
+				continue;
+			start = max(min(bi->start, bj->start), low);
+			end = min(max(bi->end, bj->end), high);
+			for (k = 0; k < mi->nr_blks; k++) {
+				struct numa_memblk *bk = &mi->blk[k];
+
+				if (bi->nid == bk->nid)
+					continue;
+				if (start < bk->end && end > bk->start)
+					break;
+			}
+			if (k < mi->nr_blks)
+				continue;
+			printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
+			       bi->nid, bi->start, bi->end, bj->start, bj->end,
+			       start, end);
+			bi->start = start;
+			bi->end = end;
+			numa_remove_memblk_from(j--, mi);
 		}
 	}
-	return ret;
-}
 
-/*
- * Returns the end address of a node so that there is at least `size' amount of
- * non-reserved memory or `max_addr' is reached.
- */
-static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
-{
-	u64 end = start + size;
-
-	while (end - start - memblock_x86_hole_size(start, end) < size) {
-		end += FAKE_NODE_MIN_SIZE;
-		if (end > max_addr) {
-			end = max_addr;
-			break;
-		}
+	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
+		mi->blk[i].start = mi->blk[i].end = 0;
+		mi->blk[i].nid = NUMA_NO_NODE;
 	}
-	return end;
+
+	return 0;
 }
 
 /*
- * Sets up fake nodes of `size' interleaved over physical nodes ranging from
- * `addr' to `max_addr'.  The return value is the number of nodes allocated.
+ * Set nodes, which have memory in @mi, in *@nodemask.
  */
-static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
+static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
+					      const struct numa_meminfo *mi)
 {
-	nodemask_t physnode_mask = NODE_MASK_NONE;
-	u64 min_size;
-	int ret = 0;
 	int i;
 
-	if (!size)
-		return -1;
-	/*
-	 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
-	 * increased accordingly if the requested size is too small.  This
-	 * creates a uniform distribution of node sizes across the entire
-	 * machine (but not necessarily over physical nodes).
-	 */
-	min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
-						MAX_NUMNODES;
-	min_size = max(min_size, FAKE_NODE_MIN_SIZE);
-	if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
-		min_size = (min_size + FAKE_NODE_MIN_SIZE) &
-						FAKE_NODE_MIN_HASH_MASK;
-	if (size < min_size) {
-		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
-			size >> 20, min_size >> 20);
-		size = min_size;
-	}
-	size &= FAKE_NODE_MIN_HASH_MASK;
-
-	for (i = 0; i < MAX_NUMNODES; i++)
-		if (physnodes[i].start != physnodes[i].end)
-			node_set(i, physnode_mask);
-	/*
-	 * Fill physical nodes with fake nodes of size until there is no memory
-	 * left on any of them.
-	 */
-	while (nodes_weight(physnode_mask)) {
-		for_each_node_mask(i, physnode_mask) {
-			u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
-			u64 end;
-
-			end = find_end_of_node(physnodes[i].start,
-						physnodes[i].end, size);
-			/*
-			 * If there won't be at least FAKE_NODE_MIN_SIZE of
-			 * non-reserved memory in ZONE_DMA32 for the next node,
-			 * this one must extend to the boundary.
-			 */
-			if (end < dma32_end && dma32_end - end -
-			    memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
-				end = dma32_end;
+	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
+		if (mi->blk[i].start != mi->blk[i].end &&
+		    mi->blk[i].nid != NUMA_NO_NODE)
+			node_set(mi->blk[i].nid, *nodemask);
+}
 
-			/*
-			 * If there won't be enough non-reserved memory for the
-			 * next node, this one must extend to the end of the
-			 * physical node.
-			 */
-			if (physnodes[i].end - end -
-			    memblock_x86_hole_size(end, physnodes[i].end) < size)
-				end = physnodes[i].end;
+/**
+ * numa_reset_distance - Reset NUMA distance table
+ *
+ * The current table is freed.  The next numa_set_distance() call will
+ * create a new one.
+ */
+void __init numa_reset_distance(void)
+{
+	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
 
-			/*
-			 * Setup the fake node that will be allocated as bootmem
-			 * later.  If setup_node_range() returns non-zero, there
-			 * is no more memory available on this physical node.
-			 */
-			if (setup_node_range(ret++, &physnodes[i].start,
-						end - physnodes[i].start,
-						physnodes[i].end) < 0)
-				node_clear(i, physnode_mask);
-		}
-	}
-	return ret;
+	/* numa_distance could be 1LU marking allocation failure, test cnt */
+	if (numa_distance_cnt)
+		memblock_x86_free_range(__pa(numa_distance),
+					__pa(numa_distance) + size);
+	numa_distance_cnt = 0;
+	numa_distance = NULL;	/* enable table creation */
 }
 
-/*
- * Sets up the system RAM area from start_pfn to last_pfn according to the
- * numa=fake command-line option.
- */
-static int __init numa_emulation(unsigned long start_pfn,
-			unsigned long last_pfn, int acpi, int amd)
+static int __init numa_alloc_distance(void)
 {
-	u64 addr = start_pfn << PAGE_SHIFT;
-	u64 max_addr = last_pfn << PAGE_SHIFT;
-	int num_nodes;
-	int i;
+	nodemask_t nodes_parsed;
+	size_t size;
+	int i, j, cnt = 0;
+	u64 phys;
 
-	/*
-	 * If the numa=fake command-line contains a 'M' or 'G', it represents
-	 * the fixed node size.  Otherwise, if it is just a single number N,
-	 * split the system RAM into N fake nodes.
-	 */
-	if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
-		u64 size;
+	/* size the new table and allocate it */
+	nodes_parsed = numa_nodes_parsed;
+	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
 
-		size = memparse(cmdline, &cmdline);
-		num_nodes = split_nodes_size_interleave(addr, max_addr, size);
-	} else {
-		unsigned long n;
+	for_each_node_mask(i, nodes_parsed)
+		cnt = i;
+	cnt++;
+	size = cnt * cnt * sizeof(numa_distance[0]);
 
-		n = simple_strtoul(cmdline, NULL, 0);
-		num_nodes = split_nodes_interleave(addr, max_addr, n);
+	phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT,
+				      size, PAGE_SIZE);
+	if (phys == MEMBLOCK_ERROR) {
+		pr_warning("NUMA: Warning: can't allocate distance table!\n");
+		/* don't retry until explicitly reset */
+		numa_distance = (void *)1LU;
+		return -ENOMEM;
 	}
+	memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
 
-	if (num_nodes < 0)
-		return num_nodes;
-	memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
-	if (memnode_shift < 0) {
-		memnode_shift = 0;
-		printk(KERN_ERR "No NUMA hash function found.  NUMA emulation "
-		       "disabled.\n");
-		return -1;
-	}
+	numa_distance = __va(phys);
+	numa_distance_cnt = cnt;
+
+	/* fill with the default distances */
+	for (i = 0; i < cnt; i++)
+		for (j = 0; j < cnt; j++)
+			numa_distance[i * cnt + j] = i == j ?
+				LOCAL_DISTANCE : REMOTE_DISTANCE;
+	printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
 
-	/*
-	 * We need to vacate all active ranges that may have been registered for
-	 * the e820 memory map.
-	 */
-	remove_all_active_ranges();
-	for_each_node_mask(i, node_possible_map) {
-		memblock_x86_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
-						nodes[i].end >> PAGE_SHIFT);
-		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-	}
-	setup_physnodes(addr, max_addr, acpi, amd);
-	fake_physnodes(acpi, amd, num_nodes);
-	numa_init_array();
 	return 0;
 }
-#endif /* CONFIG_NUMA_EMU */
 
-void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
-				int acpi, int amd)
+/**
+ * numa_set_distance - Set NUMA distance from one NUMA to another
+ * @from: the 'from' node to set distance
+ * @to: the 'to'  node to set distance
+ * @distance: NUMA distance
+ *
+ * Set the distance from node @from to @to to @distance.  If distance table
+ * doesn't exist, one which is large enough to accommodate all the currently
+ * known nodes will be created.
+ *
+ * If such table cannot be allocated, a warning is printed and further
+ * calls are ignored until the distance table is reset with
+ * numa_reset_distance().
+ *
+ * If @from or @to is higher than the highest known node at the time of
+ * table creation or @distance doesn't make sense, the call is ignored.
+ * This is to allow simplification of specific NUMA config implementations.
+ */
+void __init numa_set_distance(int from, int to, int distance)
 {
-	int i;
-
-	nodes_clear(node_possible_map);
-	nodes_clear(node_online_map);
-
-#ifdef CONFIG_NUMA_EMU
-	setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
-			acpi, amd);
-	if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
+	if (!numa_distance && numa_alloc_distance() < 0)
 		return;
-	setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
-			acpi, amd);
-	nodes_clear(node_possible_map);
-	nodes_clear(node_online_map);
-#endif
 
-#ifdef CONFIG_ACPI_NUMA
-	if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
-						  last_pfn << PAGE_SHIFT))
+	if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
+		printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
+			    from, to, distance);
 		return;
-	nodes_clear(node_possible_map);
-	nodes_clear(node_online_map);
-#endif
+	}
 
-#ifdef CONFIG_AMD_NUMA
-	if (!numa_off && amd && !amd_scan_nodes())
+	if ((u8)distance != distance ||
+	    (from == to && distance != LOCAL_DISTANCE)) {
+		pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
+			     from, to, distance);
 		return;
-	nodes_clear(node_possible_map);
-	nodes_clear(node_online_map);
-#endif
-	printk(KERN_INFO "%s\n",
-	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
+	}
 
-	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
-	       start_pfn << PAGE_SHIFT,
-	       last_pfn << PAGE_SHIFT);
-	/* setup dummy node covering all memory */
-	memnode_shift = 63;
-	memnodemap = memnode.embedded_map;
-	memnodemap[0] = 0;
-	node_set_online(0);
-	node_set(0, node_possible_map);
-	for (i = 0; i < nr_cpu_ids; i++)
-		numa_set_node(i, 0);
-	memblock_x86_register_active_regions(0, start_pfn, last_pfn);
-	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
+	numa_distance[from * numa_distance_cnt + to] = distance;
 }
 
-unsigned long __init numa_free_all_bootmem(void)
+int __node_distance(int from, int to)
 {
-	unsigned long pages = 0;
-	int i;
+	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
+		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
+	return numa_distance[from * numa_distance_cnt + to];
+}
+EXPORT_SYMBOL(__node_distance);
 
-	for_each_online_node(i)
-		pages += free_all_bootmem_node(NODE_DATA(i));
+/*
+ * Sanity check to catch more bad NUMA configurations (they are amazingly
+ * common).  Make sure the nodes cover all memory.
+ */
+static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
+{
+	unsigned long numaram, e820ram;
+	int i;
 
-	pages += free_all_memory_core_early(MAX_NUMNODES);
+	numaram = 0;
+	for (i = 0; i < mi->nr_blks; i++) {
+		unsigned long s = mi->blk[i].start >> PAGE_SHIFT;
+		unsigned long e = mi->blk[i].end >> PAGE_SHIFT;
+		numaram += e - s;
+		numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
+		if ((long)numaram < 0)
+			numaram = 0;
+	}
 
-	return pages;
+	e820ram = max_pfn - (memblock_x86_hole_size(0,
+					max_pfn << PAGE_SHIFT) >> PAGE_SHIFT);
+	/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
+	if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
+		printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
+		       (numaram << PAGE_SHIFT) >> 20,
+		       (e820ram << PAGE_SHIFT) >> 20);
+		return false;
+	}
+	return true;
 }
 
-#ifdef CONFIG_NUMA
-
-static __init int find_near_online_node(int node)
+static int __init numa_register_memblks(struct numa_meminfo *mi)
 {
-	int n, val;
-	int min_val = INT_MAX;
-	int best_node = -1;
+	int i, nid;
 
-	for_each_online_node(n) {
-		val = node_distance(node, n);
+	/* Account for nodes with cpus and no memory */
+	node_possible_map = numa_nodes_parsed;
+	numa_nodemask_from_meminfo(&node_possible_map, mi);
+	if (WARN_ON(nodes_empty(node_possible_map)))
+		return -EINVAL;
+
+	memnode_shift = compute_hash_shift(mi);
+	if (memnode_shift < 0) {
+		printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
+		return -EINVAL;
+	}
 
-		if (val < min_val) {
-			min_val = val;
-			best_node = n;
+	for (i = 0; i < mi->nr_blks; i++)
+		memblock_x86_register_active_regions(mi->blk[i].nid,
+					mi->blk[i].start >> PAGE_SHIFT,
+					mi->blk[i].end >> PAGE_SHIFT);
+
+	/* for out of order entries */
+	sort_node_map();
+	if (!numa_meminfo_cover_memory(mi))
+		return -EINVAL;
+
+	/* Finally register nodes. */
+	for_each_node_mask(nid, node_possible_map) {
+		u64 start = (u64)max_pfn << PAGE_SHIFT;
+		u64 end = 0;
+
+		for (i = 0; i < mi->nr_blks; i++) {
+			if (nid != mi->blk[i].nid)
+				continue;
+			start = min(mi->blk[i].start, start);
+			end = max(mi->blk[i].end, end);
 		}
+
+		if (start < end)
+			setup_node_bootmem(nid, start, end);
 	}
 
-	return best_node;
+	return 0;
 }
 
-/*
- * Setup early cpu_to_node.
+/**
+ * dummy_numma_init - Fallback dummy NUMA init
  *
- * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
- * and apicid_to_node[] tables have valid entries for a CPU.
- * This means we skip cpu_to_node[] initialisation for NUMA
- * emulation and faking node case (when running a kernel compiled
- * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
- * is already initialized in a round robin manner at numa_init_array,
- * prior to this call, and this initialization is good enough
- * for the fake NUMA cases.
+ * Used if there's no underlying NUMA architecture, NUMA initialization
+ * fails, or NUMA is disabled on the command line.
  *
- * Called before the per_cpu areas are setup.
+ * Must online at least one node and add memory blocks that cover all
+ * allowed memory.  This function must not fail.
  */
-void __init init_cpu_to_node(void)
+static int __init dummy_numa_init(void)
 {
-	int cpu;
-	u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
-
-	BUG_ON(cpu_to_apicid == NULL);
+	printk(KERN_INFO "%s\n",
+	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
+	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
+	       0LU, max_pfn << PAGE_SHIFT);
 
-	for_each_possible_cpu(cpu) {
-		int node;
-		u16 apicid = cpu_to_apicid[cpu];
+	node_set(0, numa_nodes_parsed);
+	numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);
 
-		if (apicid == BAD_APICID)
-			continue;
-		node = apicid_to_node[apicid];
-		if (node == NUMA_NO_NODE)
-			continue;
-		if (!node_online(node))
-			node = find_near_online_node(node);
-		numa_set_node(cpu, node);
-	}
+	return 0;
 }
-#endif
 
-
-void __cpuinit numa_set_node(int cpu, int node)
+static int __init numa_init(int (*init_func)(void))
 {
-	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
-
-	/* early setting, no percpu area yet */
-	if (cpu_to_node_map) {
-		cpu_to_node_map[cpu] = node;
-		return;
-	}
-
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
-	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
-		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
-		dump_stack();
-		return;
-	}
-#endif
-	per_cpu(x86_cpu_to_node_map, cpu) = node;
+	int i;
+	int ret;
 
-	if (node != NUMA_NO_NODE)
-		set_cpu_numa_node(cpu, node);
-}
+	for (i = 0; i < MAX_LOCAL_APIC; i++)
+		set_apicid_to_node(i, NUMA_NO_NODE);
 
-void __cpuinit numa_clear_node(int cpu)
-{
-	numa_set_node(cpu, NUMA_NO_NODE);
-}
+	nodes_clear(numa_nodes_parsed);
+	nodes_clear(node_possible_map);
+	nodes_clear(node_online_map);
+	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
+	remove_all_active_ranges();
+	numa_reset_distance();
 
-#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+	ret = init_func();
+	if (ret < 0)
+		return ret;
+	ret = numa_cleanup_meminfo(&numa_meminfo);
+	if (ret < 0)
+		return ret;
 
-#ifndef CONFIG_NUMA_EMU
-void __cpuinit numa_add_cpu(int cpu)
-{
-	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
-}
+	numa_emulation(&numa_meminfo, numa_distance_cnt);
 
-void __cpuinit numa_remove_cpu(int cpu)
-{
-	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
-}
-#else
-void __cpuinit numa_add_cpu(int cpu)
-{
-	unsigned long addr;
-	u16 apicid;
-	int physnid;
-	int nid = NUMA_NO_NODE;
+	ret = numa_register_memblks(&numa_meminfo);
+	if (ret < 0)
+		return ret;
 
-	apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
-	if (apicid != BAD_APICID)
-		nid = apicid_to_node[apicid];
-	if (nid == NUMA_NO_NODE)
-		nid = early_cpu_to_node(cpu);
-	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
-
-	/*
-	 * Use the starting address of the emulated node to find which physical
-	 * node it is allocated on.
-	 */
-	addr = node_start_pfn(nid) << PAGE_SHIFT;
-	for (physnid = 0; physnid < MAX_NUMNODES; physnid++)
-		if (addr >= physnodes[physnid].start &&
-		    addr < physnodes[physnid].end)
-			break;
+	for (i = 0; i < nr_cpu_ids; i++) {
+		int nid = early_cpu_to_node(i);
 
-	/*
-	 * Map the cpu to each emulated node that is allocated on the physical
-	 * node of the cpu's apic id.
-	 */
-	for_each_online_node(nid) {
-		addr = node_start_pfn(nid) << PAGE_SHIFT;
-		if (addr >= physnodes[physnid].start &&
-		    addr < physnodes[physnid].end)
-			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
+		if (nid == NUMA_NO_NODE)
+			continue;
+		if (!node_online(nid))
+			numa_clear_node(i);
 	}
+	numa_init_array();
+	return 0;
 }
 
-void __cpuinit numa_remove_cpu(int cpu)
+void __init initmem_init(void)
 {
-	int i;
+	int ret;
 
-	for_each_online_node(i)
-		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
-}
-#endif /* !CONFIG_NUMA_EMU */
-
-#else /* CONFIG_DEBUG_PER_CPU_MAPS */
-static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
-{
-	int node = early_cpu_to_node(cpu);
-	struct cpumask *mask;
-	char buf[64];
-
-	mask = node_to_cpumask_map[node];
-	if (!mask) {
-		pr_err("node_to_cpumask_map[%i] NULL\n", node);
-		dump_stack();
-		return NULL;
+	if (!numa_off) {
+#ifdef CONFIG_ACPI_NUMA
+		ret = numa_init(x86_acpi_numa_init);
+		if (!ret)
+			return;
+#endif
+#ifdef CONFIG_AMD_NUMA
+		ret = numa_init(amd_numa_init);
+		if (!ret)
+			return;
+#endif
 	}
 
-	cpulist_scnprintf(buf, sizeof(buf), mask);
-	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
-		enable ? "numa_add_cpu" : "numa_remove_cpu",
-		cpu, node, buf);
-	return mask;
+	numa_init(dummy_numa_init);
 }
 
-/*
- * --------- debug versions of the numa functions ---------
- */
-#ifndef CONFIG_NUMA_EMU
-static void __cpuinit numa_set_cpumask(int cpu, int enable)
-{
-	struct cpumask *mask;
-
-	mask = debug_cpumask_set_cpu(cpu, enable);
-	if (!mask)
-		return;
-
-	if (enable)
-		cpumask_set_cpu(cpu, mask);
-	else
-		cpumask_clear_cpu(cpu, mask);
-}
-#else
-static void __cpuinit numa_set_cpumask(int cpu, int enable)
+unsigned long __init numa_free_all_bootmem(void)
 {
-	int node = early_cpu_to_node(cpu);
-	struct cpumask *mask;
+	unsigned long pages = 0;
 	int i;
 
-	for_each_online_node(i) {
-		unsigned long addr;
-
-		addr = node_start_pfn(i) << PAGE_SHIFT;
-		if (addr < physnodes[node].start ||
-					addr >= physnodes[node].end)
-			continue;
-		mask = debug_cpumask_set_cpu(cpu, enable);
-		if (!mask)
-			return;
-
-		if (enable)
-			cpumask_set_cpu(cpu, mask);
-		else
-			cpumask_clear_cpu(cpu, mask);
-	}
-}
-#endif /* CONFIG_NUMA_EMU */
+	for_each_online_node(i)
+		pages += free_all_bootmem_node(NODE_DATA(i));
 
-void __cpuinit numa_add_cpu(int cpu)
-{
-	numa_set_cpumask(cpu, 1);
-}
+	pages += free_all_memory_core_early(MAX_NUMNODES);
 
-void __cpuinit numa_remove_cpu(int cpu)
-{
-	numa_set_cpumask(cpu, 0);
+	return pages;
 }
 
-int __cpu_to_node(int cpu)
+int __cpuinit numa_cpu_node(int cpu)
 {
-	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
-		printk(KERN_WARNING
-			"cpu_to_node(%d): usage too early!\n", cpu);
-		dump_stack();
-		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
-	}
-	return per_cpu(x86_cpu_to_node_map, cpu);
-}
-EXPORT_SYMBOL(__cpu_to_node);
+	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
 
-/*
- * Same function as cpu_to_node() but used if called before the
- * per_cpu areas are setup.
- */
-int early_cpu_to_node(int cpu)
-{
-	if (early_per_cpu_ptr(x86_cpu_to_node_map))
-		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
-
-	if (!cpu_possible(cpu)) {
-		printk(KERN_WARNING
-			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
-		dump_stack();
-		return NUMA_NO_NODE;
-	}
-	return per_cpu(x86_cpu_to_node_map, cpu);
+	if (apicid != BAD_APICID)
+		return __apicid_to_node[apicid];
+	return NUMA_NO_NODE;
 }
-
-/*
- * --------- end of debug versions of the numa functions ---------
- */
-
-#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
new file mode 100644
index 000000000000..ad091e4cff17
--- /dev/null
+++ b/arch/x86/mm/numa_emulation.c
@@ -0,0 +1,494 @@
+/*
+ * NUMA emulation
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/topology.h>
+#include <linux/memblock.h>
+#include <asm/dma.h>
+
+#include "numa_internal.h"
+
+static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
+static char *emu_cmdline __initdata;
+
+void __init numa_emu_cmdline(char *str)
+{
+	emu_cmdline = str;
+}
+
+static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
+{
+	int i;
+
+	for (i = 0; i < mi->nr_blks; i++)
+		if (mi->blk[i].nid == nid)
+			return i;
+	return -ENOENT;
+}
+
+/*
+ * Sets up nid to range from @start to @end.  The return value is -errno if
+ * something went wrong, 0 otherwise.
+ */
+static int __init emu_setup_memblk(struct numa_meminfo *ei,
+				   struct numa_meminfo *pi,
+				   int nid, int phys_blk, u64 size)
+{
+	struct numa_memblk *eb = &ei->blk[ei->nr_blks];
+	struct numa_memblk *pb = &pi->blk[phys_blk];
+
+	if (ei->nr_blks >= NR_NODE_MEMBLKS) {
+		pr_err("NUMA: Too many emulated memblks, failing emulation\n");
+		return -EINVAL;
+	}
+
+	ei->nr_blks++;
+	eb->start = pb->start;
+	eb->end = pb->start + size;
+	eb->nid = nid;
+
+	if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
+		emu_nid_to_phys[nid] = pb->nid;
+
+	pb->start += size;
+	if (pb->start >= pb->end) {
+		WARN_ON_ONCE(pb->start > pb->end);
+		numa_remove_memblk_from(phys_blk, pi);
+	}
+
+	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
+	       eb->start, eb->end, (eb->end - eb->start) >> 20);
+	return 0;
+}
+
+/*
+ * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
+ * to max_addr.  The return value is the number of nodes allocated.
+ */
+static int __init split_nodes_interleave(struct numa_meminfo *ei,
+					 struct numa_meminfo *pi,
+					 u64 addr, u64 max_addr, int nr_nodes)
+{
+	nodemask_t physnode_mask = NODE_MASK_NONE;
+	u64 size;
+	int big;
+	int nid = 0;
+	int i, ret;
+
+	if (nr_nodes <= 0)
+		return -1;
+	if (nr_nodes > MAX_NUMNODES) {
+		pr_info("numa=fake=%d too large, reducing to %d\n",
+			nr_nodes, MAX_NUMNODES);
+		nr_nodes = MAX_NUMNODES;
+	}
+
+	size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes;
+	/*
+	 * Calculate the number of big nodes that can be allocated as a result
+	 * of consolidating the remainder.
+	 */
+	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
+		FAKE_NODE_MIN_SIZE;
+
+	size &= FAKE_NODE_MIN_HASH_MASK;
+	if (!size) {
+		pr_err("Not enough memory for each node.  "
+			"NUMA emulation disabled.\n");
+		return -1;
+	}
+
+	for (i = 0; i < pi->nr_blks; i++)
+		node_set(pi->blk[i].nid, physnode_mask);
+
+	/*
+	 * Continue to fill physical nodes with fake nodes until there is no
+	 * memory left on any of them.
+	 */
+	while (nodes_weight(physnode_mask)) {
+		for_each_node_mask(i, physnode_mask) {
+			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+			u64 start, limit, end;
+			int phys_blk;
+
+			phys_blk = emu_find_memblk_by_nid(i, pi);
+			if (phys_blk < 0) {
+				node_clear(i, physnode_mask);
+				continue;
+			}
+			start = pi->blk[phys_blk].start;
+			limit = pi->blk[phys_blk].end;
+			end = start + size;
+
+			if (nid < big)
+				end += FAKE_NODE_MIN_SIZE;
+
+			/*
+			 * Continue to add memory to this fake node if its
+			 * non-reserved memory is less than the per-node size.
+			 */
+			while (end - start -
+			       memblock_x86_hole_size(start, end) < size) {
+				end += FAKE_NODE_MIN_SIZE;
+				if (end > limit) {
+					end = limit;
+					break;
+				}
+			}
+
+			/*
+			 * If there won't be at least FAKE_NODE_MIN_SIZE of
+			 * non-reserved memory in ZONE_DMA32 for the next node,
+			 * this one must extend to the boundary.
+			 */
+			if (end < dma32_end && dma32_end - end -
+			    memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+				end = dma32_end;
+
+			/*
+			 * If there won't be enough non-reserved memory for the
+			 * next node, this one must extend to the end of the
+			 * physical node.
+			 */
+			if (limit - end -
+			    memblock_x86_hole_size(end, limit) < size)
+				end = limit;
+
+			ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
+					       phys_blk,
+					       min(end, limit) - start);
+			if (ret < 0)
+				return ret;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Returns the end address of a node so that there is at least `size' amount of
+ * non-reserved memory or `max_addr' is reached.
+ */
+static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
+{
+	u64 end = start + size;
+
+	while (end - start - memblock_x86_hole_size(start, end) < size) {
+		end += FAKE_NODE_MIN_SIZE;
+		if (end > max_addr) {
+			end = max_addr;
+			break;
+		}
+	}
+	return end;
+}
+
+/*
+ * Sets up fake nodes of `size' interleaved over physical nodes ranging from
+ * `addr' to `max_addr'.  The return value is the number of nodes allocated.
+ */
+static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
+					      struct numa_meminfo *pi,
+					      u64 addr, u64 max_addr, u64 size)
+{
+	nodemask_t physnode_mask = NODE_MASK_NONE;
+	u64 min_size;
+	int nid = 0;
+	int i, ret;
+
+	if (!size)
+		return -1;
+	/*
+	 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
+	 * increased accordingly if the requested size is too small.  This
+	 * creates a uniform distribution of node sizes across the entire
+	 * machine (but not necessarily over physical nodes).
+	 */
+	min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
+						MAX_NUMNODES;
+	min_size = max(min_size, FAKE_NODE_MIN_SIZE);
+	if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
+		min_size = (min_size + FAKE_NODE_MIN_SIZE) &
+						FAKE_NODE_MIN_HASH_MASK;
+	if (size < min_size) {
+		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
+			size >> 20, min_size >> 20);
+		size = min_size;
+	}
+	size &= FAKE_NODE_MIN_HASH_MASK;
+
+	for (i = 0; i < pi->nr_blks; i++)
+		node_set(pi->blk[i].nid, physnode_mask);
+
+	/*
+	 * Fill physical nodes with fake nodes of size until there is no memory
+	 * left on any of them.
+	 */
+	while (nodes_weight(physnode_mask)) {
+		for_each_node_mask(i, physnode_mask) {
+			u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
+			u64 start, limit, end;
+			int phys_blk;
+
+			phys_blk = emu_find_memblk_by_nid(i, pi);
+			if (phys_blk < 0) {
+				node_clear(i, physnode_mask);
+				continue;
+			}
+			start = pi->blk[phys_blk].start;
+			limit = pi->blk[phys_blk].end;
+
+			end = find_end_of_node(start, limit, size);
+			/*
+			 * If there won't be at least FAKE_NODE_MIN_SIZE of
+			 * non-reserved memory in ZONE_DMA32 for the next node,
+			 * this one must extend to the boundary.
+			 */
+			if (end < dma32_end && dma32_end - end -
+			    memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+				end = dma32_end;
+
+			/*
+			 * If there won't be enough non-reserved memory for the
+			 * next node, this one must extend to the end of the
+			 * physical node.
+			 */
+			if (limit - end -
+			    memblock_x86_hole_size(end, limit) < size)
+				end = limit;
+
+			ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
+					       phys_blk,
+					       min(end, limit) - start);
+			if (ret < 0)
+				return ret;
+		}
+	}
+	return 0;
+}
+
+/**
+ * numa_emulation - Emulate NUMA nodes
+ * @numa_meminfo: NUMA configuration to massage
+ * @numa_dist_cnt: The size of the physical NUMA distance table
+ *
+ * Emulate NUMA nodes according to the numa=fake kernel parameter.
+ * @numa_meminfo contains the physical memory configuration and is modified
+ * to reflect the emulated configuration on success.  @numa_dist_cnt is
+ * used to determine the size of the physical distance table.
+ *
+ * On success, the following modifications are made.
+ *
+ * - @numa_meminfo is updated to reflect the emulated nodes.
+ *
+ * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
+ *   emulated nodes.
+ *
+ * - NUMA distance table is rebuilt to represent distances between emulated
+ *   nodes.  The distances are determined considering how emulated nodes
+ *   are mapped to physical nodes and match the actual distances.
+ *
+ * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
+ *   nodes.  This is used by numa_add_cpu() and numa_remove_cpu().
+ *
+ * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
+ * identity mapping and no other modification is made.
+ */
+void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
+{
+	static struct numa_meminfo ei __initdata;
+	static struct numa_meminfo pi __initdata;
+	const u64 max_addr = max_pfn << PAGE_SHIFT;
+	u8 *phys_dist = NULL;
+	size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
+	int max_emu_nid, dfl_phys_nid;
+	int i, j, ret;
+
+	if (!emu_cmdline)
+		goto no_emu;
+
+	memset(&ei, 0, sizeof(ei));
+	pi = *numa_meminfo;
+
+	for (i = 0; i < MAX_NUMNODES; i++)
+		emu_nid_to_phys[i] = NUMA_NO_NODE;
+
+	/*
+	 * If the numa=fake command-line contains a 'M' or 'G', it represents
+	 * the fixed node size.  Otherwise, if it is just a single number N,
+	 * split the system RAM into N fake nodes.
+	 */
+	if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
+		u64 size;
+
+		size = memparse(emu_cmdline, &emu_cmdline);
+		ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
+	} else {
+		unsigned long n;
+
+		n = simple_strtoul(emu_cmdline, NULL, 0);
+		ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
+	}
+
+	if (ret < 0)
+		goto no_emu;
+
+	if (numa_cleanup_meminfo(&ei) < 0) {
+		pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
+		goto no_emu;
+	}
+
+	/* copy the physical distance table */
+	if (numa_dist_cnt) {
+		u64 phys;
+
+		phys = memblock_find_in_range(0,
+					      (u64)max_pfn_mapped << PAGE_SHIFT,
+					      phys_size, PAGE_SIZE);
+		if (phys == MEMBLOCK_ERROR) {
+			pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
+			goto no_emu;
+		}
+		memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST");
+		phys_dist = __va(phys);
+
+		for (i = 0; i < numa_dist_cnt; i++)
+			for (j = 0; j < numa_dist_cnt; j++)
+				phys_dist[i * numa_dist_cnt + j] =
+					node_distance(i, j);
+	}
+
+	/*
+	 * Determine the max emulated nid and the default phys nid to use
+	 * for unmapped nodes.
+	 */
+	max_emu_nid = 0;
+	dfl_phys_nid = NUMA_NO_NODE;
+	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
+		if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
+			max_emu_nid = i;
+			if (dfl_phys_nid == NUMA_NO_NODE)
+				dfl_phys_nid = emu_nid_to_phys[i];
+		}
+	}
+	if (dfl_phys_nid == NUMA_NO_NODE) {
+		pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
+		goto no_emu;
+	}
+
+	/* commit */
+	*numa_meminfo = ei;
+
+	/*
+	 * Transform __apicid_to_node table to use emulated nids by
+	 * reverse-mapping phys_nid.  The maps should always exist but fall
+	 * back to zero just in case.
+	 */
+	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
+		if (__apicid_to_node[i] == NUMA_NO_NODE)
+			continue;
+		for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
+			if (__apicid_to_node[i] == emu_nid_to_phys[j])
+				break;
+		__apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
+	}
+
+	/* make sure all emulated nodes are mapped to a physical node */
+	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
+			emu_nid_to_phys[i] = dfl_phys_nid;
+
+	/* transform distance table */
+	numa_reset_distance();
+	for (i = 0; i < max_emu_nid + 1; i++) {
+		for (j = 0; j < max_emu_nid + 1; j++) {
+			int physi = emu_nid_to_phys[i];
+			int physj = emu_nid_to_phys[j];
+			int dist;
+
+			if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
+				dist = physi == physj ?
+					LOCAL_DISTANCE : REMOTE_DISTANCE;
+			else
+				dist = phys_dist[physi * numa_dist_cnt + physj];
+
+			numa_set_distance(i, j, dist);
+		}
+	}
+
+	/* free the copied physical distance table */
+	if (phys_dist)
+		memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size);
+	return;
+
+no_emu:
+	/* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
+	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+		emu_nid_to_phys[i] = i;
+}
+
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+void __cpuinit numa_add_cpu(int cpu)
+{
+	int physnid, nid;
+
+	nid = early_cpu_to_node(cpu);
+	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
+
+	physnid = emu_nid_to_phys[nid];
+
+	/*
+	 * Map the cpu to each emulated node that is allocated on the physical
+	 * node of the cpu's apic id.
+	 */
+	for_each_online_node(nid)
+		if (emu_nid_to_phys[nid] == physnid)
+			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	int i;
+
+	for_each_online_node(i)
+		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
+}
+#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+	struct cpumask *mask;
+	int nid, physnid, i;
+
+	nid = early_cpu_to_node(cpu);
+	if (nid == NUMA_NO_NODE) {
+		/* early_cpu_to_node() already emits a warning and trace */
+		return;
+	}
+
+	physnid = emu_nid_to_phys[nid];
+
+	for_each_online_node(i) {
+		if (emu_nid_to_phys[nid] != physnid)
+			continue;
+
+		mask = debug_cpumask_set_cpu(cpu, enable);
+		if (!mask)
+			return;
+
+		if (enable)
+			cpumask_set_cpu(cpu, mask);
+		else
+			cpumask_clear_cpu(cpu, mask);
+	}
+}
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, 1);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, 0);
+}
+#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
new file mode 100644
index 000000000000..ef2d97377d7c
--- /dev/null
+++ b/arch/x86/mm/numa_internal.h
@@ -0,0 +1,31 @@
+#ifndef __X86_MM_NUMA_INTERNAL_H
+#define __X86_MM_NUMA_INTERNAL_H
+
+#include <linux/types.h>
+#include <asm/numa.h>
+
+struct numa_memblk {
+	u64			start;
+	u64			end;
+	int			nid;
+};
+
+struct numa_meminfo {
+	int			nr_blks;
+	struct numa_memblk	blk[NR_NODE_MEMBLKS];
+};
+
+void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
+void __init numa_reset_distance(void);
+
+#ifdef CONFIG_NUMA_EMU
+void __init numa_emulation(struct numa_meminfo *numa_meminfo,
+			   int numa_dist_cnt);
+#else
+static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
+				  int numa_dist_cnt)
+{ }
+#endif
+
+#endif	/* __X86_MM_NUMA_INTERNAL_H */
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 8b830ca14ac4..f9e526742fa1 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -57,12 +57,10 @@ static unsigned long direct_pages_count[PG_LEVEL_NUM];
 
 void update_page_count(int level, unsigned long pages)
 {
-	unsigned long flags;
-
 	/* Protect against CPA */
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 	direct_pages_count[level] += pages;
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 static void split_page_count(int level)
@@ -256,7 +254,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 				   unsigned long pfn)
 {
 	pgprot_t forbidden = __pgprot(0);
-	pgprot_t required = __pgprot(0);
 
 	/*
 	 * The BIOS area between 640k and 1Mb needs to be executable for
@@ -282,12 +279,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 	if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
 		   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
 		pgprot_val(forbidden) |= _PAGE_RW;
-	/*
-	 * .data and .bss should always be writable.
-	 */
-	if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
-	    within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
-		pgprot_val(required) |= _PAGE_RW;
 
 #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
 	/*
@@ -319,7 +310,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 		 * these shared mappings are made of small page mappings.
 		 * Thus this don't enforce !RW mapping for small page kernel
 		 * text mapping logic will help Linux Xen parvirt guest boot
-		 * aswell.
+		 * as well.
 		 */
 		if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
 			pgprot_val(forbidden) |= _PAGE_RW;
@@ -327,7 +318,6 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 #endif
 
 	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
-	prot = __pgprot(pgprot_val(prot) | pgprot_val(required));
 
 	return prot;
 }
@@ -402,7 +392,7 @@ static int
 try_preserve_large_page(pte_t *kpte, unsigned long address,
 			struct cpa_data *cpa)
 {
-	unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
+	unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
 	pte_t new_pte, old_pte, *tmp;
 	pgprot_t old_prot, new_prot, req_prot;
 	int i, do_split = 1;
@@ -411,7 +401,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 	if (cpa->force_split)
 		return 1;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 	/*
 	 * Check for races, another CPU might have split this page
 	 * up already:
@@ -506,14 +496,14 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 	}
 
 out_unlock:
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 
 	return do_split;
 }
 
 static int split_large_page(pte_t *kpte, unsigned long address)
 {
-	unsigned long flags, pfn, pfninc = 1;
+	unsigned long pfn, pfninc = 1;
 	unsigned int i, level;
 	pte_t *pbase, *tmp;
 	pgprot_t ref_prot;
@@ -527,7 +517,7 @@ static int split_large_page(pte_t *kpte, unsigned long address)
 	if (!base)
 		return -ENOMEM;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 	/*
 	 * Check for races, another CPU might have split this page
 	 * up for us already:
@@ -599,7 +589,7 @@ out_unlock:
 	 */
 	if (base)
 		__free_page(base);
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 
 	return 0;
 }
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 500242d3c96d..8573b83a63d0 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -121,14 +121,12 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
 
 static void pgd_dtor(pgd_t *pgd)
 {
-	unsigned long flags; /* can be called from interrupt context */
-
 	if (SHARED_KERNEL_PMD)
 		return;
 
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 	pgd_list_del(pgd);
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 }
 
 /*
@@ -170,8 +168,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
 	 * section 8.1: in PAE mode we explicitly have to flush the
 	 * TLB via cr3 if the top-level pgd is changed...
 	 */
-	if (mm == current->active_mm)
-		write_cr3(read_cr3());
+	flush_tlb_mm(mm);
 }
 #else  /* !CONFIG_X86_PAE */
 
@@ -260,7 +257,6 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	pgd_t *pgd;
 	pmd_t *pmds[PREALLOCATED_PMDS];
-	unsigned long flags;
 
 	pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
 
@@ -280,12 +276,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	 * respect to anything walking the pgd_list, so that they
 	 * never see a partially populated pgd.
 	 */
-	spin_lock_irqsave(&pgd_lock, flags);
+	spin_lock(&pgd_lock);
 
 	pgd_ctor(mm, pgd);
 	pgd_prepopulate_pmd(mm, pgd, pmds);
 
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	spin_unlock(&pgd_lock);
 
 	return pgd;
 
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index ae96e7b8051d..48651c6f657d 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -57,7 +57,7 @@ struct node_memory_chunk_s {
 static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
 
 static int __initdata num_memory_chunks; /* total number of memory chunks */
-static u8 __initdata apicid_to_pxm[MAX_APICID];
+static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC];
 
 int acpi_numa __initdata;
 
@@ -254,8 +254,8 @@ int __init get_memcfg_from_srat(void)
 	printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
 			 num_memory_chunks);
 
-	for (i = 0; i < MAX_APICID; i++)
-		apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
+	for (i = 0; i < MAX_LOCAL_APIC; i++)
+		set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i]));
 
 	for (j = 0; j < num_memory_chunks; j++){
 		struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 603d285d1daa..8e9d3394f6d4 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -26,88 +26,34 @@
 
 int acpi_numa __initdata;
 
-static struct acpi_table_slit *acpi_slit;
-
-static nodemask_t nodes_parsed __initdata;
-static nodemask_t cpu_nodes_parsed __initdata;
-static struct bootnode nodes[MAX_NUMNODES] __initdata;
 static struct bootnode nodes_add[MAX_NUMNODES];
 
-static int num_node_memblks __initdata;
-static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
-static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
-
 static __init int setup_node(int pxm)
 {
 	return acpi_map_pxm_to_node(pxm);
 }
 
-static __init int conflicting_memblks(unsigned long start, unsigned long end)
-{
-	int i;
-	for (i = 0; i < num_node_memblks; i++) {
-		struct bootnode *nd = &node_memblk_range[i];
-		if (nd->start == nd->end)
-			continue;
-		if (nd->end > start && nd->start < end)
-			return memblk_nodeid[i];
-		if (nd->end == end && nd->start == start)
-			return memblk_nodeid[i];
-	}
-	return -1;
-}
-
-static __init void cutoff_node(int i, unsigned long start, unsigned long end)
-{
-	struct bootnode *nd = &nodes[i];
-
-	if (nd->start < start) {
-		nd->start = start;
-		if (nd->end < nd->start)
-			nd->start = nd->end;
-	}
-	if (nd->end > end) {
-		nd->end = end;
-		if (nd->start > nd->end)
-			nd->start = nd->end;
-	}
-}
-
 static __init void bad_srat(void)
 {
-	int i;
 	printk(KERN_ERR "SRAT: SRAT not used.\n");
 	acpi_numa = -1;
-	for (i = 0; i < MAX_LOCAL_APIC; i++)
-		apicid_to_node[i] = NUMA_NO_NODE;
-	for (i = 0; i < MAX_NUMNODES; i++) {
-		nodes[i].start = nodes[i].end = 0;
-		nodes_add[i].start = nodes_add[i].end = 0;
-	}
-	remove_all_active_ranges();
+	memset(nodes_add, 0, sizeof(nodes_add));
 }
 
 static __init inline int srat_disabled(void)
 {
-	return numa_off || acpi_numa < 0;
+	return acpi_numa < 0;
 }
 
 /* Callback for SLIT parsing */
 void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
 {
-	unsigned length;
-	unsigned long phys;
-
-	length = slit->header.length;
-	phys = memblock_find_in_range(0, max_pfn_mapped<<PAGE_SHIFT, length,
-		 PAGE_SIZE);
-
-	if (phys == MEMBLOCK_ERROR)
-		panic(" Can not save slit!\n");
+	int i, j;
 
-	acpi_slit = __va(phys);
-	memcpy(acpi_slit, slit, length);
-	memblock_x86_reserve_range(phys, phys + length, "ACPI SLIT");
+	for (i = 0; i < slit->locality_count; i++)
+		for (j = 0; j < slit->locality_count; j++)
+			numa_set_distance(pxm_to_node(i), pxm_to_node(j),
+				slit->entry[slit->locality_count * i + j]);
 }
 
 /* Callback for Proximity Domain -> x2APIC mapping */
@@ -138,8 +84,8 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
 		printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
 		return;
 	}
-	apicid_to_node[apic_id] = node;
-	node_set(node, cpu_nodes_parsed);
+	set_apicid_to_node(apic_id, node);
+	node_set(node, numa_nodes_parsed);
 	acpi_numa = 1;
 	printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
 	       pxm, apic_id, node);
@@ -178,8 +124,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 		return;
 	}
 
-	apicid_to_node[apic_id] = node;
-	node_set(node, cpu_nodes_parsed);
+	set_apicid_to_node(apic_id, node);
+	node_set(node, numa_nodes_parsed);
 	acpi_numa = 1;
 	printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
 	       pxm, apic_id, node);
@@ -241,7 +187,7 @@ update_nodes_add(int node, unsigned long start, unsigned long end)
 	}
 
 	if (changed) {
-		node_set(node, cpu_nodes_parsed);
+		node_set(node, numa_nodes_parsed);
 		printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
 				 nd->start, nd->end);
 	}
@@ -251,10 +197,8 @@ update_nodes_add(int node, unsigned long start, unsigned long end)
 void __init
 acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 {
-	struct bootnode *nd, oldnode;
 	unsigned long start, end;
 	int node, pxm;
-	int i;
 
 	if (srat_disabled())
 		return;
@@ -276,300 +220,31 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 		bad_srat();
 		return;
 	}
-	i = conflicting_memblks(start, end);
-	if (i == node) {
-		printk(KERN_WARNING
-		"SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
-			pxm, start, end, nodes[i].start, nodes[i].end);
-	} else if (i >= 0) {
-		printk(KERN_ERR
-		       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
-		       pxm, start, end, node_to_pxm(i),
-			nodes[i].start, nodes[i].end);
+
+	if (numa_add_memblk(node, start, end) < 0) {
 		bad_srat();
 		return;
 	}
-	nd = &nodes[node];
-	oldnode = *nd;
-	if (!node_test_and_set(node, nodes_parsed)) {
-		nd->start = start;
-		nd->end = end;
-	} else {
-		if (start < nd->start)
-			nd->start = start;
-		if (nd->end < end)
-			nd->end = end;
-	}
 
 	printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
 	       start, end);
 
-	if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
+	if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)
 		update_nodes_add(node, start, end);
-		/* restore nodes[node] */
-		*nd = oldnode;
-		if ((nd->start | nd->end) == 0)
-			node_clear(node, nodes_parsed);
-	}
-
-	node_memblk_range[num_node_memblks].start = start;
-	node_memblk_range[num_node_memblks].end = end;
-	memblk_nodeid[num_node_memblks] = node;
-	num_node_memblks++;
-}
-
-/* Sanity check to catch more bad SRATs (they are amazingly common).
-   Make sure the PXMs cover all memory. */
-static int __init nodes_cover_memory(const struct bootnode *nodes)
-{
-	int i;
-	unsigned long pxmram, e820ram;
-
-	pxmram = 0;
-	for_each_node_mask(i, nodes_parsed) {
-		unsigned long s = nodes[i].start >> PAGE_SHIFT;
-		unsigned long e = nodes[i].end >> PAGE_SHIFT;
-		pxmram += e - s;
-		pxmram -= __absent_pages_in_range(i, s, e);
-		if ((long)pxmram < 0)
-			pxmram = 0;
-	}
-
-	e820ram = max_pfn - (memblock_x86_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
-	/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
-	if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
-		printk(KERN_ERR
-	"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
-			(pxmram << PAGE_SHIFT) >> 20,
-			(e820ram << PAGE_SHIFT) >> 20);
-		return 0;
-	}
-	return 1;
 }
 
 void __init acpi_numa_arch_fixup(void) {}
 
-#ifdef CONFIG_NUMA_EMU
-void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
-				unsigned long end)
-{
-	int i;
-
-	for_each_node_mask(i, nodes_parsed) {
-		cutoff_node(i, start, end);
-		physnodes[i].start = nodes[i].start;
-		physnodes[i].end = nodes[i].end;
-	}
-}
-#endif /* CONFIG_NUMA_EMU */
-
-/* Use the information discovered above to actually set up the nodes. */
-int __init acpi_scan_nodes(unsigned long start, unsigned long end)
+int __init x86_acpi_numa_init(void)
 {
-	int i;
-
-	if (acpi_numa <= 0)
-		return -1;
-
-	/* First clean up the node list */
-	for (i = 0; i < MAX_NUMNODES; i++)
-		cutoff_node(i, start, end);
-
-	/*
-	 * Join together blocks on the same node, holes between
-	 * which don't overlap with memory on other nodes.
-	 */
-	for (i = 0; i < num_node_memblks; ++i) {
-		int j, k;
-
-		for (j = i + 1; j < num_node_memblks; ++j) {
-			unsigned long start, end;
-
-			if (memblk_nodeid[i] != memblk_nodeid[j])
-				continue;
-			start = min(node_memblk_range[i].end,
-			            node_memblk_range[j].end);
-			end = max(node_memblk_range[i].start,
-			          node_memblk_range[j].start);
-			for (k = 0; k < num_node_memblks; ++k) {
-				if (memblk_nodeid[i] == memblk_nodeid[k])
-					continue;
-				if (start < node_memblk_range[k].end &&
-				    end > node_memblk_range[k].start)
-					break;
-			}
-			if (k < num_node_memblks)
-				continue;
-			start = min(node_memblk_range[i].start,
-			            node_memblk_range[j].start);
-			end = max(node_memblk_range[i].end,
-			          node_memblk_range[j].end);
-			printk(KERN_INFO "SRAT: Node %d "
-			       "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
-			       memblk_nodeid[i],
-			       node_memblk_range[i].start,
-			       node_memblk_range[i].end,
-			       node_memblk_range[j].start,
-			       node_memblk_range[j].end,
-			       start, end);
-			node_memblk_range[i].start = start;
-			node_memblk_range[i].end = end;
-			k = --num_node_memblks - j;
-			memmove(memblk_nodeid + j, memblk_nodeid + j+1,
-				k * sizeof(*memblk_nodeid));
-			memmove(node_memblk_range + j, node_memblk_range + j+1,
-				k * sizeof(*node_memblk_range));
-			--j;
-		}
-	}
-
-	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
-					   memblk_nodeid);
-	if (memnode_shift < 0) {
-		printk(KERN_ERR
-		     "SRAT: No NUMA node hash function found. Contact maintainer\n");
-		bad_srat();
-		return -1;
-	}
-
-	for (i = 0; i < num_node_memblks; i++)
-		memblock_x86_register_active_regions(memblk_nodeid[i],
-				node_memblk_range[i].start >> PAGE_SHIFT,
-				node_memblk_range[i].end >> PAGE_SHIFT);
-
-	/* for out of order entries in SRAT */
-	sort_node_map();
-	if (!nodes_cover_memory(nodes)) {
-		bad_srat();
-		return -1;
-	}
+	int ret;
 
-	/* Account for nodes with cpus and no memory */
-	nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
-
-	/* Finally register nodes */
-	for_each_node_mask(i, node_possible_map)
-		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-	/* Try again in case setup_node_bootmem missed one due
-	   to missing bootmem */
-	for_each_node_mask(i, node_possible_map)
-		if (!node_online(i))
-			setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-
-	for (i = 0; i < nr_cpu_ids; i++) {
-		int node = early_cpu_to_node(i);
-
-		if (node == NUMA_NO_NODE)
-			continue;
-		if (!node_online(node))
-			numa_clear_node(i);
-	}
-	numa_init_array();
-	return 0;
-}
-
-#ifdef CONFIG_NUMA_EMU
-static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
-	[0 ... MAX_NUMNODES-1] = PXM_INVAL
-};
-static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
-	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
-};
-static int __init find_node_by_addr(unsigned long addr)
-{
-	int ret = NUMA_NO_NODE;
-	int i;
-
-	for_each_node_mask(i, nodes_parsed) {
-		/*
-		 * Find the real node that this emulated node appears on.  For
-		 * the sake of simplicity, we only use a real node's starting
-		 * address to determine which emulated node it appears on.
-		 */
-		if (addr >= nodes[i].start && addr < nodes[i].end) {
-			ret = i;
-			break;
-		}
-	}
-	return ret;
+	ret = acpi_numa_init();
+	if (ret < 0)
+		return ret;
+	return srat_disabled() ? -EINVAL : 0;
 }
 
-/*
- * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
- * mappings that respect the real ACPI topology but reflect our emulated
- * environment.  For each emulated node, we find which real node it appears on
- * and create PXM to NID mappings for those fake nodes which mirror that
- * locality.  SLIT will now represent the correct distances between emulated
- * nodes as a result of the real topology.
- */
-void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
-{
-	int i, j;
-
-	for (i = 0; i < num_nodes; i++) {
-		int nid, pxm;
-
-		nid = find_node_by_addr(fake_nodes[i].start);
-		if (nid == NUMA_NO_NODE)
-			continue;
-		pxm = node_to_pxm(nid);
-		if (pxm == PXM_INVAL)
-			continue;
-		fake_node_to_pxm_map[i] = pxm;
-		/*
-		 * For each apicid_to_node mapping that exists for this real
-		 * node, it must now point to the fake node ID.
-		 */
-		for (j = 0; j < MAX_LOCAL_APIC; j++)
-			if (apicid_to_node[j] == nid &&
-			    fake_apicid_to_node[j] == NUMA_NO_NODE)
-				fake_apicid_to_node[j] = i;
-	}
-
-	/*
-	 * If there are apicid-to-node mappings for physical nodes that do not
-	 * have a corresponding emulated node, it should default to a guaranteed
-	 * value.
-	 */
-	for (i = 0; i < MAX_LOCAL_APIC; i++)
-		if (apicid_to_node[i] != NUMA_NO_NODE &&
-		    fake_apicid_to_node[i] == NUMA_NO_NODE)
-			fake_apicid_to_node[i] = 0;
-
-	for (i = 0; i < num_nodes; i++)
-		__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
-	memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
-
-	nodes_clear(nodes_parsed);
-	for (i = 0; i < num_nodes; i++)
-		if (fake_nodes[i].start != fake_nodes[i].end)
-			node_set(i, nodes_parsed);
-}
-
-static int null_slit_node_compare(int a, int b)
-{
-	return node_to_pxm(a) == node_to_pxm(b);
-}
-#else
-static int null_slit_node_compare(int a, int b)
-{
-	return a == b;
-}
-#endif /* CONFIG_NUMA_EMU */
-
-int __node_distance(int a, int b)
-{
-	int index;
-
-	if (!acpi_slit)
-		return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
-						      REMOTE_DISTANCE;
-	index = acpi_slit->locality_count * node_to_pxm(a);
-	return acpi_slit->entry[index + node_to_pxm(b)];
-}
-
-EXPORT_SYMBOL(__node_distance);
-
 #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
 int memory_add_physaddr_to_nid(u64 start)
 {
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6acc724d5d8f..d6c0418c3e47 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -179,12 +179,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
 	sender = this_cpu_read(tlb_vector_offset);
 	f = &flush_state[sender];
 
-	/*
-	 * Could avoid this lock when
-	 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
-	 * probably not worth checking this for a cache-hot lock.
-	 */
-	raw_spin_lock(&f->tlbstate_lock);
+	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
+		raw_spin_lock(&f->tlbstate_lock);
 
 	f->flush_mm = mm;
 	f->flush_va = va;
@@ -202,7 +198,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
 
 	f->flush_mm = NULL;
 	f->flush_va = 0;
-	raw_spin_unlock(&f->tlbstate_lock);
+	if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
+		raw_spin_unlock(&f->tlbstate_lock);
 }
 
 void native_flush_tlb_others(const struct cpumask *cpumask,
@@ -211,11 +208,10 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 	if (is_uv_system()) {
 		unsigned int cpu;
 
-		cpu = get_cpu();
+		cpu = smp_processor_id();
 		cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
 		if (cpumask)
 			flush_tlb_others_ipi(cpumask, mm, va);
-		put_cpu();
 		return;
 	}
 	flush_tlb_others_ipi(cpumask, mm, va);
author	Artem Bityutskiy <Artem.Bityutskiy@nokia.com>	2011-03-25 18:41:20 +0300
committer	Artem Bityutskiy <Artem.Bityutskiy@nokia.com>	2011-03-25 18:41:20 +0300
commit	7bf7e370d5919112c223a269462cd0b546903829 (patch)
tree	03ccc715239df14ae168277dbccc9d9cf4d8a2c8 /arch/x86/mm
parent	68b1a1e786f29c900fa1c516a402e24f0ece622a (diff)
parent	d39dd11c3e6a7af5c20bfac40594db36cf270f42 (diff)
download	linux-7bf7e370d5919112c223a269462cd0b546903829.tar.xz