Merge branch 'linus' into core/urgent

author: Ingo Molnar <mingo@elte.hu> 2009-01-04 12:59:36 +0300
committer: Ingo Molnar <mingo@elte.hu> 2009-01-04 12:59:36 +0300
commit: 4010b0192ddf6ec7ec1b9feb9b0953692aeb7329 (patch)
tree: 188a36186f6ce580b479a9f90404fa7bfd8b22d7 /arch
parent: 79ff56ebd3edfb16f8badc558cb439b203a3298f (diff)
parent: 7d3b56ba37a95f1f370f50258ed3954c304c524b (diff)
download: linux-4010b0192ddf6ec7ec1b9feb9b0953692aeb7329.tar.xz
123 files changed, 1795 insertions, 803 deletions
diff --git a/arch/alpha/include/asm/topology.h b/arch/alpha/include/asm/topology.h
index 149532e162c4..b4f284c72ff3 100644
--- a/arch/alpha/include/asm/topology.h
+++ b/arch/alpha/include/asm/topology.h
@@ -39,7 +39,24 @@ static inline cpumask_t node_to_cpumask(int node)
 	return node_cpu_mask;
 }
 
+extern struct cpumask node_to_cpumask_map[];
+/* FIXME: This is dumb, recalculating every time.  But simple. */
+static inline const struct cpumask *cpumask_of_node(int node)
+{
+	int cpu;
+
+	cpumask_clear(&node_to_cpumask_map[node]);
+	/* Rebuild this node's mask from the CPUs that are currently online. */
+	for_each_online_cpu(cpu) {
+		if (cpu_to_node(cpu) == node)
+			cpumask_set_cpu(cpu, &node_to_cpumask_map[node]);
+	}
+
+	return &node_to_cpumask_map[node];
+}
+
 #define pcibus_to_cpumask(bus)	(cpu_online_map)
+#define cpumask_of_pcibus(bus)	(cpu_online_mask)
 
 #endif /* !CONFIG_NUMA */
 # include <asm-generic/topology.h>
diff --git a/arch/alpha/kernel/Makefile b/arch/alpha/kernel/Makefile
index ac706c1d7ada..b4697759a123 100644
--- a/arch/alpha/kernel/Makefile
+++ b/arch/alpha/kernel/Makefile
@@ -8,7 +8,7 @@ EXTRA_CFLAGS	:= -Werror -Wno-sign-compare
 
 obj-y    := entry.o traps.o process.o init_task.o osf_sys.o irq.o \
 	    irq_alpha.o signal.o setup.o ptrace.o time.o \
-	    alpha_ksyms.o systbls.o err_common.o io.o
+	    alpha_ksyms.o systbls.o err_common.o io.o binfmt_loader.o
 
 obj-$(CONFIG_VGA_HOSE)	+= console.o
 obj-$(CONFIG_SMP)	+= smp.o
diff --git a/arch/alpha/kernel/binfmt_loader.c b/arch/alpha/kernel/binfmt_loader.c
new file mode 100644
index 000000000000..4a0af906b00a
--- /dev/null
+++ b/arch/alpha/kernel/binfmt_loader.c
@@ -0,0 +1,51 @@
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mm_types.h>
+#include <linux/binfmts.h>
+#include <linux/a.out.h>
+
+static int load_binary(struct linux_binprm *bprm, struct pt_regs *regs)
+{
+	struct exec *eh = (struct exec *)bprm->buf;
+	unsigned long loader;
+	struct file *file;
+	int retval;
+
+	if (eh->fh.f_magic != 0x183 || (eh->fh.f_flags & 0x3000) != 0x3000)
+		return -ENOEXEC;	/* header magic/flags do not match: not ours */
+
+	if (bprm->loader)
+		return -ENOEXEC;	/* a loader value is already recorded in bprm */
+
+	allow_write_access(bprm->file);
+	fput(bprm->file);
+	bprm->file = NULL;	/* original file released above; refilled below */
+
+	loader = bprm->vma->vm_end - sizeof(void *);	/* one pointer below vm_end */
+
+	file = open_exec("/sbin/loader");
+	retval = PTR_ERR(file);
+	if (IS_ERR(file))
+		return retval;	/* open_exec() failed: pass its error code up */
+
+	/* Remember if the application is TASO.  */
+	bprm->taso = eh->ah.entry < 0x100000000UL;
+
+	bprm->file = file;
+	bprm->loader = loader;
+	retval = prepare_binprm(bprm);
+	if (retval < 0)
+		return retval;
+	return search_binary_handler(bprm,regs);	/* run the handler search on the loader image */
+}
+
+static struct linux_binfmt loader_format = {
+	.load_binary	= load_binary,	/* the only callback this format supplies */
+};
+
+static int __init init_loader_binfmt(void)
+{
+	return register_binfmt(&loader_format);	/* registration result is the initcall's return value */
+}
+arch_initcall(init_loader_binfmt);
diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c
index d0f1620007f7..703731accda6 100644
--- a/arch/alpha/kernel/irq.c
+++ b/arch/alpha/kernel/irq.c
@@ -50,7 +50,8 @@ int irq_select_affinity(unsigned int irq)
 	if (!irq_desc[irq].chip->set_affinity || irq_user_affinity[irq])
 		return 1;
 
-	while (!cpu_possible(cpu) || !cpu_isset(cpu, irq_default_affinity))
+	while (!cpu_possible(cpu) ||
+	       !cpumask_test_cpu(cpu, irq_default_affinity))
 		cpu = (cpu < (NR_CPUS-1) ? cpu + 1 : 0);
 	last_cpu = cpu;
 
diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c
index a449e999027c..02bee6983ce2 100644
--- a/arch/alpha/kernel/setup.c
+++ b/arch/alpha/kernel/setup.c
@@ -79,6 +79,11 @@ int alpha_l3_cacheshape;
 unsigned long alpha_verbose_mcheck = CONFIG_VERBOSE_MCHECK_ON;
 #endif
 
+#ifdef CONFIG_NUMA
+struct cpumask node_to_cpumask_map[MAX_NUMNODES] __read_mostly;
+EXPORT_SYMBOL(node_to_cpumask_map);
+#endif
+
 /* Which processor we booted from.  */
 int boot_cpuid;
 
diff --git a/arch/avr32/include/asm/bitops.h b/arch/avr32/include/asm/bitops.h
index 1a50b69b1a19..f7dd5f71edf7 100644
--- a/arch/avr32/include/asm/bitops.h
+++ b/arch/avr32/include/asm/bitops.h
@@ -263,6 +263,11 @@ static inline int fls(unsigned long word)
 	return 32 - result;
 }
 
+static inline int __fls(unsigned long word)
+{
+	return fls(word) - 1;	/* fls() minus one; NOTE(review): negative if fls(word) is 0 -- confirm callers */
+}
+
 unsigned long find_first_zero_bit(const unsigned long *addr,
 				  unsigned long size);
 unsigned long find_next_zero_bit(const unsigned long *addr,
diff --git a/arch/blackfin/include/asm/bitops.h b/arch/blackfin/include/asm/bitops.h
index b39a175c79c1..c428e4106f89 100644
--- a/arch/blackfin/include/asm/bitops.h
+++ b/arch/blackfin/include/asm/bitops.h
@@ -213,6 +213,7 @@ static __inline__ int __test_bit(int nr, const void *addr)
 #endif				/* __KERNEL__ */
 
 #include <asm-generic/bitops/fls.h>
+#include <asm-generic/bitops/__fls.h>
 #include <asm-generic/bitops/fls64.h>
 
 #endif				/* _BLACKFIN_BITOPS_H */
diff --git a/arch/cris/include/asm/bitops.h b/arch/cris/include/asm/bitops.h
index c0e62f811e09..9e69cfb7f134 100644
--- a/arch/cris/include/asm/bitops.h
+++ b/arch/cris/include/asm/bitops.h
@@ -148,6 +148,7 @@ static inline int test_and_change_bit(int nr, volatile unsigned long *addr)
 #define ffs kernel_ffs
 
 #include <asm-generic/bitops/fls.h>
+#include <asm-generic/bitops/__fls.h>
 #include <asm-generic/bitops/fls64.h>
 #include <asm-generic/bitops/hweight.h>
 #include <asm-generic/bitops/find.h>
diff --git a/arch/h8300/include/asm/bitops.h b/arch/h8300/include/asm/bitops.h
index cb18e3b0aa94..cb9ddf5fc54f 100644
--- a/arch/h8300/include/asm/bitops.h
+++ b/arch/h8300/include/asm/bitops.h
@@ -207,6 +207,7 @@ static __inline__ unsigned long __ffs(unsigned long word)
 #endif /* __KERNEL__ */
 
 #include <asm-generic/bitops/fls.h>
+#include <asm-generic/bitops/__fls.h>
 #include <asm-generic/bitops/fls64.h>
 
 #endif /* _H8300_BITOPS_H */
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 7fa8f615ba6e..3d31636cbafb 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -687,3 +687,6 @@ config IRQ_PER_CPU
 
 config IOMMU_HELPER
 	def_bool (IA64_HP_ZX1 || IA64_HP_ZX1_SWIOTLB || IA64_GENERIC || SWIOTLB)
+
+config IOMMU_API
+	def_bool (DMAR)
diff --git a/arch/ia64/include/asm/irq.h b/arch/ia64/include/asm/irq.h
index 3627116fb0e2..36429a532630 100644
--- a/arch/ia64/include/asm/irq.h
+++ b/arch/ia64/include/asm/irq.h
@@ -27,7 +27,7 @@ irq_canonicalize (int irq)
 }
 
 extern void set_irq_affinity_info (unsigned int irq, int dest, int redir);
-bool is_affinity_mask_valid(cpumask_t cpumask);
+bool is_affinity_mask_valid(cpumask_var_t cpumask);
 
 #define is_affinity_mask_valid is_affinity_mask_valid
 
diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 0560f3fae538..348663661659 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -467,7 +467,7 @@ struct kvm_arch {
 	struct kvm_sal_data rdv_sal_data;
 
 	struct list_head assigned_dev_head;
-	struct dmar_domain *intel_iommu_domain;
+	struct iommu_domain *iommu_domain;
 	struct hlist_head irq_ack_notifier_list;
 
 	unsigned long irq_sources_bitmap;
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index a3cc9f65f954..76a33a91ca69 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -34,6 +34,7 @@
  * Returns a bitmask of CPUs on Node 'node'.
  */
 #define node_to_cpumask(node) (node_to_cpu_mask[node])
+#define cpumask_of_node(node) (&node_to_cpu_mask[node])
 
 /*
  * Returns the number of the node containing Node 'nid'.
@@ -45,7 +46,7 @@
 /*
  * Returns the number of the first CPU on Node 'node'.
  */
-#define node_to_first_cpu(node) (first_cpu(node_to_cpumask(node)))
+#define node_to_first_cpu(node) (cpumask_first(cpumask_of_node(node)))
 
 /*
  * Determines the node for a given pci bus
@@ -109,6 +110,8 @@ void build_cpu_to_node_map(void);
 #define topology_core_id(cpu)			(cpu_data(cpu)->core_id)
 #define topology_core_siblings(cpu)		(cpu_core_map[cpu])
 #define topology_thread_siblings(cpu)		(per_cpu(cpu_sibling_map, cpu))
+#define topology_core_cpumask(cpu)		(&cpu_core_map[cpu])
+#define topology_thread_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
 #define smt_capable() 				(smp_num_siblings > 1)
 #endif
 
@@ -119,6 +122,10 @@ extern void arch_fix_phys_package_id(int num, u32 slot);
 					node_to_cpumask(pcibus_to_node(bus)) \
 				)
 
+#define cpumask_of_pcibus(bus)	(pcibus_to_node(bus) == -1 ?		\
+				 cpu_all_mask : /* node is -1: use all CPUs */	\
+				 cpumask_of_node(pcibus_to_node(bus)))
+
 #include <asm-generic/topology.h>
 
 #endif /* _ASM_IA64_TOPOLOGY_H */
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index bd7acc71e8a9..0553648b7595 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -202,7 +202,6 @@ char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size)
                             Boot-time Table Parsing
    -------------------------------------------------------------------------- */
 
-static int total_cpus __initdata;
 static int available_cpus __initdata;
 struct acpi_table_madt *acpi_madt __initdata;
 static u8 has_8259;
@@ -1001,7 +1000,7 @@ acpi_map_iosapic(acpi_handle handle, u32 depth, void *context, void **ret)
 	node = pxm_to_node(pxm);
 
 	if (node >= MAX_NUMNODES || !node_online(node) ||
-	    cpus_empty(node_to_cpumask(node)))
+	    cpumask_empty(cpumask_of_node(node)))
 		return AE_OK;
 
 	/* We know a gsi to node mapping! */
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
index c8adecd5b416..5cfd3d91001a 100644
--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -695,32 +695,31 @@ get_target_cpu (unsigned int gsi, int irq)
 #ifdef CONFIG_NUMA
 	{
 		int num_cpus, cpu_index, iosapic_index, numa_cpu, i = 0;
-		cpumask_t cpu_mask;
+		const struct cpumask *cpu_mask;
 
 		iosapic_index = find_iosapic(gsi);
 		if (iosapic_index < 0 ||
 		    iosapic_lists[iosapic_index].node == MAX_NUMNODES)
 			goto skip_numa_setup;
 
-		cpu_mask = node_to_cpumask(iosapic_lists[iosapic_index].node);
-		cpus_and(cpu_mask, cpu_mask, domain);
-		for_each_cpu_mask(numa_cpu, cpu_mask) {
-			if (!cpu_online(numa_cpu))
-				cpu_clear(numa_cpu, cpu_mask);
+		cpu_mask = cpumask_of_node(iosapic_lists[iosapic_index].node);
+		num_cpus = 0;
+		for_each_cpu_and(numa_cpu, cpu_mask, &domain) {
+			if (cpu_online(numa_cpu))
+				num_cpus++;
 		}
 
-		num_cpus = cpus_weight(cpu_mask);
-
 		if (!num_cpus)
 			goto skip_numa_setup;
 
 		/* Use irq assignment to distribute across cpus in node */
 		cpu_index = irq % num_cpus;
 
-		for (numa_cpu = first_cpu(cpu_mask) ; i < cpu_index ; i++)
-			numa_cpu = next_cpu(numa_cpu, cpu_mask);
+		for_each_cpu_and(numa_cpu, cpu_mask, &domain)
+			if (cpu_online(numa_cpu) && i++ >= cpu_index)
+				break;
 
-		if (numa_cpu != NR_CPUS)
+		if (numa_cpu < nr_cpu_ids)
 			return cpu_physical_id(numa_cpu);
 	}
 skip_numa_setup:
@@ -731,7 +730,7 @@ skip_numa_setup:
 	 * case of NUMA.)
 	 */
 	do {
-		if (++cpu >= NR_CPUS)
+		if (++cpu >= nr_cpu_ids)
 			cpu = 0;
 	} while (!cpu_online(cpu) || !cpu_isset(cpu, domain));
 
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c
index 0b6db53fedcf..95ff16cb05d8 100644
--- a/arch/ia64/kernel/irq.c
+++ b/arch/ia64/kernel/irq.c
@@ -112,11 +112,11 @@ void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
 	}
 }
 
-bool is_affinity_mask_valid(cpumask_t cpumask)
+bool is_affinity_mask_valid(cpumask_var_t cpumask)
 {
 	if (ia64_platform_is("sn2")) {
 		/* Only allow one CPU to be specified in the smp_affinity mask */
-		if (cpus_weight(cpumask) != 1)
+		if (cpumask_weight(cpumask) != 1)
 			return false;
 	}
 	return true;
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 65c10a42c88f..f0ebb342409d 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -93,13 +93,14 @@ void ia64_account_on_switch(struct task_struct *prev, struct task_struct *next)
 	now = ia64_get_itc();
 
 	delta_stime = cycle_to_cputime(pi->ac_stime + (now - pi->ac_stamp));
-	account_system_time(prev, 0, delta_stime);
-	account_system_time_scaled(prev, delta_stime);
+	if (idle_task(smp_processor_id()) != prev)
+		account_system_time(prev, 0, delta_stime, delta_stime);
+	else
+		account_idle_time(delta_stime);
 
 	if (pi->ac_utime) {
 		delta_utime = cycle_to_cputime(pi->ac_utime);
-		account_user_time(prev, delta_utime);
-		account_user_time_scaled(prev, delta_utime);
+		account_user_time(prev, delta_utime, delta_utime);
 	}
 
 	pi->ac_stamp = ni->ac_stamp = now;
@@ -122,8 +123,10 @@ void account_system_vtime(struct task_struct *tsk)
 	now = ia64_get_itc();
 
 	delta_stime = cycle_to_cputime(ti->ac_stime + (now - ti->ac_stamp));
-	account_system_time(tsk, 0, delta_stime);
-	account_system_time_scaled(tsk, delta_stime);
+	if (irq_count() || idle_task(smp_processor_id()) != tsk)
+		account_system_time(tsk, 0, delta_stime, delta_stime);
+	else
+		account_idle_time(delta_stime);
 	ti->ac_stime = 0;
 
 	ti->ac_stamp = now;
@@ -143,8 +146,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
 
 	if (ti->ac_utime) {
 		delta_utime = cycle_to_cputime(ti->ac_utime);
-		account_user_time(p, delta_utime);
-		account_user_time_scaled(p, delta_utime);
+		account_user_time(p, delta_utime, delta_utime);
 		ti->ac_utime = 0;
 	}
 }
diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile
index 76464dc312e6..0bb99b732908 100644
--- a/arch/ia64/kvm/Makefile
+++ b/arch/ia64/kvm/Makefile
@@ -51,8 +51,8 @@ EXTRA_AFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/
 common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 		coalesced_mmio.o irq_comm.o)
 
-ifeq ($(CONFIG_DMAR),y)
-common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
+ifeq ($(CONFIG_IOMMU_API),y)
+common-objs += $(addprefix ../../../virt/kvm/, iommu.o)
 endif
 
 kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 0f5ebd948437..4e586f6110aa 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -31,6 +31,7 @@
 #include <linux/bitops.h>
 #include <linux/hrtimer.h>
 #include <linux/uaccess.h>
+#include <linux/iommu.h>
 #include <linux/intel-iommu.h>
 
 #include <asm/pgtable.h>
@@ -188,7 +189,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
 		break;
 	case KVM_CAP_IOMMU:
-		r = intel_iommu_found();
+		r = iommu_found();
 		break;
 	default:
 		r = 0;
diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
index 636588e7e068..be339477f906 100644
--- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c
+++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
@@ -385,7 +385,6 @@ static int sn_topology_show(struct seq_file *s, void *d)
 	int j;
 	const char *slabname;
 	int ordinal;
-	cpumask_t cpumask;
 	char slice;
 	struct cpuinfo_ia64 *c;
 	struct sn_hwperf_port_info *ptdata;
@@ -473,23 +472,21 @@ static int sn_topology_show(struct seq_file *s, void *d)
 		 * CPUs on this node, if any
 		 */
 		if (!SN_HWPERF_IS_IONODE(obj)) {
-			cpumask = node_to_cpumask(ordinal);
-			for_each_online_cpu(i) {
-				if (cpu_isset(i, cpumask)) {
-					slice = 'a' + cpuid_to_slice(i);
-					c = cpu_data(i);
-					seq_printf(s, "cpu %d %s%c local"
-						" freq %luMHz, arch ia64",
-						i, obj->location, slice,
-						c->proc_freq / 1000000);
-					for_each_online_cpu(j) {
-						seq_printf(s, j ? ":%d" : ", dist %d",
-							node_distance(
+			for_each_cpu_and(i, cpu_online_mask,
+					 cpumask_of_node(ordinal)) {
+				slice = 'a' + cpuid_to_slice(i);
+				c = cpu_data(i);
+				seq_printf(s, "cpu %d %s%c local"
+					   " freq %luMHz, arch ia64",
+					   i, obj->location, slice,
+					   c->proc_freq / 1000000);
+				for_each_online_cpu(j) {
+					seq_printf(s, j ? ":%d" : ", dist %d",
+						   node_distance(
 						    	cpu_to_node(i),
 						    	cpu_to_node(j)));
-					}
-					seq_putc(s, '\n');
 				}
+				seq_putc(s, '\n');
 			}
 		}
 	}
diff --git a/arch/m32r/kernel/smpboot.c b/arch/m32r/kernel/smpboot.c
index 0f06b3722e96..2547d6c4a827 100644
--- a/arch/m32r/kernel/smpboot.c
+++ b/arch/m32r/kernel/smpboot.c
@@ -592,7 +592,7 @@ int setup_profiling_timer(unsigned int multiplier)
 	 * accounting. At that time they also adjust their APIC timers
 	 * accordingly.
 	 */
-	for (i = 0; i < NR_CPUS; ++i)
+	for_each_possible_cpu(i)
 		per_cpu(prof_multiplier, i) = multiplier;
 
 	return 0;
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 836fb66f080d..c825bde17cb3 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -280,7 +280,6 @@ config M68060
 
 config MMU_MOTOROLA
 	bool
-	depends on MMU && !MMU_SUN3
 
 config MMU_SUN3
 	bool
diff --git a/arch/m68knommu/include/asm/bitops.h b/arch/m68knommu/include/asm/bitops.h
index 6f3685eab44c..9d3cbe5fad1e 100644
--- a/arch/m68knommu/include/asm/bitops.h
+++ b/arch/m68knommu/include/asm/bitops.h
@@ -331,6 +331,7 @@ found_middle:
 #endif /* __KERNEL__ */
 
 #include <asm-generic/bitops/fls.h>
+#include <asm-generic/bitops/__fls.h>
 #include <asm-generic/bitops/fls64.h>
 
 #endif /* _M68KNOMMU_BITOPS_H */
diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h
index 1fb959f98982..55d481569a1f 100644
--- a/arch/mips/include/asm/mach-ip27/topology.h
+++ b/arch/mips/include/asm/mach-ip27/topology.h
@@ -25,11 +25,13 @@ extern struct cpuinfo_ip27 sn_cpu_info[NR_CPUS];
 #define cpu_to_node(cpu)	(sn_cpu_info[(cpu)].p_nodeid)
 #define parent_node(node)	(node)
 #define node_to_cpumask(node)	(hub_data(node)->h_cpus)
-#define node_to_first_cpu(node)	(first_cpu(node_to_cpumask(node)))
+#define cpumask_of_node(node)	(&hub_data(node)->h_cpus)
+#define node_to_first_cpu(node)	(cpumask_first(cpumask_of_node(node)))
 struct pci_bus;
 extern int pcibus_to_node(struct pci_bus *);
 
 #define pcibus_to_cpumask(bus)	(cpu_online_map)
+#define cpumask_of_pcibus(bus)	(cpu_online_mask)
 
 extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
 
diff --git a/arch/parisc/include/asm/smp.h b/arch/parisc/include/asm/smp.h
index 409e698f4361..6ef4b7867b1b 100644
--- a/arch/parisc/include/asm/smp.h
+++ b/arch/parisc/include/asm/smp.h
@@ -16,8 +16,6 @@
 #include <linux/cpumask.h>
 typedef unsigned long address_t;
 
-extern cpumask_t cpu_online_map;
-
 
 /*
  *	Private routines/data
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 373fca394a54..375258559ae6 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -22,11 +22,11 @@ static inline cpumask_t node_to_cpumask(int node)
 	return numa_cpumask_lookup_table[node];
 }
 
+#define cpumask_of_node(node) (&numa_cpumask_lookup_table[node])
+
 static inline int node_to_first_cpu(int node)
 {
-	cpumask_t tmp;
-	tmp = node_to_cpumask(node);
-	return first_cpu(tmp);
+	return cpumask_first(cpumask_of_node(node));
 }
 
 int of_node_to_nid(struct device_node *device);
@@ -46,6 +46,10 @@ static inline int pcibus_to_node(struct pci_bus *bus)
 					node_to_cpumask(pcibus_to_node(bus)) \
 				)
 
+#define cpumask_of_pcibus(bus)	(pcibus_to_node(bus) == -1 ?		\
+				 cpu_all_mask :				\
+				 cpumask_of_node(pcibus_to_node(bus)))
+
 /* sched_domains SD_NODE_INIT for PPC64 machines */
 #define SD_NODE_INIT (struct sched_domain) {		\
 	.parent			= NULL,			\
@@ -108,6 +112,8 @@ static inline void sysfs_remove_device_from_node(struct sys_device *dev,
 
 #define topology_thread_siblings(cpu)	(per_cpu(cpu_sibling_map, cpu))
 #define topology_core_siblings(cpu)	(per_cpu(cpu_core_map, cpu))
+#define topology_thread_cpumask(cpu)	(&per_cpu(cpu_sibling_map, cpu))
+#define topology_core_cpumask(cpu)	(&per_cpu(cpu_core_map, cpu))
 #define topology_core_id(cpu)		(cpu_to_core_id(cpu))
 #endif
 #endif
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 51b201ddf9a1..fb7049c054c0 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -33,6 +33,7 @@
 #include <linux/mqueue.h>
 #include <linux/hardirq.h>
 #include <linux/utsname.h>
+#include <linux/kernel_stat.h>
 
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 99f1ddd68582..c9564031a2a9 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -256,8 +256,10 @@ void account_system_vtime(struct task_struct *tsk)
 		delta += sys_time;
 		get_paca()->system_time = 0;
 	}
-	account_system_time(tsk, 0, delta);
-	account_system_time_scaled(tsk, deltascaled);
+	if (in_irq() || idle_task(smp_processor_id()) != tsk)
+		account_system_time(tsk, 0, delta, deltascaled);
+	else
+		account_idle_time(delta);
 	per_cpu(cputime_last_delta, smp_processor_id()) = delta;
 	per_cpu(cputime_scaled_last_delta, smp_processor_id()) = deltascaled;
 	local_irq_restore(flags);
@@ -275,10 +277,8 @@ void account_process_tick(struct task_struct *tsk, int user_tick)
 
 	utime = get_paca()->user_time;
 	get_paca()->user_time = 0;
-	account_user_time(tsk, utime);
-
 	utimescaled = cputime_to_scaled(utime);
-	account_user_time_scaled(tsk, utimescaled);
+	account_user_time(tsk, utime, utimescaled);
 }
 
 /*
@@ -338,8 +338,12 @@ void calculate_steal_time(void)
 	tb = mftb();
 	purr = mfspr(SPRN_PURR);
 	stolen = (tb - pme->tb) - (purr - pme->purr);
-	if (stolen > 0)
-		account_steal_time(current, stolen);
+	if (stolen > 0) {
+		if (idle_task(smp_processor_id()) != current)
+			account_steal_time(stolen);
+		else
+			account_idle_time(stolen);
+	}
 	pme->tb = tb;
 	pme->purr = purr;
 }
diff --git a/arch/powerpc/platforms/cell/spu_priv1_mmio.c b/arch/powerpc/platforms/cell/spu_priv1_mmio.c
index 906a0a2a9fe1..1410443731eb 100644
--- a/arch/powerpc/platforms/cell/spu_priv1_mmio.c
+++ b/arch/powerpc/platforms/cell/spu_priv1_mmio.c
@@ -80,10 +80,10 @@ static void cpu_affinity_set(struct spu *spu, int cpu)
 	u64 route;
 
 	if (nr_cpus_node(spu->node)) {
-		cpumask_t spumask = node_to_cpumask(spu->node);
-		cpumask_t cpumask = node_to_cpumask(cpu_to_node(cpu));
+		const struct cpumask *spumask = cpumask_of_node(spu->node),
+			*cpumask = cpumask_of_node(cpu_to_node(cpu));
 
-		if (!cpus_intersects(spumask, cpumask))
+		if (!cpumask_intersects(spumask, cpumask))
 			return;
 	}
 
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 2ad914c47493..6a0ad196aeb3 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -166,9 +166,9 @@ void spu_update_sched_info(struct spu_context *ctx)
 static int __node_allowed(struct spu_context *ctx, int node)
 {
 	if (nr_cpus_node(node)) {
-		cpumask_t mask = node_to_cpumask(node);
+		const struct cpumask *mask = cpumask_of_node(node);
 
-		if (cpus_intersects(mask, ctx->cpus_allowed))
+		if (cpumask_intersects(mask, &ctx->cpus_allowed))
 			return 1;
 	}
 
diff --git a/arch/s390/include/asm/cpu.h b/arch/s390/include/asm/cpu.h
index e5a6a9ba3adf..d60a2eefb17b 100644
--- a/arch/s390/include/asm/cpu.h
+++ b/arch/s390/include/asm/cpu.h
@@ -14,7 +14,6 @@
 
 struct s390_idle_data {
 	spinlock_t lock;
-	unsigned int in_idle;
 	unsigned long long idle_count;
 	unsigned long long idle_enter;
 	unsigned long long idle_time;
@@ -22,12 +21,12 @@ struct s390_idle_data {
 
 DECLARE_PER_CPU(struct s390_idle_data, s390_idle);
 
-void s390_idle_leave(void);
+void vtime_start_cpu(void);
 
 static inline void s390_idle_check(void)
 {
-	if ((&__get_cpu_var(s390_idle))->in_idle)
-		s390_idle_leave();
+	if ((&__get_cpu_var(s390_idle))->idle_enter != 0ULL)
+		vtime_start_cpu();
 }
 
 #endif /* _ASM_S390_CPU_H_ */
diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h
index 133ce054fc89..521726430afa 100644
--- a/arch/s390/include/asm/cputime.h
+++ b/arch/s390/include/asm/cputime.h
@@ -11,7 +11,7 @@
 
 #include <asm/div64.h>
 
-/* We want to use micro-second resolution. */
+/* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */
 
 typedef unsigned long long cputime_t;
 typedef unsigned long long cputime64_t;
@@ -53,9 +53,9 @@ __div(unsigned long long n, unsigned int base)
 #define cputime_ge(__a, __b)		((__a) >= (__b))
 #define cputime_lt(__a, __b)		((__a) <  (__b))
 #define cputime_le(__a, __b)		((__a) <= (__b))
-#define cputime_to_jiffies(__ct)	(__div((__ct), 1000000 / HZ))
+#define cputime_to_jiffies(__ct)	(__div((__ct), 4096000000ULL / HZ))
 #define cputime_to_scaled(__ct)		(__ct)
-#define jiffies_to_cputime(__hz)	((cputime_t)(__hz) * (1000000 / HZ))
+#define jiffies_to_cputime(__hz)	((cputime_t)(__hz) * (4096000000ULL / HZ))
 
 #define cputime64_zero			(0ULL)
 #define cputime64_add(__a, __b)		((__a) + (__b))
@@ -64,7 +64,7 @@ __div(unsigned long long n, unsigned int base)
 static inline u64
 cputime64_to_jiffies64(cputime64_t cputime)
 {
-	do_div(cputime, 1000000 / HZ);
+	do_div(cputime, 4096000000ULL / HZ);
 	return cputime;
 }
 
@@ -74,13 +74,13 @@ cputime64_to_jiffies64(cputime64_t cputime)
 static inline unsigned int
 cputime_to_msecs(const cputime_t cputime)
 {
-	return __div(cputime, 1000);
+	return __div(cputime, 4096000);
 }
 
 static inline cputime_t
 msecs_to_cputime(const unsigned int m)
 {
-	return (cputime_t) m * 1000;
+	return (cputime_t) m * 4096000;
 }
 
 /*
@@ -89,13 +89,13 @@ msecs_to_cputime(const unsigned int m)
 static inline unsigned int
 cputime_to_secs(const cputime_t cputime)
 {
-	return __div(cputime, 1000000);
+	return __div(cputime, 2048000000) >> 1;
 }
 
 static inline cputime_t
 secs_to_cputime(const unsigned int s)
 {
-	return (cputime_t) s * 1000000;
+	return (cputime_t) s * 4096000000ULL;
 }
 
 /*
@@ -104,7 +104,7 @@ secs_to_cputime(const unsigned int s)
 static inline cputime_t
 timespec_to_cputime(const struct timespec *value)
 {
-        return value->tv_nsec / 1000 + (u64) value->tv_sec * 1000000;
+	return value->tv_nsec * 4096 / 1000 + (u64) value->tv_sec * 4096000000ULL;
 }
 
 static inline void
@@ -114,12 +114,12 @@ cputime_to_timespec(const cputime_t cputime, struct timespec *value)
 	register_pair rp;
 
 	rp.pair = cputime >> 1;
-	asm ("dr %0,%1" : "+d" (rp) : "d" (1000000 >> 1));
-	value->tv_nsec = rp.subreg.even * 1000;
+	asm ("dr %0,%1" : "+d" (rp) : "d" (2048000000UL));
+	value->tv_nsec = rp.subreg.even * 1000 / 4096;
 	value->tv_sec = rp.subreg.odd;
 #else
-	value->tv_nsec = (cputime % 1000000) * 1000;
-	value->tv_sec = cputime / 1000000;
+	value->tv_nsec = (cputime % 4096000000ULL) * 1000 / 4096;
+	value->tv_sec = cputime / 4096000000ULL;
 #endif
 }
 
@@ -131,7 +131,7 @@ cputime_to_timespec(const cputime_t cputime, struct timespec *value)
 static inline cputime_t
 timeval_to_cputime(const struct timeval *value)
 {
-        return value->tv_usec + (u64) value->tv_sec * 1000000;
+	return value->tv_usec * 4096 + (u64) value->tv_sec * 4096000000ULL;
 }
 
 static inline void
@@ -141,12 +141,12 @@ cputime_to_timeval(const cputime_t cputime, struct timeval *value)
 	register_pair rp;
 
 	rp.pair = cputime >> 1;
-	asm ("dr %0,%1" : "+d" (rp) : "d" (1000000 >> 1));
-	value->tv_usec = rp.subreg.even;
+	asm ("dr %0,%1" : "+d" (rp) : "d" (2048000000UL));
+	value->tv_usec = rp.subreg.even / 4096;
 	value->tv_sec = rp.subreg.odd;
 #else
-	value->tv_usec = cputime % 1000000;
-	value->tv_sec = cputime / 1000000;
+	value->tv_usec = (cputime % 4096000000ULL) / 4096; /* 2**-12 us -> us */
+	value->tv_sec = cputime / 4096000000ULL;
 #endif
 }
 
@@ -156,13 +156,13 @@ cputime_to_timeval(const cputime_t cputime, struct timeval *value)
 static inline clock_t
 cputime_to_clock_t(cputime_t cputime)
 {
-	return __div(cputime, 1000000 / USER_HZ);
+	return __div(cputime, 4096000000ULL / USER_HZ);
 }
 
 static inline cputime_t
 clock_t_to_cputime(unsigned long x)
 {
-	return (cputime_t) x * (1000000 / USER_HZ);
+	return (cputime_t) x * (4096000000ULL / USER_HZ);
 }
 
 /*
@@ -171,7 +171,7 @@ clock_t_to_cputime(unsigned long x)
 static inline clock_t
 cputime64_to_clock_t(cputime64_t cputime)
 {
-       return __div(cputime, 1000000 / USER_HZ);
+       return __div(cputime, 4096000000ULL / USER_HZ);
 }
 
 #endif /* _S390_CPUTIME_H */
diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index 0bc51d52a899..ffdef5fe8587 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -67,11 +67,11 @@
 #define __LC_SYNC_ENTER_TIMER		0x248
 #define __LC_ASYNC_ENTER_TIMER		0x250
 #define __LC_EXIT_TIMER			0x258
-#define __LC_LAST_UPDATE_TIMER		0x260
-#define __LC_USER_TIMER			0x268
-#define __LC_SYSTEM_TIMER		0x270
-#define __LC_LAST_UPDATE_CLOCK		0x278
-#define __LC_STEAL_CLOCK		0x280
+#define __LC_USER_TIMER			0x260
+#define __LC_SYSTEM_TIMER		0x268
+#define __LC_STEAL_TIMER		0x270
+#define __LC_LAST_UPDATE_TIMER		0x278
+#define __LC_LAST_UPDATE_CLOCK		0x280
 #define __LC_RETURN_MCCK_PSW            0x288
 #define __LC_KERNEL_STACK               0xC40
 #define __LC_THREAD_INFO		0xC44
@@ -89,11 +89,11 @@
 #define __LC_SYNC_ENTER_TIMER		0x250
 #define __LC_ASYNC_ENTER_TIMER		0x258
 #define __LC_EXIT_TIMER			0x260
-#define __LC_LAST_UPDATE_TIMER		0x268
-#define __LC_USER_TIMER			0x270
-#define __LC_SYSTEM_TIMER		0x278
-#define __LC_LAST_UPDATE_CLOCK		0x280
-#define __LC_STEAL_CLOCK		0x288
+#define __LC_USER_TIMER			0x268
+#define __LC_SYSTEM_TIMER		0x270
+#define __LC_STEAL_TIMER		0x278
+#define __LC_LAST_UPDATE_TIMER		0x280
+#define __LC_LAST_UPDATE_CLOCK		0x288
 #define __LC_RETURN_MCCK_PSW            0x290
 #define __LC_KERNEL_STACK               0xD40
 #define __LC_THREAD_INFO		0xD48
@@ -106,8 +106,10 @@
 #define __LC_IPLDEV                     0xDB8
 #define __LC_CURRENT			0xDD8
 #define __LC_INT_CLOCK			0xDE8
+#define __LC_VDSO_PER_CPU		0xE38
 #endif /* __s390x__ */
 
+#define __LC_PASTE			0xE40
 
 #define __LC_PANIC_MAGIC		0xE00
 #ifndef __s390x__
@@ -252,11 +254,11 @@ struct _lowcore
 	__u64        sync_enter_timer;         /* 0x248 */
 	__u64        async_enter_timer;        /* 0x250 */
 	__u64        exit_timer;               /* 0x258 */
-	__u64        last_update_timer;        /* 0x260 */
-	__u64        user_timer;               /* 0x268 */
-	__u64        system_timer;             /* 0x270 */
-	__u64        last_update_clock;        /* 0x278 */
-	__u64        steal_clock;              /* 0x280 */
+	__u64	     user_timer;	       /* 0x260 */
+	__u64	     system_timer;	       /* 0x268 */
+	__u64	     steal_timer;	       /* 0x270 */
+	__u64	     last_update_timer;        /* 0x278 */
+	__u64	     last_update_clock;        /* 0x280 */
         psw_t        return_mcck_psw;          /* 0x288 */
 	__u8         pad8[0xc00-0x290];        /* 0x290 */
 
@@ -343,11 +345,11 @@ struct _lowcore
 	__u64        sync_enter_timer;         /* 0x250 */
 	__u64        async_enter_timer;        /* 0x258 */
 	__u64        exit_timer;               /* 0x260 */
-	__u64        last_update_timer;        /* 0x268 */
-	__u64        user_timer;               /* 0x270 */
-	__u64        system_timer;             /* 0x278 */
-	__u64        last_update_clock;        /* 0x280 */
-	__u64        steal_clock;              /* 0x288 */
+	__u64	     user_timer;	       /* 0x268 */
+	__u64	     system_timer;	       /* 0x270 */
+	__u64	     steal_timer;	       /* 0x278 */
+	__u64	     last_update_timer;        /* 0x280 */
+	__u64	     last_update_clock;        /* 0x288 */
         psw_t        return_mcck_psw;          /* 0x290 */
         __u8         pad8[0xc00-0x2a0];        /* 0x2a0 */
         /* System info area */
@@ -381,7 +383,12 @@ struct _lowcore
         /* whether the kernel died with panic() or not */
         __u32        panic_magic;              /* 0xe00 */
 
-	__u8         pad13[0x11b8-0xe04];      /* 0xe04 */
+	/* Per cpu vdso data pointer and primary space access list */
+	__u8	     pad_0xe04[0xe38-0xe04];   /* 0xe04 */
+	__u64	     vdso_per_cpu_data;	       /* 0xe38, __LC_VDSO_PER_CPU */
+	__u32	     paste[16];		       /* 0xe40, __LC_PASTE */
+
+	__u8	     pad13[0x11b8-0xe80];      /* 0xe80 */
 
 	/* 64 bit extparam used for pfault, diag 250 etc  */
 	__u64        ext_params2;               /* 0x11B8 */
diff --git a/arch/s390/include/asm/system.h b/arch/s390/include/asm/system.h
index 024ef42ed6d7..3a8b26eb1f2e 100644
--- a/arch/s390/include/asm/system.h
+++ b/arch/s390/include/asm/system.h
@@ -99,7 +99,7 @@ static inline void restore_access_regs(unsigned int *acrs)
 	prev = __switch_to(prev,next);					     \
 } while (0)
 
-extern void account_vtime(struct task_struct *);
+extern void account_vtime(struct task_struct *, struct task_struct *);
 extern void account_tick_vtime(struct task_struct *);
 extern void account_system_vtime(struct task_struct *);
 
@@ -121,7 +121,7 @@ static inline void cmma_init(void) { }
 
 #define finish_arch_switch(prev) do {					     \
 	set_fs(current->thread.mm_segment);				     \
-	account_vtime(prev);						     \
+	account_vtime(prev, current);					     \
 } while (0)
 
 #define nop() asm volatile("nop")
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index c1eaf9604da7..c544aa524535 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -47,6 +47,8 @@ struct thread_info {
 	unsigned int		cpu;		/* current CPU */
 	int			preempt_count;	/* 0 => preemptable, <0 => BUG */
 	struct restart_block	restart_block;
+	__u64			user_timer;
+	__u64			system_timer;
 };
 
 /*
diff --git a/arch/s390/include/asm/timer.h b/arch/s390/include/asm/timer.h
index 61705d60f995..e4bcab739c19 100644
--- a/arch/s390/include/asm/timer.h
+++ b/arch/s390/include/asm/timer.h
@@ -23,20 +23,18 @@ struct vtimer_list {
 	__u64 expires;
 	__u64 interval;
 
-	spinlock_t lock;
-	unsigned long magic;
-
 	void (*function)(unsigned long);
 	unsigned long data;
 };
 
-/* the offset value will wrap after ca. 71 years */
+/* the vtimer value will wrap after ca. 71 years */
 struct vtimer_queue {
 	struct list_head list;
 	spinlock_t lock;
-	__u64 to_expire;	  /* current event expire time */
-	__u64 offset;		  /* list offset to zero */
-	__u64 idle;		  /* temp var for idle */
+	__u64 timer;		/* last programmed timer */
+	__u64 elapsed;		/* elapsed time of timer expire values */
+	__u64 idle;		/* temp var for idle */
+	int do_spt;		/* =1: reprogram cpu timer in idle */
 };
 
 extern void init_virt_timer(struct vtimer_list *timer);
@@ -48,8 +46,8 @@ extern int del_virt_timer(struct vtimer_list *timer);
 extern void init_cpu_vtimer(void);
 extern void vtime_init(void);
 
-extern void vtime_start_cpu_timer(void);
-extern void vtime_stop_cpu_timer(void);
+extern void vtime_stop_cpu(void);
+extern void vtime_start_leave(void);
 
 #endif /* __KERNEL__ */
 
diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h
index d96c91643458..c93eb50e1d09 100644
--- a/arch/s390/include/asm/topology.h
+++ b/arch/s390/include/asm/topology.h
@@ -6,10 +6,12 @@
 #define mc_capable()	(1)
 
 cpumask_t cpu_coregroup_map(unsigned int cpu);
+const struct cpumask *cpu_coregroup_mask(unsigned int cpu);
 
 extern cpumask_t cpu_core_map[NR_CPUS];
 
 #define topology_core_siblings(cpu)	(cpu_core_map[cpu])
+#define topology_core_cpumask(cpu)	(&cpu_core_map[cpu])
 
 int topology_set_cpu_management(int fc);
 void topology_schedule_update(void);
diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h
index a44f4fe16a35..7bdd7c8ebc91 100644
--- a/arch/s390/include/asm/vdso.h
+++ b/arch/s390/include/asm/vdso.h
@@ -12,9 +12,9 @@
 #ifndef __ASSEMBLY__
 
 /*
- * Note about this structure:
+ * Note about the vdso_data and vdso_per_cpu_data structures:
  *
- * NEVER USE THIS IN USERSPACE CODE DIRECTLY. The layout of this
+ * NEVER USE THEM IN USERSPACE CODE DIRECTLY. The layout of the
  * structure is supposed to be known only to the function in the vdso
  * itself and may change without notice.
  */
@@ -28,10 +28,21 @@ struct vdso_data {
 	__u64 wtom_clock_nsec;		/*				0x28 */
 	__u32 tz_minuteswest;		/* Minutes west of Greenwich	0x30 */
 	__u32 tz_dsttime;		/* Type of dst correction	0x34 */
+	__u32 ectg_available;
+};
+
+struct vdso_per_cpu_data {
+	__u64 ectg_timer_base;
+	__u64 ectg_user_time;
 };
 
 extern struct vdso_data *vdso_data;
 
+#ifdef CONFIG_64BIT
+int vdso_alloc_per_cpu(int cpu, struct _lowcore *lowcore);
+void vdso_free_per_cpu(int cpu, struct _lowcore *lowcore);
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index e641f60bac99..67a60016babb 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -48,6 +48,11 @@ int main(void)
 	DEFINE(__VDSO_WTOM_SEC, offsetof(struct vdso_data, wtom_clock_sec));
 	DEFINE(__VDSO_WTOM_NSEC, offsetof(struct vdso_data, wtom_clock_nsec));
 	DEFINE(__VDSO_TIMEZONE, offsetof(struct vdso_data, tz_minuteswest));
+	DEFINE(__VDSO_ECTG_OK, offsetof(struct vdso_data, ectg_available));
+	DEFINE(__VDSO_ECTG_BASE,
+	       offsetof(struct vdso_per_cpu_data, ectg_timer_base));
+	DEFINE(__VDSO_ECTG_USER,
+	       offsetof(struct vdso_per_cpu_data, ectg_user_time));
 	/* constants used by the vdso */
 	DEFINE(CLOCK_REALTIME, CLOCK_REALTIME);
 	DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC);
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 55de521aef77..1268aa2991bf 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -583,8 +583,8 @@ kernel_per:
 
 	.globl io_int_handler
 io_int_handler:
-	stpt	__LC_ASYNC_ENTER_TIMER
 	stck	__LC_INT_CLOCK
+	stpt	__LC_ASYNC_ENTER_TIMER
 	SAVE_ALL_BASE __LC_SAVE_AREA+16
 	SAVE_ALL_ASYNC __LC_IO_OLD_PSW,__LC_SAVE_AREA+16
 	CREATE_STACK_FRAME __LC_IO_OLD_PSW,__LC_SAVE_AREA+16
@@ -723,8 +723,8 @@ io_notify_resume:
 
 	.globl	ext_int_handler
 ext_int_handler:
-	stpt	__LC_ASYNC_ENTER_TIMER
 	stck	__LC_INT_CLOCK
+	stpt	__LC_ASYNC_ENTER_TIMER
 	SAVE_ALL_BASE __LC_SAVE_AREA+16
 	SAVE_ALL_ASYNC __LC_EXT_OLD_PSW,__LC_SAVE_AREA+16
 	CREATE_STACK_FRAME __LC_EXT_OLD_PSW,__LC_SAVE_AREA+16
@@ -750,6 +750,7 @@ __critical_end:
 
 	.globl mcck_int_handler
 mcck_int_handler:
+	stck	__LC_INT_CLOCK
 	spt	__LC_CPU_TIMER_SAVE_AREA	# revalidate cpu timer
 	lm	%r0,%r15,__LC_GPREGS_SAVE_AREA	# revalidate gprs
 	SAVE_ALL_BASE __LC_SAVE_AREA+32
diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S
index 16bb4fd1a403..c6fbde13971a 100644
--- a/arch/s390/kernel/entry64.S
+++ b/arch/s390/kernel/entry64.S
@@ -177,8 +177,11 @@ _TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \
 	.if !\sync
 	ni	\psworg+1,0xfd		# clear wait state bit
 	.endif
-	lmg	%r0,%r15,SP_R0(%r15)	# load gprs 0-15 of user
+	lg	%r14,__LC_VDSO_PER_CPU
+	lmg	%r0,%r13,SP_R0(%r15)	# load gprs 0-13 of user
 	stpt	__LC_EXIT_TIMER
+	mvc	__VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
+	lmg	%r14,%r15,SP_R14(%r15)	# load gprs 14-15 of user
 	lpswe	\psworg			# back to caller
 	.endm
 
@@ -559,8 +562,8 @@ kernel_per:
  */
 	.globl io_int_handler
 io_int_handler:
-	stpt	__LC_ASYNC_ENTER_TIMER
 	stck	__LC_INT_CLOCK
+	stpt	__LC_ASYNC_ENTER_TIMER
 	SAVE_ALL_BASE __LC_SAVE_AREA+32
 	SAVE_ALL_ASYNC __LC_IO_OLD_PSW,__LC_SAVE_AREA+32
 	CREATE_STACK_FRAME __LC_IO_OLD_PSW,__LC_SAVE_AREA+32
@@ -721,8 +724,8 @@ io_notify_resume:
  */
 	.globl	ext_int_handler
 ext_int_handler:
-	stpt	__LC_ASYNC_ENTER_TIMER
 	stck	__LC_INT_CLOCK
+	stpt	__LC_ASYNC_ENTER_TIMER
 	SAVE_ALL_BASE __LC_SAVE_AREA+32
 	SAVE_ALL_ASYNC __LC_EXT_OLD_PSW,__LC_SAVE_AREA+32
 	CREATE_STACK_FRAME __LC_EXT_OLD_PSW,__LC_SAVE_AREA+32
@@ -746,6 +749,7 @@ __critical_end:
  */
 	.globl mcck_int_handler
 mcck_int_handler:
+	stck	__LC_INT_CLOCK
 	la	%r1,4095		# revalidate r1
 	spt	__LC_CPU_TIMER_SAVE_AREA-4095(%r1)	# revalidate cpu timer
 	lmg	%r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# revalidate gprs
@@ -979,23 +983,23 @@ cleanup_sysc_return:
 
 cleanup_sysc_leave:
 	clc	8(8,%r12),BASED(cleanup_sysc_leave_insn)
-	je	2f
-	mvc	__LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER
+	je	3f
 	clc	8(8,%r12),BASED(cleanup_sysc_leave_insn+8)
-	je	2f
-	mvc	__LC_RETURN_PSW(16),SP_PSW(%r15)
+	jhe	0f
+	mvc	__LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER
+0:	mvc	__LC_RETURN_PSW(16),SP_PSW(%r15)
 	cghi	%r12,__LC_MCK_OLD_PSW
-	jne	0f
+	jne	1f
 	mvc	__LC_SAVE_AREA+64(32),SP_R12(%r15)
-	j	1f
-0:	mvc	__LC_SAVE_AREA+32(32),SP_R12(%r15)
-1:	lmg	%r0,%r11,SP_R0(%r15)
+	j	2f
+1:	mvc	__LC_SAVE_AREA+32(32),SP_R12(%r15)
+2:	lmg	%r0,%r11,SP_R0(%r15)
 	lg	%r15,SP_R15(%r15)
-2:	la	%r12,__LC_RETURN_PSW
+3:	la	%r12,__LC_RETURN_PSW
 	br	%r14
 cleanup_sysc_leave_insn:
 	.quad	sysc_done - 4
-	.quad	sysc_done - 8
+	.quad	sysc_done - 16
 
 cleanup_io_return:
 	mvc	__LC_RETURN_PSW(8),0(%r12)
@@ -1005,23 +1009,23 @@ cleanup_io_return:
 
 cleanup_io_leave:
 	clc	8(8,%r12),BASED(cleanup_io_leave_insn)
-	je	2f
-	mvc	__LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER
+	je	3f
 	clc	8(8,%r12),BASED(cleanup_io_leave_insn+8)
-	je	2f
-	mvc	__LC_RETURN_PSW(16),SP_PSW(%r15)
+	jhe	0f
+	mvc	__LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER
+0:	mvc	__LC_RETURN_PSW(16),SP_PSW(%r15)
 	cghi	%r12,__LC_MCK_OLD_PSW
-	jne	0f
+	jne	1f
 	mvc	__LC_SAVE_AREA+64(32),SP_R12(%r15)
-	j	1f
-0:	mvc	__LC_SAVE_AREA+32(32),SP_R12(%r15)
-1:	lmg	%r0,%r11,SP_R0(%r15)
+	j	2f
+1:	mvc	__LC_SAVE_AREA+32(32),SP_R12(%r15)
+2:	lmg	%r0,%r11,SP_R0(%r15)
 	lg	%r15,SP_R15(%r15)
-2:	la	%r12,__LC_RETURN_PSW
+3:	la	%r12,__LC_RETURN_PSW
 	br	%r14
 cleanup_io_leave_insn:
 	.quad	io_done - 4
-	.quad	io_done - 8
+	.quad	io_done - 16
 
 /*
  * Integer constants
diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S
index 3ccd36b24b8f..f9f70aa15244 100644
--- a/arch/s390/kernel/head64.S
+++ b/arch/s390/kernel/head64.S
@@ -87,6 +87,8 @@ startup_continue:
 	lg	%r12,.Lparmaddr-.LPG1(%r13)	# pointer to parameter area
 					# move IPL device to lowcore
 	mvc	__LC_IPLDEV(4),IPL_DEVICE+4-PARMAREA(%r12)
+	lghi	%r0,__LC_PASTE
+	stg	%r0,__LC_VDSO_PER_CPU
 #
 # Setup stack
 #
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 04f8c67a6101..b6110bdf8dc2 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -38,6 +38,7 @@
 #include <linux/utsname.h>
 #include <linux/tick.h>
 #include <linux/elfcore.h>
+#include <linux/kernel_stat.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/system.h>
@@ -45,7 +46,6 @@
 #include <asm/processor.h>
 #include <asm/irq.h>
 #include <asm/timer.h>
-#include <asm/cpu.h>
 #include "entry.h"
 
 asmlinkage void ret_from_fork(void) asm ("ret_from_fork");
@@ -75,36 +75,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
 	return sf->gprs[8];
 }
 
-DEFINE_PER_CPU(struct s390_idle_data, s390_idle) = {
-	.lock = __SPIN_LOCK_UNLOCKED(s390_idle.lock)
-};
-
-static int s390_idle_enter(void)
-{
-	struct s390_idle_data *idle;
-
-	idle = &__get_cpu_var(s390_idle);
-	spin_lock(&idle->lock);
-	idle->idle_count++;
-	idle->in_idle = 1;
-	idle->idle_enter = get_clock();
-	spin_unlock(&idle->lock);
-	vtime_stop_cpu_timer();
-	return NOTIFY_OK;
-}
-
-void s390_idle_leave(void)
-{
-	struct s390_idle_data *idle;
-
-	vtime_start_cpu_timer();
-	idle = &__get_cpu_var(s390_idle);
-	spin_lock(&idle->lock);
-	idle->idle_time += get_clock() - idle->idle_enter;
-	idle->in_idle = 0;
-	spin_unlock(&idle->lock);
-}
-
 extern void s390_handle_mcck(void);
 /*
  * The idle loop on a S390...
@@ -117,10 +87,6 @@ static void default_idle(void)
 		local_irq_enable();
 		return;
 	}
-	if (s390_idle_enter() == NOTIFY_BAD) {
-		local_irq_enable();
-		return;
-	}
 #ifdef CONFIG_HOTPLUG_CPU
 	if (cpu_is_offline(smp_processor_id())) {
 		preempt_enable_no_resched();
@@ -130,7 +96,6 @@ static void default_idle(void)
 	local_mcck_disable();
 	if (test_thread_flag(TIF_MCCK_PENDING)) {
 		local_mcck_enable();
-		s390_idle_leave();
 		local_irq_enable();
 		s390_handle_mcck();
 		return;
@@ -138,9 +103,9 @@ static void default_idle(void)
 	trace_hardirqs_on();
 	/* Don't trace preempt off for idle. */
 	stop_critical_timings();
-	/* Wait for external, I/O or machine check interrupt. */
-	__load_psw_mask(psw_kernel_bits | PSW_MASK_WAIT |
-			PSW_MASK_IO | PSW_MASK_EXT);
+	/* Stop virtual timer and halt the cpu. */
+	vtime_stop_cpu();
+	/* Reenable preemption tracer. */
 	start_critical_timings();
 }
 
diff --git a/arch/s390/kernel/s390_ext.c b/arch/s390/kernel/s390_ext.c
index e019b419efc6..a0d2d55d7fb3 100644
--- a/arch/s390/kernel/s390_ext.c
+++ b/arch/s390/kernel/s390_ext.c
@@ -119,8 +119,8 @@ void do_extint(struct pt_regs *regs, unsigned short code)
 	struct pt_regs *old_regs;
 
 	old_regs = set_irq_regs(regs);
-	irq_enter();
 	s390_idle_check();
+	irq_enter();
 	if (S390_lowcore.int_clock >= S390_lowcore.clock_comparator)
 		/* Serve timer interrupts first. */
 		clock_comparator_work();
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index b7a1efd5522c..d825f4950e4e 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -427,6 +427,8 @@ setup_lowcore(void)
 		/* enable extended save area */
 		__ctl_set_bit(14, 29);
 	}
+#else
+	lc->vdso_per_cpu_data = (unsigned long) &lc->paste[0];
 #endif
 	set_prefix((u32)(unsigned long) lc);
 }
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 3ed5c7a83c6c..9c0ccb532a45 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -47,6 +47,7 @@
 #include <asm/lowcore.h>
 #include <asm/sclp.h>
 #include <asm/cpu.h>
+#include <asm/vdso.h>
 #include "entry.h"
 
 /*
@@ -500,6 +501,9 @@ static int __cpuinit smp_alloc_lowcore(int cpu)
 			goto out;
 		lowcore->extended_save_area_addr = (u32) save_area;
 	}
+#else
+	if (vdso_alloc_per_cpu(cpu, lowcore))
+		goto out;
 #endif
 	lowcore_ptr[cpu] = lowcore;
 	return 0;
@@ -522,6 +526,8 @@ static void smp_free_lowcore(int cpu)
 #ifndef CONFIG_64BIT
 	if (MACHINE_HAS_IEEE)
 		free_page((unsigned long) lowcore->extended_save_area_addr);
+#else
+	vdso_free_per_cpu(cpu, lowcore);
 #endif
 	free_page(lowcore->panic_stack - PAGE_SIZE);
 	free_pages(lowcore->async_stack - ASYNC_SIZE, ASYNC_ORDER);
@@ -664,6 +670,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	lowcore = (void *) __get_free_pages(GFP_KERNEL | GFP_DMA, lc_order);
 	panic_stack = __get_free_page(GFP_KERNEL);
 	async_stack = __get_free_pages(GFP_KERNEL, ASYNC_ORDER);
+	BUG_ON(!lowcore || !panic_stack || !async_stack);
 #ifndef CONFIG_64BIT
 	if (MACHINE_HAS_IEEE)
 		save_area = get_zeroed_page(GFP_KERNEL);
@@ -677,6 +684,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 #ifndef CONFIG_64BIT
 	if (MACHINE_HAS_IEEE)
 		lowcore->extended_save_area_addr = (u32) save_area;
+#else
+	BUG_ON(vdso_alloc_per_cpu(smp_processor_id(), lowcore));
 #endif
 	set_prefix((u32)(unsigned long) lowcore);
 	local_mcck_enable();
@@ -845,9 +854,11 @@ static ssize_t show_idle_count(struct sys_device *dev,
 	unsigned long long idle_count;
 
 	idle = &per_cpu(s390_idle, dev->id);
-	spin_lock_irq(&idle->lock);
+	spin_lock(&idle->lock);
 	idle_count = idle->idle_count;
-	spin_unlock_irq(&idle->lock);
+	if (idle->idle_enter)
+		idle_count++;
+	spin_unlock(&idle->lock);
 	return sprintf(buf, "%llu\n", idle_count);
 }
 static SYSDEV_ATTR(idle_count, 0444, show_idle_count, NULL);
@@ -856,18 +867,17 @@ static ssize_t show_idle_time(struct sys_device *dev,
 				struct sysdev_attribute *attr, char *buf)
 {
 	struct s390_idle_data *idle;
-	unsigned long long new_time;
+	unsigned long long now, idle_time, idle_enter;
 
 	idle = &per_cpu(s390_idle, dev->id);
-	spin_lock_irq(&idle->lock);
-	if (idle->in_idle) {
-		new_time = get_clock();
-		idle->idle_time += new_time - idle->idle_enter;
-		idle->idle_enter = new_time;
-	}
-	new_time = idle->idle_time;
-	spin_unlock_irq(&idle->lock);
-	return sprintf(buf, "%llu\n", new_time >> 12);
+	spin_lock(&idle->lock);
+	now = get_clock();
+	idle_time = idle->idle_time;
+	idle_enter = idle->idle_enter;
+	if (idle_enter != 0ULL && idle_enter < now)
+		idle_time += now - idle_enter;
+	spin_unlock(&idle->lock);
+	return sprintf(buf, "%llu\n", idle_time >> 12);
 }
 static SYSDEV_ATTR(idle_time_us, 0444, show_idle_time, NULL);
 
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 90e9ba11eba1..cc362c9ea8f1 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -97,6 +97,11 @@ cpumask_t cpu_coregroup_map(unsigned int cpu)
 	return mask;
 }
 
+const struct cpumask *cpu_coregroup_mask(unsigned int cpu)
+{
+	return &cpu_core_map[cpu];
+}
+
 static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core)
 {
 	unsigned int cpu;
diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c
index 10a6ccef4412..25a6a82f1c02 100644
--- a/arch/s390/kernel/vdso.c
+++ b/arch/s390/kernel/vdso.c
@@ -31,9 +31,6 @@
 #include <asm/sections.h>
 #include <asm/vdso.h>
 
-/* Max supported size for symbol names */
-#define MAX_SYMNAME	64
-
 #if defined(CONFIG_32BIT) || defined(CONFIG_COMPAT)
 extern char vdso32_start, vdso32_end;
 static void *vdso32_kbase = &vdso32_start;
@@ -71,6 +68,119 @@ static union {
 struct vdso_data *vdso_data = &vdso_data_store.data;
 
 /*
+ * Setup vdso data page.
+ */
+static void vdso_init_data(struct vdso_data *vd)
+{
+	unsigned int facility_list;
+
+	facility_list = stfl();
+	vd->ectg_available = switch_amode && (facility_list & 1);
+}
+
+#ifdef CONFIG_64BIT
+/*
+ * Setup per cpu vdso data page.
+ */
+static void vdso_init_per_cpu_data(int cpu, struct vdso_per_cpu_data *vpcd)
+{
+}
+
+/*
+ * Allocate/free per cpu vdso data.
+ */
+#ifdef CONFIG_64BIT
+#define SEGMENT_ORDER	2
+#else
+#define SEGMENT_ORDER	1
+#endif
+
+int vdso_alloc_per_cpu(int cpu, struct _lowcore *lowcore)
+{
+	unsigned long segment_table, page_table, page_frame;
+	u32 *psal, *aste;
+	int i;
+
+	lowcore->vdso_per_cpu_data = __LC_PASTE;
+
+	if (!switch_amode || !vdso_enabled)
+		return 0;
+
+	segment_table = __get_free_pages(GFP_KERNEL, SEGMENT_ORDER);
+	page_table = get_zeroed_page(GFP_KERNEL | GFP_DMA);
+	page_frame = get_zeroed_page(GFP_KERNEL);
+	if (!segment_table || !page_table || !page_frame)
+		goto out;
+
+	clear_table((unsigned long *) segment_table, _SEGMENT_ENTRY_EMPTY,
+		    PAGE_SIZE << SEGMENT_ORDER);
+	clear_table((unsigned long *) page_table, _PAGE_TYPE_EMPTY,
+		    256*sizeof(unsigned long));
+
+	*(unsigned long *) segment_table = _SEGMENT_ENTRY + page_table;
+	*(unsigned long *) page_table = _PAGE_RO + page_frame;
+
+	psal = (u32 *) (page_table + 256*sizeof(unsigned long));
+	aste = psal + 32;
+
+	for (i = 4; i < 32; i += 4)
+		psal[i] = 0x80000000;
+
+	lowcore->paste[4] = (u32)(addr_t) psal;
+	psal[0] = 0x20000000;
+	psal[2] = (u32)(addr_t) aste;
+	*(unsigned long *) (aste + 2) = segment_table +
+		_ASCE_TABLE_LENGTH + _ASCE_USER_BITS + _ASCE_TYPE_SEGMENT;
+	aste[4] = (u32)(addr_t) psal;
+	lowcore->vdso_per_cpu_data = page_frame;
+
+	vdso_init_per_cpu_data(cpu, (struct vdso_per_cpu_data *) page_frame);
+	return 0;
+
+out:
+	free_page(page_frame);
+	free_page(page_table);
+	free_pages(segment_table, SEGMENT_ORDER);
+	return -ENOMEM;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+void vdso_free_per_cpu(int cpu, struct _lowcore *lowcore)
+{
+	unsigned long segment_table, page_table, page_frame;
+	u32 *psal, *aste;
+
+	if (!switch_amode || !vdso_enabled)
+		return;
+
+	psal = (u32 *)(addr_t) lowcore->paste[4];
+	aste = (u32 *)(addr_t) psal[2];
+	segment_table = *(unsigned long *)(aste + 2) & PAGE_MASK;
+	page_table = *(unsigned long *) segment_table;
+	page_frame = *(unsigned long *) page_table;
+
+	free_page(page_frame);
+	free_page(page_table);
+	free_pages(segment_table, SEGMENT_ORDER);
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static void __vdso_init_cr5(void *dummy)
+{
+	unsigned long cr5;
+
+	cr5 = offsetof(struct _lowcore, paste);
+	__ctl_load(cr5, 5, 5);
+}
+
+static void vdso_init_cr5(void)
+{
+	if (switch_amode && vdso_enabled)
+		on_each_cpu(__vdso_init_cr5, NULL, 1);
+}
+#endif /* CONFIG_64BIT */
+
+/*
  * This is called from binfmt_elf, we create the special vma for the
  * vDSO and insert it into the mm struct tree
  */
@@ -172,6 +282,9 @@ static int __init vdso_init(void)
 {
 	int i;
 
+	if (!vdso_enabled)
+		return 0;
+	vdso_init_data(vdso_data);
 #if defined(CONFIG_32BIT) || defined(CONFIG_COMPAT)
 	/* Calculate the size of the 32 bit vDSO */
 	vdso32_pages = ((&vdso32_end - &vdso32_start
@@ -208,6 +321,10 @@ static int __init vdso_init(void)
 	}
 	vdso64_pagelist[vdso64_pages - 1] = virt_to_page(vdso_data);
 	vdso64_pagelist[vdso64_pages] = NULL;
+#ifndef CONFIG_SMP
+	BUG_ON(vdso_alloc_per_cpu(0, &S390_lowcore));	/* takes a pointer */
+#endif /* !CONFIG_SMP */
+	vdso_init_cr5();
 #endif /* CONFIG_64BIT */
 
 	get_page(virt_to_page(vdso_data));
diff --git a/arch/s390/kernel/vdso64/clock_getres.S b/arch/s390/kernel/vdso64/clock_getres.S
index 488e31a3c0e7..9ce8caafdb4e 100644
--- a/arch/s390/kernel/vdso64/clock_getres.S
+++ b/arch/s390/kernel/vdso64/clock_getres.S
@@ -22,7 +22,12 @@ __kernel_clock_getres:
 	cghi	%r2,CLOCK_REALTIME
 	je	0f
 	cghi	%r2,CLOCK_MONOTONIC
+	je	0f
+	cghi	%r2,-2		/* CLOCK_THREAD_CPUTIME_ID for this thread */
 	jne	2f
+	larl	%r5,_vdso_data
+	icm	%r0,15,__VDSO_ECTG_OK(%r5)	/* vdso_data->ectg_available */
+	jz	2f
 0:	ltgr	%r3,%r3
 	jz	1f				/* res == NULL */
 	larl	%r1,3f
diff --git a/arch/s390/kernel/vdso64/clock_gettime.S b/arch/s390/kernel/vdso64/clock_gettime.S
index 738a410b7eb2..79dbfee831ec 100644
--- a/arch/s390/kernel/vdso64/clock_gettime.S
+++ b/arch/s390/kernel/vdso64/clock_gettime.S
@@ -22,8 +22,10 @@ __kernel_clock_gettime:
 	larl	%r5,_vdso_data
 	cghi	%r2,CLOCK_REALTIME
 	je	4f
+	cghi	%r2,-2		/* CLOCK_THREAD_CPUTIME_ID for this thread */
+	je	9f
 	cghi	%r2,CLOCK_MONOTONIC
-	jne	9f
+	jne	12f
 
 	/* CLOCK_MONOTONIC */
 	ltgr	%r3,%r3
@@ -42,7 +44,7 @@ __kernel_clock_gettime:
 	alg	%r0,__VDSO_WTOM_SEC(%r5)
 	clg	%r4,__VDSO_UPD_COUNT(%r5)	/* check update counter */
 	jne	0b
-	larl	%r5,10f
+	larl	%r5,13f
 1:	clg	%r1,0(%r5)
 	jl	2f
 	slg	%r1,0(%r5)
@@ -68,7 +70,7 @@ __kernel_clock_gettime:
 	lg	%r0,__VDSO_XTIME_SEC(%r5)
 	clg	%r4,__VDSO_UPD_COUNT(%r5)	/* check update counter */
 	jne	5b
-	larl	%r5,10f
+	larl	%r5,13f
 6:	clg	%r1,0(%r5)
 	jl	7f
 	slg	%r1,0(%r5)
@@ -79,11 +81,38 @@ __kernel_clock_gettime:
 8:	lghi	%r2,0
 	br	%r14
 
+	/* CLOCK_THREAD_CPUTIME_ID for this thread */
+9:	icm	%r0,15,__VDSO_ECTG_OK(%r5)
+	jz	12f
+	ear	%r2,%a4
+	llilh	%r4,0x0100
+	sar	%a4,%r4
+	lghi	%r4,0
+	sacf	512				/* Magic ectg instruction */
+	.insn	ssf,0xc80100000000,__VDSO_ECTG_BASE(4),__VDSO_ECTG_USER(4),4
+	sacf	0
+	sar	%a4,%r2
+	algr	%r1,%r0				/* r1 = cputime as TOD value */
+	mghi	%r1,1000			/* convert to nanoseconds */
+	srlg	%r1,%r1,12			/* r1 = cputime in nanosec */
+	lgr	%r4,%r1
+	larl	%r5,13f
+	srlg	%r1,%r1,9			/* divide by 1000000000 */
+	mlg	%r0,8(%r5)
+	srlg	%r0,%r0,11			/* r0 = tv_sec */
+	stg	%r0,0(%r3)
+	msg	%r0,0(%r5)			/* calculate tv_nsec */
+	slgr	%r4,%r0				/* r4 = tv_nsec */
+	stg	%r4,8(%r3)
+	lghi	%r2,0
+	br	%r14
+
 	/* Fallback to system call */
-9:	lghi	%r1,__NR_clock_gettime
+12:	lghi	%r1,__NR_clock_gettime
 	svc	0
 	br	%r14
 
-10:	.quad	1000000000
+13:	.quad	1000000000
+14:	.quad	19342813113834067
 	.cfi_endproc
 	.size	__kernel_clock_gettime,.-__kernel_clock_gettime
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 75a6e62ea973..2fb36e462194 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -23,19 +23,43 @@
 #include <asm/s390_ext.h>
 #include <asm/timer.h>
 #include <asm/irq_regs.h>
+#include <asm/cpu.h>
 
 static ext_int_info_t ext_int_info_timer;
+
 static DEFINE_PER_CPU(struct vtimer_queue, virt_cpu_timer);
 
+DEFINE_PER_CPU(struct s390_idle_data, s390_idle) = {
+	.lock = __SPIN_LOCK_UNLOCKED(s390_idle.lock)
+};
+
+static inline __u64 get_vtimer(void)
+{
+	__u64 timer;
+
+	asm volatile("STPT %0" : "=m" (timer));
+	return timer;
+}
+
+static inline void set_vtimer(__u64 expires)
+{
+	__u64 timer;
+
+	asm volatile ("  STPT %0\n"  /* Store current cpu timer value */
+		      "  SPT %1"     /* Set new value immediately afterwards */
+		      : "=m" (timer) : "m" (expires) );
+	S390_lowcore.system_timer += S390_lowcore.last_update_timer - timer;
+	S390_lowcore.last_update_timer = expires;
+}
+
 /*
  * Update process times based on virtual cpu times stored by entry.S
  * to the lowcore fields user_timer, system_timer & steal_clock.
  */
-void account_process_tick(struct task_struct *tsk, int user_tick)
+static void do_account_vtime(struct task_struct *tsk, int hardirq_offset)
 {
-	cputime_t cputime;
-	__u64 timer, clock;
-	int rcu_user_flag;
+	struct thread_info *ti = task_thread_info(tsk);
+	__u64 timer, clock, user, system, steal;
 
 	timer = S390_lowcore.last_update_timer;
 	clock = S390_lowcore.last_update_clock;
@@ -44,50 +68,41 @@ void account_process_tick(struct task_struct *tsk, int user_tick)
 		      : "=m" (S390_lowcore.last_update_timer),
 		        "=m" (S390_lowcore.last_update_clock) );
 	S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer;
-	S390_lowcore.steal_clock += S390_lowcore.last_update_clock - clock;
-
-	cputime = S390_lowcore.user_timer >> 12;
-	rcu_user_flag = cputime != 0;
-	S390_lowcore.user_timer -= cputime << 12;
-	S390_lowcore.steal_clock -= cputime << 12;
-	account_user_time(tsk, cputime);
-
-	cputime =  S390_lowcore.system_timer >> 12;
-	S390_lowcore.system_timer -= cputime << 12;
-	S390_lowcore.steal_clock -= cputime << 12;
-	account_system_time(tsk, HARDIRQ_OFFSET, cputime);
-
-	cputime = S390_lowcore.steal_clock;
-	if ((__s64) cputime > 0) {
-		cputime >>= 12;
-		S390_lowcore.steal_clock -= cputime << 12;
-		account_steal_time(tsk, cputime);
+	S390_lowcore.steal_timer += S390_lowcore.last_update_clock - clock;
+
+	user = S390_lowcore.user_timer - ti->user_timer;
+	S390_lowcore.steal_timer -= user;
+	ti->user_timer = S390_lowcore.user_timer;
+	account_user_time(tsk, user, user);
+
+	system = S390_lowcore.system_timer - ti->system_timer;
+	S390_lowcore.steal_timer -= system;
+	ti->system_timer = S390_lowcore.system_timer;
+	account_system_time(tsk, hardirq_offset, system, system);
+
+	steal = S390_lowcore.steal_timer;
+	if ((s64) steal > 0) {
+		S390_lowcore.steal_timer = 0;
+		account_steal_time(steal);
 	}
 }
 
-/*
- * Update process times based on virtual cpu times stored by entry.S
- * to the lowcore fields user_timer, system_timer & steal_clock.
- */
-void account_vtime(struct task_struct *tsk)
+void account_vtime(struct task_struct *prev, struct task_struct *next)
 {
-	cputime_t cputime;
-	__u64 timer;
-
-	timer = S390_lowcore.last_update_timer;
-	asm volatile ("  STPT %0"    /* Store current cpu timer value */
-		      : "=m" (S390_lowcore.last_update_timer) );
-	S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer;
-
-	cputime = S390_lowcore.user_timer >> 12;
-	S390_lowcore.user_timer -= cputime << 12;
-	S390_lowcore.steal_clock -= cputime << 12;
-	account_user_time(tsk, cputime);
+	struct thread_info *ti;
+
+	do_account_vtime(prev, 0);
+	ti = task_thread_info(prev);
+	ti->user_timer = S390_lowcore.user_timer;
+	ti->system_timer = S390_lowcore.system_timer;
+	ti = task_thread_info(next);
+	S390_lowcore.user_timer = ti->user_timer;
+	S390_lowcore.system_timer = ti->system_timer;
+}
 
-	cputime =  S390_lowcore.system_timer >> 12;
-	S390_lowcore.system_timer -= cputime << 12;
-	S390_lowcore.steal_clock -= cputime << 12;
-	account_system_time(tsk, 0, cputime);
+void account_process_tick(struct task_struct *tsk, int user_tick)
+{
+	do_account_vtime(tsk, HARDIRQ_OFFSET);
 }
 
 /*
@@ -96,80 +111,131 @@ void account_vtime(struct task_struct *tsk)
  */
 void account_system_vtime(struct task_struct *tsk)
 {
-	cputime_t cputime;
-	__u64 timer;
+	struct thread_info *ti = task_thread_info(tsk);
+	__u64 timer, system;
 
 	timer = S390_lowcore.last_update_timer;
-	asm volatile ("  STPT %0"    /* Store current cpu timer value */
-		      : "=m" (S390_lowcore.last_update_timer) );
+	S390_lowcore.last_update_timer = get_vtimer();
 	S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer;
 
-	cputime =  S390_lowcore.system_timer >> 12;
-	S390_lowcore.system_timer -= cputime << 12;
-	S390_lowcore.steal_clock -= cputime << 12;
-	account_system_time(tsk, 0, cputime);
+	system = S390_lowcore.system_timer - ti->system_timer;
+	S390_lowcore.steal_timer -= system;
+	ti->system_timer = S390_lowcore.system_timer;
+	account_system_time(tsk, 0, system, system);
 }
 EXPORT_SYMBOL_GPL(account_system_vtime);
 
-static inline void set_vtimer(__u64 expires)
-{
-	__u64 timer;
-
-	asm volatile ("  STPT %0\n"  /* Store current cpu timer value */
-		      "  SPT %1"     /* Set new value immediatly afterwards */
-		      : "=m" (timer) : "m" (expires) );
-	S390_lowcore.system_timer += S390_lowcore.last_update_timer - timer;
-	S390_lowcore.last_update_timer = expires;
-
-	/* store expire time for this CPU timer */
-	__get_cpu_var(virt_cpu_timer).to_expire = expires;
-}
-
-void vtime_start_cpu_timer(void)
+void vtime_start_cpu(void)
 {
-	struct vtimer_queue *vt_list;
-
-	vt_list = &__get_cpu_var(virt_cpu_timer);
-
-	/* CPU timer interrupt is pending, don't reprogramm it */
-	if (vt_list->idle & 1LL<<63)
-		return;
+	struct s390_idle_data *idle = &__get_cpu_var(s390_idle);
+	struct vtimer_queue *vq = &__get_cpu_var(virt_cpu_timer);
+	__u64 idle_time, expires;
+
+	/* Account time spent with enabled wait psw loaded as idle time. */
+	idle_time = S390_lowcore.int_clock - idle->idle_enter;
+	account_idle_time(idle_time);
+	S390_lowcore.last_update_clock = S390_lowcore.int_clock;
+
+	/* Account system time spent going idle. */
+	S390_lowcore.system_timer += S390_lowcore.last_update_timer - vq->idle;
+	S390_lowcore.last_update_timer = S390_lowcore.async_enter_timer;
+
+	/* Restart vtime CPU timer */
+	if (vq->do_spt) {
+		/* Program old expire value but first save progress. */
+		expires = vq->idle - S390_lowcore.async_enter_timer;
+		expires += get_vtimer();
+		set_vtimer(expires);
+	} else {
+		/* Don't account the CPU timer delta while the cpu was idle. */
+		vq->elapsed -= vq->idle - S390_lowcore.async_enter_timer;
+	}
 
-	if (!list_empty(&vt_list->list))
-		set_vtimer(vt_list->idle);
+	spin_lock(&idle->lock);
+	idle->idle_time += idle_time;
+	idle->idle_enter = 0ULL;
+	idle->idle_count++;
+	spin_unlock(&idle->lock);
 }
 
-void vtime_stop_cpu_timer(void)
+void vtime_stop_cpu(void)
 {
-	struct vtimer_queue *vt_list;
-
-	vt_list = &__get_cpu_var(virt_cpu_timer);
-
-	/* nothing to do */
-	if (list_empty(&vt_list->list)) {
-		vt_list->idle = VTIMER_MAX_SLICE;
-		goto fire;
+	struct s390_idle_data *idle = &__get_cpu_var(s390_idle);
+	struct vtimer_queue *vq = &__get_cpu_var(virt_cpu_timer);
+	psw_t psw;
+
+	/* Wait for external, I/O or machine check interrupt. */
+	psw.mask = psw_kernel_bits | PSW_MASK_WAIT | PSW_MASK_IO | PSW_MASK_EXT;
+
+	/* Check if the CPU timer needs to be reprogrammed. */
+	if (vq->do_spt) {
+		__u64 vmax = VTIMER_MAX_SLICE;
+		/*
+		 * The inline assembly is equivalent to
+		 *	vq->idle = get_cpu_timer();
+		 *	set_cpu_timer(VTIMER_MAX_SLICE);
+		 *	idle->idle_enter = get_clock();
+		 *	__load_psw_mask(psw_kernel_bits | PSW_MASK_WAIT |
+		 *			   PSW_MASK_IO | PSW_MASK_EXT);
+		 * The difference is that the inline assembly makes sure that
+		 * the last four instructions are stpt, spt, stck and lpsw in
+		 * that order. This is done to increase the precision.
+		 */
+		asm volatile(
+#ifndef CONFIG_64BIT
+			"	basr	1,0\n"
+			"0:	ahi	1,1f-0b\n"
+			"	st	1,4(%2)\n"
+#else /* CONFIG_64BIT */
+			"	larl	1,1f\n"
+			"	stg	1,8(%2)\n"
+#endif /* CONFIG_64BIT */
+			"	stpt	0(%4)\n"
+			"	spt	0(%5)\n"
+			"	stck	0(%3)\n"
+#ifndef CONFIG_64BIT
+			"	lpsw	0(%2)\n"
+#else /* CONFIG_64BIT */
+			"	lpswe	0(%2)\n"
+#endif /* CONFIG_64BIT */
+			"1:"
+			: "=m" (idle->idle_enter), "=m" (vq->idle)
+			: "a" (&psw), "a" (&idle->idle_enter),
+			  "a" (&vq->idle), "a" (&vmax), "m" (vmax), "m" (psw)
+			: "memory", "cc", "1");
+	} else {
+		/*
+		 * The inline assembly is equivalent to
+		 *	vq->idle = get_cpu_timer();
+		 *	idle->idle_enter = get_clock();
+		 *	__load_psw_mask(psw_kernel_bits | PSW_MASK_WAIT |
+		 *			   PSW_MASK_IO | PSW_MASK_EXT);
+		 * The difference is that the inline assembly makes sure that
+		 * the last three instructions are stpt, stck and lpsw in that
+		 * order. This is done to increase the precision.
+		 */
+		asm volatile(
+#ifndef CONFIG_64BIT
+			"	basr	1,0\n"
+			"0:	ahi	1,1f-0b\n"
+			"	st	1,4(%2)\n"
+#else /* CONFIG_64BIT */
+			"	larl	1,1f\n"
+			"	stg	1,8(%2)\n"
+#endif /* CONFIG_64BIT */
+			"	stpt	0(%4)\n"
+			"	stck	0(%3)\n"
+#ifndef CONFIG_64BIT
+			"	lpsw	0(%2)\n"
+#else /* CONFIG_64BIT */
+			"	lpswe	0(%2)\n"
+#endif /* CONFIG_64BIT */
+			"1:"
+			: "=m" (idle->idle_enter), "=m" (vq->idle)
+			: "a" (&psw), "a" (&idle->idle_enter),
+			  "a" (&vq->idle), "m" (psw)
+			: "memory", "cc", "1");
 	}
-
-	/* store the actual expire value */
-	asm volatile ("STPT %0" : "=m" (vt_list->idle));
-
-	/*
-	 * If the CPU timer is negative we don't reprogramm
-	 * it because we will get instantly an interrupt.
-	 */
-	if (vt_list->idle & 1LL<<63)
-		return;
-
-	vt_list->offset += vt_list->to_expire - vt_list->idle;
-
-	/*
-	 * We cannot halt the CPU timer, we just write a value that
-	 * nearly never expires (only after 71 years) and re-write
-	 * the stored expire value if we continue the timer
-	 */
- fire:
-	set_vtimer(VTIMER_MAX_SLICE);
 }
 
 /*
@@ -195,30 +261,23 @@ static void list_add_sorted(struct vtimer_list *timer, struct list_head *head)
  */
 static void do_callbacks(struct list_head *cb_list)
 {
-	struct vtimer_queue *vt_list;
+	struct vtimer_queue *vq;
 	struct vtimer_list *event, *tmp;
-	void (*fn)(unsigned long);
-	unsigned long data;
 
 	if (list_empty(cb_list))
 		return;
 
-	vt_list = &__get_cpu_var(virt_cpu_timer);
+	vq = &__get_cpu_var(virt_cpu_timer);
 
 	list_for_each_entry_safe(event, tmp, cb_list, entry) {
-		fn = event->function;
-		data = event->data;
-		fn(data);
-
-		if (!event->interval)
-			/* delete one shot timer */
-			list_del_init(&event->entry);
-		else {
-			/* move interval timer back to list */
-			spin_lock(&vt_list->lock);
-			list_del_init(&event->entry);
-			list_add_sorted(event, &vt_list->list);
-			spin_unlock(&vt_list->lock);
+		list_del_init(&event->entry);
+		(event->function)(event->data);
+		if (event->interval) {
+			/* Recharge interval timer */
+			event->expires = event->interval + vq->elapsed;
+			spin_lock(&vq->lock);
+			list_add_sorted(event, &vq->list);
+			spin_unlock(&vq->lock);
 		}
 	}
 }
@@ -228,64 +287,57 @@ static void do_callbacks(struct list_head *cb_list)
  */
 static void do_cpu_timer_interrupt(__u16 error_code)
 {
-	__u64 next, delta;
-	struct vtimer_queue *vt_list;
+	struct vtimer_queue *vq;
 	struct vtimer_list *event, *tmp;
-	struct list_head *ptr;
-	/* the callback queue */
-	struct list_head cb_list;
+	struct list_head cb_list;	/* the callback queue */
+	__u64 elapsed, next;
 
 	INIT_LIST_HEAD(&cb_list);
-	vt_list = &__get_cpu_var(virt_cpu_timer);
+	vq = &__get_cpu_var(virt_cpu_timer);
 
 	/* walk timer list, fire all expired events */
-	spin_lock(&vt_list->lock);
-
-	if (vt_list->to_expire < VTIMER_MAX_SLICE)
-		vt_list->offset += vt_list->to_expire;
-
-	list_for_each_entry_safe(event, tmp, &vt_list->list, entry) {
-		if (event->expires > vt_list->offset)
-			/* found first unexpired event, leave */
-			break;
-
-		/* re-charge interval timer, we have to add the offset */
-		if (event->interval)
-			event->expires = event->interval + vt_list->offset;
-
-		/* move expired timer to the callback queue */
-		list_move_tail(&event->entry, &cb_list);
+	spin_lock(&vq->lock);
+
+	elapsed = vq->elapsed + (vq->timer - S390_lowcore.async_enter_timer);
+	BUG_ON((s64) elapsed < 0);
+	vq->elapsed = 0;
+	list_for_each_entry_safe(event, tmp, &vq->list, entry) {
+		if (event->expires < elapsed)
+			/* move expired timer to the callback queue */
+			list_move_tail(&event->entry, &cb_list);
+		else
+			event->expires -= elapsed;
 	}
-	spin_unlock(&vt_list->lock);
+	spin_unlock(&vq->lock);
+
+	vq->do_spt = list_empty(&cb_list);
 	do_callbacks(&cb_list);
 
 	/* next event is first in list */
-	spin_lock(&vt_list->lock);
-	if (!list_empty(&vt_list->list)) {
-		ptr = vt_list->list.next;
-		event = list_entry(ptr, struct vtimer_list, entry);
-		next = event->expires - vt_list->offset;
-
-		/* add the expired time from this interrupt handler
-		 * and the callback functions
-		 */
-		asm volatile ("STPT %0" : "=m" (delta));
-		delta = 0xffffffffffffffffLL - delta + 1;
-		vt_list->offset += delta;
-		next -= delta;
-	} else {
-		vt_list->offset = 0;
-		next = VTIMER_MAX_SLICE;
-	}
-	spin_unlock(&vt_list->lock);
-	set_vtimer(next);
+	next = VTIMER_MAX_SLICE;
+	spin_lock(&vq->lock);
+	if (!list_empty(&vq->list)) {
+		event = list_first_entry(&vq->list, struct vtimer_list, entry);
+		next = event->expires;
+	} else
+		vq->do_spt = 0;
+	spin_unlock(&vq->lock);
+	/*
+	 * To improve precision add the time spent by the
+	 * interrupt handler to the elapsed time.
+	 * Note: the CPU timer counts down and we got an interrupt,
+	 *	 so its current content is negative.
+	 */
+	elapsed = S390_lowcore.async_enter_timer - get_vtimer();
+	set_vtimer(next - elapsed);
+	vq->timer = next - elapsed;
+	vq->elapsed = elapsed;
 }
 
 void init_virt_timer(struct vtimer_list *timer)
 {
 	timer->function = NULL;
 	INIT_LIST_HEAD(&timer->entry);
-	spin_lock_init(&timer->lock);
 }
 EXPORT_SYMBOL(init_virt_timer);
 
@@ -299,44 +351,40 @@ static inline int vtimer_pending(struct vtimer_list *timer)
  */
 static void internal_add_vtimer(struct vtimer_list *timer)
 {
+	struct vtimer_queue *vq;
 	unsigned long flags;
-	__u64 done;
-	struct vtimer_list *event;
-	struct vtimer_queue *vt_list;
+	__u64 left, expires;
 
-	vt_list = &per_cpu(virt_cpu_timer, timer->cpu);
-	spin_lock_irqsave(&vt_list->lock, flags);
+	vq = &per_cpu(virt_cpu_timer, timer->cpu);
+	spin_lock_irqsave(&vq->lock, flags);
 
 	BUG_ON(timer->cpu != smp_processor_id());
 
-	/* if list is empty we only have to set the timer */
-	if (list_empty(&vt_list->list)) {
-		/* reset the offset, this may happen if the last timer was
-		 * just deleted by mod_virt_timer and the interrupt
-		 * didn't happen until here
-		 */
-		vt_list->offset = 0;
-		goto fire;
+	if (list_empty(&vq->list)) {
+		/* First timer on this cpu, just program it. */
+		list_add(&timer->entry, &vq->list);
+		set_vtimer(timer->expires);
+		vq->timer = timer->expires;
+		vq->elapsed = 0;
+	} else {
+		/* Check progress of old timers. */
+		expires = timer->expires;
+		left = get_vtimer();
+		if (likely((s64) expires < (s64) left)) {
+			/* The new timer expires before the current timer. */
+			set_vtimer(expires);
+			vq->elapsed += vq->timer - left;
+			vq->timer = expires;
+		} else {
+			vq->elapsed += vq->timer - left;
+			vq->timer = left;
+		}
+		/* Insert new timer into per cpu list. */
+		timer->expires += vq->elapsed;
+		list_add_sorted(timer, &vq->list);
 	}
 
-	/* save progress */
-	asm volatile ("STPT %0" : "=m" (done));
-
-	/* calculate completed work */
-	done = vt_list->to_expire - done + vt_list->offset;
-	vt_list->offset = 0;
-
-	list_for_each_entry(event, &vt_list->list, entry)
-		event->expires -= done;
-
- fire:
-	list_add_sorted(timer, &vt_list->list);
-
-	/* get first element, which is the next vtimer slice */
-	event = list_entry(vt_list->list.next, struct vtimer_list, entry);
-
-	set_vtimer(event->expires);
-	spin_unlock_irqrestore(&vt_list->lock, flags);
+	spin_unlock_irqrestore(&vq->lock, flags);
 	/* release CPU acquired in prepare_vtimer or mod_virt_timer() */
 	put_cpu();
 }
@@ -381,14 +429,15 @@ EXPORT_SYMBOL(add_virt_timer_periodic);
  * If we change a pending timer the function must be called on the CPU
  * where the timer is running on, e.g. by smp_call_function_single()
  *
- * The original mod_timer adds the timer if it is not pending. For compatibility
- * we do the same. The timer will be added on the current CPU as a oneshot timer.
+ * The original mod_timer adds the timer if it is not pending. For
+ * compatibility we do the same. The timer will be added on the current
+ * CPU as a oneshot timer.
  *
  * returns whether it has modified a pending timer (1) or not (0)
  */
 int mod_virt_timer(struct vtimer_list *timer, __u64 expires)
 {
-	struct vtimer_queue *vt_list;
+	struct vtimer_queue *vq;
 	unsigned long flags;
 	int cpu;
 
@@ -404,17 +453,17 @@ int mod_virt_timer(struct vtimer_list *timer, __u64 expires)
 		return 1;
 
 	cpu = get_cpu();
-	vt_list = &per_cpu(virt_cpu_timer, cpu);
+	vq = &per_cpu(virt_cpu_timer, cpu);
 
 	/* check if we run on the right CPU */
 	BUG_ON(timer->cpu != cpu);
 
 	/* disable interrupts before test if timer is pending */
-	spin_lock_irqsave(&vt_list->lock, flags);
+	spin_lock_irqsave(&vq->lock, flags);
 
 	/* if timer isn't pending add it on the current CPU */
 	if (!vtimer_pending(timer)) {
-		spin_unlock_irqrestore(&vt_list->lock, flags);
+		spin_unlock_irqrestore(&vq->lock, flags);
 		/* we do not activate an interval timer with mod_virt_timer */
 		timer->interval = 0;
 		timer->expires = expires;
@@ -431,7 +480,7 @@ int mod_virt_timer(struct vtimer_list *timer, __u64 expires)
 		timer->interval = expires;
 
 	/* the timer can't expire anymore so we can release the lock */
-	spin_unlock_irqrestore(&vt_list->lock, flags);
+	spin_unlock_irqrestore(&vq->lock, flags);
 	internal_add_vtimer(timer);
 	return 1;
 }
@@ -445,25 +494,19 @@ EXPORT_SYMBOL(mod_virt_timer);
 int del_virt_timer(struct vtimer_list *timer)
 {
 	unsigned long flags;
-	struct vtimer_queue *vt_list;
+	struct vtimer_queue *vq;
 
 	/* check if timer is pending */
 	if (!vtimer_pending(timer))
 		return 0;
 
-	vt_list = &per_cpu(virt_cpu_timer, timer->cpu);
-	spin_lock_irqsave(&vt_list->lock, flags);
+	vq = &per_cpu(virt_cpu_timer, timer->cpu);
+	spin_lock_irqsave(&vq->lock, flags);
 
 	/* we don't interrupt a running timer, just let it expire! */
 	list_del_init(&timer->entry);
 
-	/* last timer removed */
-	if (list_empty(&vt_list->list)) {
-		vt_list->to_expire = 0;
-		vt_list->offset = 0;
-	}
-
-	spin_unlock_irqrestore(&vt_list->lock, flags);
+	spin_unlock_irqrestore(&vq->lock, flags);
 	return 1;
 }
 EXPORT_SYMBOL(del_virt_timer);
@@ -473,24 +516,19 @@ EXPORT_SYMBOL(del_virt_timer);
  */
 void init_cpu_vtimer(void)
 {
-	struct vtimer_queue *vt_list;
+	struct vtimer_queue *vq;
 
 	/* kick the virtual timer */
-	S390_lowcore.exit_timer = VTIMER_MAX_SLICE;
-	S390_lowcore.last_update_timer = VTIMER_MAX_SLICE;
-	asm volatile ("SPT %0" : : "m" (S390_lowcore.last_update_timer));
 	asm volatile ("STCK %0" : "=m" (S390_lowcore.last_update_clock));
+	asm volatile ("STPT %0" : "=m" (S390_lowcore.last_update_timer));
+
+	/* initialize per cpu vtimer structure */
+	vq = &__get_cpu_var(virt_cpu_timer);
+	INIT_LIST_HEAD(&vq->list);
+	spin_lock_init(&vq->lock);
 
 	/* enable cpu timer interrupts */
 	__ctl_set_bit(0,10);
-
-	vt_list = &__get_cpu_var(virt_cpu_timer);
-	INIT_LIST_HEAD(&vt_list->list);
-	spin_lock_init(&vt_list->lock);
-	vt_list->to_expire = 0;
-	vt_list->offset = 0;
-	vt_list->idle = 0;
-
 }
 
 void __init vtime_init(void)
diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h
index 279d9cc4a007..066f0fba590e 100644
--- a/arch/sh/include/asm/topology.h
+++ b/arch/sh/include/asm/topology.h
@@ -32,6 +32,7 @@
 #define parent_node(node)	((void)(node),0)
 
 #define node_to_cpumask(node)	((void)node, cpu_online_map)
+#define cpumask_of_node(node)	((void)node, cpu_online_mask)
 #define node_to_first_cpu(node)	((void)(node),0)
 
 #define pcibus_to_node(bus)	((void)(bus), -1)
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index 001c04027c82..b8a65b64e1df 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -16,8 +16,12 @@ static inline cpumask_t node_to_cpumask(int node)
 {
 	return numa_cpumask_lookup_table[node];
 }
+#define cpumask_of_node(node) (&numa_cpumask_lookup_table[node])
 
-/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
+/*
+ * Returns a pointer to the cpumask of CPUs on Node 'node'.
+ * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
+ */
 #define node_to_cpumask_ptr(v, node)		\
 		cpumask_t *v = &(numa_cpumask_lookup_table[node])
 
@@ -26,9 +30,7 @@ static inline cpumask_t node_to_cpumask(int node)
 
 static inline int node_to_first_cpu(int node)
 {
-	cpumask_t tmp;
-	tmp = node_to_cpumask(node);
-	return first_cpu(tmp);
+	return cpumask_first(cpumask_of_node(node));
 }
 
 struct pci_bus;
@@ -77,10 +79,13 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
 #define topology_core_id(cpu)			(cpu_data(cpu).core_id)
 #define topology_core_siblings(cpu)		(cpu_core_map[cpu])
 #define topology_thread_siblings(cpu)		(per_cpu(cpu_sibling_map, cpu))
+#define topology_core_cpumask(cpu)		(&cpu_core_map[cpu])
+#define topology_thread_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
 #define mc_capable()				(sparc64_multi_core)
 #define smt_capable()				(sparc64_multi_core)
 #endif /* CONFIG_SMP */
 
 #define cpu_coregroup_map(cpu)			(cpu_core_map[cpu])
+#define cpu_coregroup_mask(cpu)			(&cpu_core_map[cpu])
 
 #endif /* _ASM_SPARC64_TOPOLOGY_H */
diff --git a/arch/sparc/kernel/of_device_64.c b/arch/sparc/kernel/of_device_64.c
index 322046cdf85f..4873f28905b0 100644
--- a/arch/sparc/kernel/of_device_64.c
+++ b/arch/sparc/kernel/of_device_64.c
@@ -778,7 +778,7 @@ static unsigned int __init build_one_device_irq(struct of_device *op,
 out:
 	nid = of_node_to_nid(dp);
 	if (nid != -1) {
-		cpumask_t numa_mask = node_to_cpumask(nid);
+		cpumask_t numa_mask = *cpumask_of_node(nid);
 
 		irq_set_affinity(irq, &numa_mask);
 	}
diff --git a/arch/sparc/kernel/pci_msi.c b/arch/sparc/kernel/pci_msi.c
index 0d0cd815e83e..4ef282e81912 100644
--- a/arch/sparc/kernel/pci_msi.c
+++ b/arch/sparc/kernel/pci_msi.c
@@ -286,7 +286,7 @@ static int bringup_one_msi_queue(struct pci_pbm_info *pbm,
 
 	nid = pbm->numa_node;
 	if (nid != -1) {
-		cpumask_t numa_mask = node_to_cpumask(nid);
+		cpumask_t numa_mask = *cpumask_of_node(nid);
 
 		irq_set_affinity(irq, &numa_mask);
 	}
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 249d1e0824b5..862adb9bf0d4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -586,6 +586,16 @@ config AMD_IOMMU
 	  your BIOS for an option to enable it or if you have an IVRS ACPI
 	  table.
 
+config AMD_IOMMU_STATS
+	bool "Export AMD IOMMU statistics to debugfs"
+	depends on AMD_IOMMU
+	select DEBUG_FS
+	help
+	  This option enables code in the AMD IOMMU driver to collect various
+	  statistics about what's happening in the driver and exports that
+	  information to userspace via debugfs.
+	  If unsure, say N.
+
 # need this always selected by IOMMU for the VIA workaround
 config SWIOTLB
 	def_bool y if X86_64
@@ -599,6 +609,9 @@ config SWIOTLB
 config IOMMU_HELPER
 	def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
 
+config IOMMU_API
+	def_bool (AMD_IOMMU || DMAR)
+
 config MAXSMP
 	bool "Configure Maximum number of SMP Processors and NUMA Nodes"
 	depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index b195f85526e3..9dabd00e9805 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -24,15 +24,14 @@
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
-#include <asm/ia32.h>
 #include <asm/ptrace.h>
 #include <asm/ia32_unistd.h>
 #include <asm/user32.h>
 #include <asm/sigcontext32.h>
 #include <asm/proto.h>
 #include <asm/vdso.h>
-
 #include <asm/sigframe.h>
+#include <asm/sys_ia32.h>
 
 #define DEBUG_SIG 0
 
diff --git a/arch/x86/ia32/ipc32.c b/arch/x86/ia32/ipc32.c
index d21991ce606c..29cdcd02ead3 100644
--- a/arch/x86/ia32/ipc32.c
+++ b/arch/x86/ia32/ipc32.c
@@ -8,6 +8,7 @@
 #include <linux/shm.h>
 #include <linux/ipc.h>
 #include <linux/compat.h>
+#include <asm/sys_ia32.h>
 
 asmlinkage long sys32_ipc(u32 call, int first, int second, int third,
 			  compat_uptr_t ptr, u32 fifth)
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 2e09dcd3c0a6..6c0d7f6231af 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -44,8 +44,8 @@
 #include <asm/types.h>
 #include <asm/uaccess.h>
 #include <asm/atomic.h>
-#include <asm/ia32.h>
 #include <asm/vgtod.h>
+#include <asm/sys_ia32.h>
 
 #define AA(__x)		((unsigned long)(__x))
 
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index ac302a2fa339..95c8cd9d22b5 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -190,16 +190,23 @@
 /* FIXME: move this macro to <linux/pci.h> */
 #define PCI_BUS(x) (((x) >> 8) & 0xff)
 
+/* Protection domain flags */
+#define PD_DMA_OPS_MASK		(1UL << 0) /* domain used for dma_ops */
+#define PD_DEFAULT_MASK		(1UL << 1) /* domain is a default dma_ops
+					      domain for an IOMMU */
+
 /*
  * This structure contains generic data for  IOMMU protection domains
  * independent of their use.
  */
 struct protection_domain {
-	spinlock_t lock; /* mostly used to lock the page table*/
-	u16 id;		 /* the domain id written to the device table */
-	int mode;	 /* paging mode (0-6 levels) */
-	u64 *pt_root;	 /* page table root pointer */
-	void *priv;	 /* private data */
+	spinlock_t lock;	/* mostly used to lock the page table*/
+	u16 id;			/* the domain id written to the device table */
+	int mode;		/* paging mode (0-6 levels) */
+	u64 *pt_root;		/* page table root pointer */
+	unsigned long flags;	/* flags to find out type of domain */
+	unsigned dev_cnt;	/* devices assigned to this domain */
+	void *priv;		/* private data */
 };
 
 /*
@@ -295,7 +302,7 @@ struct amd_iommu {
 	bool int_enabled;
 
 	/* if one, we need to send a completion wait command */
-	int need_sync;
+	bool need_sync;
 
 	/* default dma_ops domain for that IOMMU */
 	struct dma_ops_domain *default_dom;
@@ -374,7 +381,7 @@ extern struct protection_domain **amd_iommu_pd_table;
 extern unsigned long *amd_iommu_pd_alloc_bitmap;
 
 /* will be 1 if device isolation is enabled */
-extern int amd_iommu_isolate;
+extern bool amd_iommu_isolate;
 
 /*
  * If true, the addresses will be flushed on unmap time, not when
@@ -382,18 +389,6 @@ extern int amd_iommu_isolate;
  */
 extern bool amd_iommu_unmap_flush;
 
-/* takes a PCI device id and prints it out in a readable form */
-static inline void print_devid(u16 devid, int nl)
-{
-	int bus = devid >> 8;
-	int dev = devid >> 3 & 0x1f;
-	int fn  = devid & 0x07;
-
-	printk("%02x:%02x.%x", bus, dev, fn);
-	if (nl)
-		printk("\n");
-}
-
 /* takes bus and device/function and returns the device id
  * FIXME: should that be in generic PCI code? */
 static inline u16 calc_devid(u8 bus, u8 devfn)
@@ -401,4 +396,32 @@ static inline u16 calc_devid(u8 bus, u8 devfn)
 	return (((u16)bus) << 8) | devfn;
 }
 
+#ifdef CONFIG_AMD_IOMMU_STATS
+
+struct __iommu_counter {
+	char *name;
+	struct dentry *dent;
+	u64 value;
+};
+
+#define DECLARE_STATS_COUNTER(nm) \
+	static struct __iommu_counter nm = {	\
+		.name = #nm,			\
+	}
+
+#define INC_STATS_COUNTER(name)		name.value += 1
+#define ADD_STATS_COUNTER(name, x)	name.value += (x)
+#define SUB_STATS_COUNTER(name, x)	name.value -= (x)
+
+#else /* CONFIG_AMD_IOMMU_STATS */
+
+#define DECLARE_STATS_COUNTER(name)
+#define INC_STATS_COUNTER(name)
+#define ADD_STATS_COUNTER(name, x)
+#define SUB_STATS_COUNTER(name, x)
+
+static inline void amd_iommu_stats_init(void) { }
+
+#endif /* CONFIG_AMD_IOMMU_STATS */
+
 #endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 25caa0738af5..ab1d51a8855e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -54,7 +54,6 @@ extern int disable_apic;
 extern int is_vsmp_box(void);
 extern void xapic_wait_icr_idle(void);
 extern u32 safe_xapic_wait_icr_idle(void);
-extern u64 xapic_icr_read(void);
 extern void xapic_icr_write(u32, u32);
 extern int setup_profiling_timer(unsigned int);
 
@@ -93,7 +92,7 @@ static inline u32 native_apic_msr_read(u32 reg)
 }
 
 #ifndef CONFIG_X86_32
-extern int x2apic, x2apic_preenabled;
+extern int x2apic;
 extern void check_x2apic(void);
 extern void enable_x2apic(void);
 extern void enable_IR_x2apic(void);
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index a2e545c91c35..ca5ffb2856b6 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -90,6 +90,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size);
 
 #endif /* CONFIG_X86_32 */
 
+extern int add_efi_memmap;
 extern void efi_reserve_early(void);
 extern void efi_call_phys_prelog(void);
 extern void efi_call_phys_epilog(void);
diff --git a/arch/x86/include/asm/es7000/apic.h b/arch/x86/include/asm/es7000/apic.h
index 51ac1230294e..bc53d5ef1386 100644
--- a/arch/x86/include/asm/es7000/apic.h
+++ b/arch/x86/include/asm/es7000/apic.h
@@ -157,7 +157,7 @@ cpu_mask_to_apicid_cluster(const struct cpumask *cpumask)
 
 	num_bits_set = cpumask_weight(cpumask);
 	/* Return id to all */
-	if (num_bits_set == NR_CPUS)
+	if (num_bits_set == nr_cpu_ids)
 		return 0xFF;
 	/*
 	 * The cpus in the mask must all be on the apic cluster.  If are not
@@ -190,7 +190,7 @@ static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
 
 	num_bits_set = cpus_weight(*cpumask);
 	/* Return id to all */
-	if (num_bits_set == NR_CPUS)
+	if (num_bits_set == nr_cpu_ids)
 		return cpu_to_logical_apicid(0);
 	/*
 	 * The cpus in the mask must all be on the apic cluster.  If are not
@@ -218,9 +218,6 @@ static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
 static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *inmask,
 						  const struct cpumask *andmask)
 {
-	int num_bits_set;
-	int cpus_found = 0;
-	int cpu;
 	int apicid = cpu_to_logical_apicid(0);
 	cpumask_var_t cpumask;
 
@@ -229,31 +226,8 @@ static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *inmask,
 
 	cpumask_and(cpumask, inmask, andmask);
 	cpumask_and(cpumask, cpumask, cpu_online_mask);
+	apicid = cpu_mask_to_apicid(cpumask);
 
-	num_bits_set = cpumask_weight(cpumask);
-	/* Return id to all */
-	if (num_bits_set == NR_CPUS)
-		goto exit;
-	/*
-	 * The cpus in the mask must all be on the apic cluster.  If are not
-	 * on the same apicid cluster return default value of TARGET_CPUS.
-	 */
-	cpu = cpumask_first(cpumask);
-	apicid = cpu_to_logical_apicid(cpu);
-	while (cpus_found < num_bits_set) {
-		if (cpumask_test_cpu(cpu, cpumask)) {
-			int new_apicid = cpu_to_logical_apicid(cpu);
-			if (apicid_cluster(apicid) !=
-					apicid_cluster(new_apicid)){
-				printk ("%s: Not a valid mask!\n", __func__);
-				return cpu_to_logical_apicid(0);
-			}
-			apicid = new_apicid;
-			cpus_found++;
-		}
-		cpu++;
-	}
-exit:
 	free_cpumask_var(cpumask);
 	return apicid;
 }
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 97215a458e5f..730843d1d2fb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -360,7 +360,7 @@ struct kvm_arch{
 	struct list_head active_mmu_pages;
 	struct list_head assigned_dev_head;
 	struct list_head oos_global_pages;
-	struct dmar_domain *intel_iommu_domain;
+	struct iommu_domain *iommu_domain;
 	struct kvm_pic *vpic;
 	struct kvm_ioapic *vioapic;
 	struct kvm_pit *vpit;
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index d28a507cef39..1caf57628b9c 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -15,7 +15,7 @@
 #define SHARED_SWITCHER_PAGES \
 	DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
 /* Pages for switcher itself, then two pages per cpu */
-#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS)
+#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
 
 /* We map at -4M for ease of mapping into the guest (one PTE page). */
 #define SWITCHER_ADDR 0xFFC00000
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 91885c28f66b..62d14ce3cd00 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -6,13 +6,13 @@
 #include <asm/mpspec_def.h>
 
 extern int apic_version[MAX_APICS];
+extern int pic_mode;
 
 #ifdef CONFIG_X86_32
 #include <mach_mpspec.h>
 
 extern unsigned int def_to_bigsmp;
 extern u8 apicid_2_node[];
-extern int pic_mode;
 
 #ifdef CONFIG_X86_NUMAQ
 extern int mp_bus_id_to_node[MAX_MP_BUSSES];
diff --git a/arch/x86/include/asm/numaq/apic.h b/arch/x86/include/asm/numaq/apic.h
index c80f00d29965..bf37bc49bd8e 100644
--- a/arch/x86/include/asm/numaq/apic.h
+++ b/arch/x86/include/asm/numaq/apic.h
@@ -63,8 +63,8 @@ static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
 extern u8 cpu_2_logical_apicid[];
 static inline int cpu_to_logical_apicid(int cpu)
 {
-       if (cpu >= NR_CPUS)
-	       return BAD_APICID;
+	if (cpu >= nr_cpu_ids)
+		return BAD_APICID;
 	return (int)cpu_2_logical_apicid[cpu];
 }
 
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 66834c41c049..a977de23cb4d 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -102,9 +102,9 @@ extern void pci_iommu_alloc(void);
 
 #ifdef CONFIG_NUMA
 /* Returns the node based on pci bus */
-static inline int __pcibus_to_node(struct pci_bus *bus)
+static inline int __pcibus_to_node(const struct pci_bus *bus)
 {
-	struct pci_sysdata *sd = bus->sysdata;
+	const struct pci_sysdata *sd = bus->sysdata;
 
 	return sd->node;
 }
@@ -113,6 +113,12 @@ static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
 {
 	return node_to_cpumask(__pcibus_to_node(bus));
 }
+
+static inline const struct cpumask *
+cpumask_of_pcibus(const struct pci_bus *bus)
+{
+	return cpumask_of_node(__pcibus_to_node(bus));
+}
 #endif
 
 #endif /* _ASM_X86_PCI_H */
diff --git a/arch/x86/pci/pci.h b/arch/x86/include/asm/pci_x86.h
index 1959018aac02..e60fd3e14bdf 100644
--- a/arch/x86/pci/pci.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -57,7 +57,8 @@ extern struct pci_ops pci_root_ops;
 struct irq_info {
 	u8 bus, devfn;			/* Bus, device and function */
 	struct {
-		u8 link;		/* IRQ line ID, chipset dependent, 0=not routed */
+		u8 link;		/* IRQ line ID, chipset dependent,
+					   0 = not routed */
 		u16 bitmap;		/* Available IRQs */
 	} __attribute__((packed)) irq[4];
 	u8 slot;			/* Slot number, 0=onboard */
@@ -69,11 +70,13 @@ struct irq_routing_table {
 	u16 version;			/* PIRQ_VERSION */
 	u16 size;			/* Table size in bytes */
 	u8 rtr_bus, rtr_devfn;		/* Where the interrupt router lies */
-	u16 exclusive_irqs;		/* IRQs devoted exclusively to PCI usage */
-	u16 rtr_vendor, rtr_device;	/* Vendor and device ID of interrupt router */
+	u16 exclusive_irqs;		/* IRQs devoted exclusively to
+					   PCI usage */
+	u16 rtr_vendor, rtr_device;	/* Vendor and device ID of
+					   interrupt router */
 	u32 miniport_data;		/* Crap */
 	u8 rfu[11];
-	u8 checksum;			/* Modulo 256 checksum must give zero */
+	u8 checksum;			/* Modulo 256 checksum must give 0 */
 	struct irq_info slots[0];
 } __attribute__((packed));
 
@@ -148,15 +151,15 @@ static inline unsigned int mmio_config_readl(void __iomem *pos)
 
 static inline void mmio_config_writeb(void __iomem *pos, u8 val)
 {
-	asm volatile("movb %%al,(%1)" :: "a" (val), "r" (pos) : "memory");
+	asm volatile("movb %%al,(%1)" : : "a" (val), "r" (pos) : "memory");
 }
 
 static inline void mmio_config_writew(void __iomem *pos, u16 val)
 {
-	asm volatile("movw %%ax,(%1)" :: "a" (val), "r" (pos) : "memory");
+	asm volatile("movw %%ax,(%1)" : : "a" (val), "r" (pos) : "memory");
 }
 
 static inline void mmio_config_writel(void __iomem *pos, u32 val)
 {
-	asm volatile("movl %%eax,(%1)" :: "a" (val), "r" (pos) : "memory");
+	asm volatile("movl %%eax,(%1)" : : "a" (val), "r" (pos) : "memory");
 }
diff --git a/arch/x86/include/asm/summit/apic.h b/arch/x86/include/asm/summit/apic.h
index 99327d1be49f..4bb5fb34f030 100644
--- a/arch/x86/include/asm/summit/apic.h
+++ b/arch/x86/include/asm/summit/apic.h
@@ -52,7 +52,7 @@ static inline void init_apic_ldr(void)
 	int i;
 
 	/* Create logical APIC IDs by counting CPUs already in cluster. */
-	for (count = 0, i = NR_CPUS; --i >= 0; ) {
+	for (count = 0, i = nr_cpu_ids; --i >= 0; ) {
 		lid = cpu_2_logical_apicid[i];
 		if (lid != BAD_APICID && apicid_cluster(lid) == my_cluster)
 			++count;
@@ -97,8 +97,8 @@ static inline int apicid_to_node(int logical_apicid)
 static inline int cpu_to_logical_apicid(int cpu)
 {
 #ifdef CONFIG_SMP
-       if (cpu >= NR_CPUS)
-	       return BAD_APICID;
+	if (cpu >= nr_cpu_ids)
+		return BAD_APICID;
 	return (int)cpu_2_logical_apicid[cpu];
 #else
 	return logical_smp_processor_id();
@@ -107,7 +107,7 @@ static inline int cpu_to_logical_apicid(int cpu)
 
 static inline int cpu_present_to_apicid(int mps_cpu)
 {
-	if (mps_cpu < NR_CPUS)
+	if (mps_cpu < nr_cpu_ids)
 		return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
 	else
 		return BAD_APICID;
@@ -146,7 +146,7 @@ static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
 
 	num_bits_set = cpus_weight(*cpumask);
 	/* Return id to all */
-	if (num_bits_set == NR_CPUS)
+	if (num_bits_set >= nr_cpu_ids)
 		return (int) 0xFF;
 	/*
 	 * The cpus in the mask must all be on the apic cluster.  If are not
@@ -173,42 +173,16 @@ static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
 static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *inmask,
 						  const struct cpumask *andmask)
 {
-	int num_bits_set;
-	int cpus_found = 0;
-	int cpu;
-	int apicid = 0xFF;
+	int apicid = cpu_to_logical_apicid(0);
 	cpumask_var_t cpumask;
 
 	if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
-		return (int) 0xFF;
+		return apicid;
 
 	cpumask_and(cpumask, inmask, andmask);
 	cpumask_and(cpumask, cpumask, cpu_online_mask);
+	apicid = cpu_mask_to_apicid(cpumask);
 
-	num_bits_set = cpumask_weight(cpumask);
-	/* Return id to all */
-	if (num_bits_set == nr_cpu_ids)
-		goto exit;
-	/*
-	 * The cpus in the mask must all be on the apic cluster.  If are not
-	 * on the same apicid cluster return default value of TARGET_CPUS.
-	 */
-	cpu = cpumask_first(cpumask);
-	apicid = cpu_to_logical_apicid(cpu);
-	while (cpus_found < num_bits_set) {
-		if (cpumask_test_cpu(cpu, cpumask)) {
-			int new_apicid = cpu_to_logical_apicid(cpu);
-			if (apicid_cluster(apicid) !=
-					apicid_cluster(new_apicid)){
-				printk ("%s: Not a valid mask!\n", __func__);
-				return 0xFF;
-			}
-			apicid = apicid | new_apicid;
-			cpus_found++;
-		}
-		cpu++;
-	}
-exit:
 	free_cpumask_var(cpumask);
 	return apicid;
 }
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
new file mode 100644
index 000000000000..ffb08be2a530
--- /dev/null
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -0,0 +1,101 @@
+/*
+ * sys_ia32.h - Linux ia32 syscall interfaces
+ *
+ * Copyright (c) 2008 Jaswinder Singh Rajput
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#ifndef _ASM_X86_SYS_IA32_H
+#define _ASM_X86_SYS_IA32_H
+
+#include <linux/compiler.h>
+#include <linux/linkage.h>
+#include <linux/types.h>
+#include <linux/signal.h>
+#include <asm/compat.h>
+#include <asm/ia32.h>
+
+/* ia32/sys_ia32.c */
+asmlinkage long sys32_truncate64(char __user *, unsigned long, unsigned long);
+asmlinkage long sys32_ftruncate64(unsigned int, unsigned long, unsigned long);
+
+asmlinkage long sys32_stat64(char __user *, struct stat64 __user *);
+asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *);
+asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *);
+asmlinkage long sys32_fstatat(unsigned int, char __user *,
+			      struct stat64 __user *, int);
+struct mmap_arg_struct;
+asmlinkage long sys32_mmap(struct mmap_arg_struct __user *);
+asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long);
+
+asmlinkage long sys32_pipe(int __user *);
+struct sigaction32;
+struct old_sigaction32;
+asmlinkage long sys32_rt_sigaction(int, struct sigaction32 __user *,
+				   struct sigaction32 __user *, unsigned int);
+asmlinkage long sys32_sigaction(int, struct old_sigaction32 __user *,
+				struct old_sigaction32 __user *);
+asmlinkage long sys32_rt_sigprocmask(int, compat_sigset_t __user *,
+				     compat_sigset_t __user *, unsigned int);
+asmlinkage long sys32_alarm(unsigned int);
+
+struct sel_arg_struct;
+asmlinkage long sys32_old_select(struct sel_arg_struct __user *);
+asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int);
+asmlinkage long sys32_sysfs(int, u32, u32);
+
+asmlinkage long sys32_sched_rr_get_interval(compat_pid_t,
+					    struct compat_timespec __user *);
+asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t);
+asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *);
+
+#ifdef CONFIG_SYSCTL_SYSCALL
+struct sysctl_ia32;
+asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *);
+#endif
+
+asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32);
+asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32);
+
+asmlinkage long sys32_personality(unsigned long);
+asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32);
+
+asmlinkage long sys32_mmap2(unsigned long, unsigned long, unsigned long,
+			    unsigned long, unsigned long, unsigned long);
+
+struct oldold_utsname;
+struct old_utsname;
+asmlinkage long sys32_olduname(struct oldold_utsname __user *);
+long sys32_uname(struct old_utsname __user *);
+
+long sys32_ustat(unsigned, struct ustat32 __user *);
+
+asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *,
+			     compat_uptr_t __user *, struct pt_regs *);
+asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *);
+
+long sys32_lseek(unsigned int, int, unsigned int);
+long sys32_kill(int, int);
+long sys32_fadvise64_64(int, __u32, __u32, __u32, __u32, int);
+long sys32_vm86_warning(void);
+long sys32_lookup_dcookie(u32, u32, char __user *, size_t);
+
+asmlinkage ssize_t sys32_readahead(int, unsigned, unsigned, size_t);
+asmlinkage long sys32_sync_file_range(int, unsigned, unsigned,
+				      unsigned, unsigned, int);
+asmlinkage long sys32_fadvise64(int, unsigned, unsigned, size_t, int);
+asmlinkage long sys32_fallocate(int, int, unsigned,
+				unsigned, unsigned, unsigned);
+
+/* ia32/ia32_signal.c */
+asmlinkage long sys32_sigsuspend(int, int, old_sigset_t);
+asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *,
+				  stack_ia32_t __user *, struct pt_regs *);
+asmlinkage long sys32_sigreturn(struct pt_regs *);
+asmlinkage long sys32_rt_sigreturn(struct pt_regs *);
+
+/* ia32/ipc32.c */
+asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32);
+#endif /* _ASM_X86_SYS_IA32_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 79e31e9dcdda..4e2f2e0aab27 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -61,13 +61,19 @@ static inline int cpu_to_node(int cpu)
  *
  * Side note: this function creates the returned cpumask on the stack
  * so with a high NR_CPUS count, excessive stack space is used.  The
- * node_to_cpumask_ptr function should be used whenever possible.
+ * cpumask_of_node function should be used whenever possible.
  */
 static inline cpumask_t node_to_cpumask(int node)
 {
 	return node_to_cpumask_map[node];
 }
 
+/* Returns a pointer to the cpumask of CPUs on Node 'node' (no copy made). */
+static inline const struct cpumask *cpumask_of_node(int node)
+{
+	return &node_to_cpumask_map[node];
+}
+
 #else /* CONFIG_X86_64 */
 
 /* Mappings between node number and cpus on that node. */
@@ -82,7 +88,7 @@ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
 extern int cpu_to_node(int cpu);
 extern int early_cpu_to_node(int cpu);
-extern const cpumask_t *_node_to_cpumask_ptr(int node);
+extern const cpumask_t *cpumask_of_node(int node);
 extern cpumask_t node_to_cpumask(int node);
 
 #else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
@@ -103,7 +109,7 @@ static inline int early_cpu_to_node(int cpu)
 }
 
 /* Returns a pointer to the cpumask of CPUs on Node 'node'. */
-static inline const cpumask_t *_node_to_cpumask_ptr(int node)
+static inline const cpumask_t *cpumask_of_node(int node)
 {
 	return &node_to_cpumask_map[node];
 }
@@ -116,12 +122,15 @@ static inline cpumask_t node_to_cpumask(int node)
 
 #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
 
-/* Replace default node_to_cpumask_ptr with optimized version */
+/*
+ * Replace default node_to_cpumask_ptr with optimized version
+ * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
+ */
 #define node_to_cpumask_ptr(v, node)		\
-		const cpumask_t *v = _node_to_cpumask_ptr(node)
+		const cpumask_t *v = cpumask_of_node(node)
 
 #define node_to_cpumask_ptr_next(v, node)	\
-			   v = _node_to_cpumask_ptr(node)
+			   v = cpumask_of_node(node)
 
 #endif /* CONFIG_X86_64 */
 
@@ -187,7 +196,7 @@ extern int __node_distance(int, int);
 #define	cpu_to_node(cpu)	0
 #define	early_cpu_to_node(cpu)	0
 
-static inline const cpumask_t *_node_to_cpumask_ptr(int node)
+static inline const cpumask_t *cpumask_of_node(int node)
 {
 	return &cpu_online_map;
 }
@@ -200,12 +209,15 @@ static inline int node_to_first_cpu(int node)
 	return first_cpu(cpu_online_map);
 }
 
-/* Replace default node_to_cpumask_ptr with optimized version */
+/*
+ * Replace default node_to_cpumask_ptr with optimized version
+ * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
+ */
 #define node_to_cpumask_ptr(v, node)		\
-		const cpumask_t *v = _node_to_cpumask_ptr(node)
+		const cpumask_t *v = cpumask_of_node(node)
 
 #define node_to_cpumask_ptr_next(v, node)	\
-			   v = _node_to_cpumask_ptr(node)
+			   v = cpumask_of_node(node)
 #endif
 
 #include <asm-generic/topology.h>
@@ -214,12 +226,12 @@ static inline int node_to_first_cpu(int node)
 /* Returns the number of the first CPU on Node 'node'. */
 static inline int node_to_first_cpu(int node)
 {
-	node_to_cpumask_ptr(mask, node);
-	return first_cpu(*mask);
+	return cpumask_first(cpumask_of_node(node));
 }
 #endif
 
 extern cpumask_t cpu_coregroup_map(int cpu);
+extern const struct cpumask *cpu_coregroup_mask(int cpu);
 
 #ifdef ENABLE_TOPO_DEFINES
 #define topology_physical_package_id(cpu)	(cpu_data(cpu).phys_proc_id)
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index e2363253bbbf..50423c7b56b2 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -133,61 +133,61 @@ struct bau_msg_payload {
  * see table 4.2.3.0.1 in broacast_assist spec.
  */
 struct bau_msg_header {
-	int dest_subnodeid:6;	/* must be zero */
+	unsigned int dest_subnodeid:6;	/* must be zero */
 	/* bits 5:0 */
-	int base_dest_nodeid:15; /* nasid>>1 (pnode) of first bit in node_map */
-	/* bits 20:6 */
-	int command:8;		/* message type */
+	unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */
+	/* bits 20:6 */			  /* first bit in node_map */
+	unsigned int command:8;	/* message type */
 	/* bits 28:21 */
 				/* 0x38: SN3net EndPoint Message */
-	int rsvd_1:3;		/* must be zero */
+	unsigned int rsvd_1:3;	/* must be zero */
 	/* bits 31:29 */
 				/* int will align on 32 bits */
-	int rsvd_2:9;		/* must be zero */
+	unsigned int rsvd_2:9;	/* must be zero */
 	/* bits 40:32 */
 				/* Suppl_A is 56-41 */
-	int payload_2a:8;	/* becomes byte 16 of msg */
+	unsigned int payload_2a:8;/* becomes byte 16 of msg */
 	/* bits 48:41 */	/* not currently using */
-	int payload_2b:8;	/* becomes byte 17 of msg */
+	unsigned int payload_2b:8;/* becomes byte 17 of msg */
 	/* bits 56:49 */	/* not currently using */
 				/* Address field (96:57) is never used as an
 				   address (these are address bits 42:3) */
-	int rsvd_3:1;		/* must be zero */
+	unsigned int rsvd_3:1;	/* must be zero */
 	/* bit 57 */
 				/* address bits 27:4 are payload */
 				/* these 24 bits become bytes 12-14 of msg */
-	int replied_to:1;	/* sent as 0 by the source to byte 12 */
+	unsigned int replied_to:1;/* sent as 0 by the source to byte 12 */
 	/* bit 58 */
 
-	int payload_1a:5;	/* not currently used */
+	unsigned int payload_1a:5;/* not currently used */
 	/* bits 63:59 */
-	int payload_1b:8;	/* not currently used */
+	unsigned int payload_1b:8;/* not currently used */
 	/* bits 71:64 */
-	int payload_1c:8;	/* not currently used */
+	unsigned int payload_1c:8;/* not currently used */
 	/* bits 79:72 */
-	int payload_1d:2;	/* not currently used */
+	unsigned int payload_1d:2;/* not currently used */
 	/* bits 81:80 */
 
-	int rsvd_4:7;		/* must be zero */
+	unsigned int rsvd_4:7;	/* must be zero */
 	/* bits 88:82 */
-	int sw_ack_flag:1;	/* software acknowledge flag */
+	unsigned int sw_ack_flag:1;/* software acknowledge flag */
 	/* bit 89 */
 				/* INTD trasactions at destination are to
 				   wait for software acknowledge */
-	int rsvd_5:6;		/* must be zero */
+	unsigned int rsvd_5:6;	/* must be zero */
 	/* bits 95:90 */
-	int rsvd_6:5;		/* must be zero */
+	unsigned int rsvd_6:5;	/* must be zero */
 	/* bits 100:96 */
-	int int_both:1;		/* if 1, interrupt both sockets on the blade */
+	unsigned int int_both:1;/* if 1, interrupt both sockets on the blade */
 	/* bit 101*/
-	int fairness:3;		/* usually zero */
+	unsigned int fairness:3;/* usually zero */
 	/* bits 104:102 */
-	int multilevel:1;	/* multi-level multicast format */
+	unsigned int multilevel:1;	/* multi-level multicast format */
 	/* bit 105 */
 				/* 0 for TLB: endpoint multi-unicast messages */
-	int chaining:1;		/* next descriptor is part of this activation*/
+	unsigned int chaining:1;/* next descriptor is part of this activation*/
 	/* bit 106 */
-	int rsvd_7:21;		/* must be zero */
+	unsigned int rsvd_7:21;	/* must be zero */
 	/* bits 127:107 */
 };
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 65d0b72777ea..29dc0c89d4af 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -538,9 +538,10 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
 	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
 	union acpi_object *obj;
 	struct acpi_madt_local_apic *lapic;
-	cpumask_t tmp_map, new_map;
+	cpumask_var_t tmp_map, new_map;
 	u8 physid;
 	int cpu;
+	int retval = -ENOMEM;
 
 	if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
 		return -EINVAL;
@@ -569,23 +570,37 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
 	buffer.length = ACPI_ALLOCATE_BUFFER;
 	buffer.pointer = NULL;
 
-	tmp_map = cpu_present_map;
+	if (!alloc_cpumask_var(&tmp_map, GFP_KERNEL))
+		goto out;
+
+	if (!alloc_cpumask_var(&new_map, GFP_KERNEL))
+		goto free_tmp_map;
+
+	cpumask_copy(tmp_map, cpu_present_mask);
 	acpi_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED);
 
 	/*
 	 * If mp_register_lapic successfully generates a new logical cpu
 	 * number, then the following will get us exactly what was mapped
 	 */
-	cpus_andnot(new_map, cpu_present_map, tmp_map);
-	if (cpus_empty(new_map)) {
+	cpumask_andnot(new_map, cpu_present_mask, tmp_map);
+	if (cpumask_empty(new_map)) {
 		printk ("Unable to map lapic to logical cpu number\n");
-		return -EINVAL;
+		retval = -EINVAL;
+		goto free_new_map;
 	}
 
-	cpu = first_cpu(new_map);
+	cpu = cpumask_first(new_map);
 
 	*pcpu = cpu;
-	return 0;
+	retval = 0;
+
+free_new_map:
+	free_cpumask_var(new_map);
+free_tmp_map:
+	free_cpumask_var(tmp_map);
+out:
+	return retval;
 }
 
 /* wrapper to silence section mismatch warning */
@@ -598,7 +613,7 @@ EXPORT_SYMBOL(acpi_map_lsapic);
 int acpi_unmap_lsapic(int cpu)
 {
 	per_cpu(x86_cpu_to_apicid, cpu) = -1;
-	cpu_clear(cpu, cpu_present_map);
+	set_cpu_present(cpu, false);
 	num_processors--;
 
 	return (0);
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 2e2da717b350..5113c080f0c4 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -20,8 +20,12 @@
 #include <linux/pci.h>
 #include <linux/gfp.h>
 #include <linux/bitops.h>
+#include <linux/debugfs.h>
 #include <linux/scatterlist.h>
 #include <linux/iommu-helper.h>
+#ifdef CONFIG_IOMMU_API
+#include <linux/iommu.h>
+#endif
 #include <asm/proto.h>
 #include <asm/iommu.h>
 #include <asm/gart.h>
@@ -38,6 +42,10 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock);
 static LIST_HEAD(iommu_pd_list);
 static DEFINE_SPINLOCK(iommu_pd_list_lock);
 
+#ifdef CONFIG_IOMMU_API
+static struct iommu_ops amd_iommu_ops;
+#endif
+
 /*
  * general struct to manage commands send to an IOMMU
  */
@@ -47,6 +55,68 @@ struct iommu_cmd {
 
 static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 			     struct unity_map_entry *e);
+static struct dma_ops_domain *find_protection_domain(u16 devid);
+
+
+#ifdef CONFIG_AMD_IOMMU_STATS
+
+/*
+ * Counters and debugfs entries used for statistics collection
+ */
+
+DECLARE_STATS_COUNTER(compl_wait);
+DECLARE_STATS_COUNTER(cnt_map_single);
+DECLARE_STATS_COUNTER(cnt_unmap_single);
+DECLARE_STATS_COUNTER(cnt_map_sg);
+DECLARE_STATS_COUNTER(cnt_unmap_sg);
+DECLARE_STATS_COUNTER(cnt_alloc_coherent);
+DECLARE_STATS_COUNTER(cnt_free_coherent);
+DECLARE_STATS_COUNTER(cross_page);
+DECLARE_STATS_COUNTER(domain_flush_single);
+DECLARE_STATS_COUNTER(domain_flush_all);
+DECLARE_STATS_COUNTER(alloced_io_mem);
+DECLARE_STATS_COUNTER(total_map_requests);
+
+static struct dentry *stats_dir;
+static struct dentry *de_isolate;
+static struct dentry *de_fflush;
+
+static void amd_iommu_stats_add(struct __iommu_counter *cnt)
+{
+	if (stats_dir == NULL)	/* no stats directory: nothing to register */
+		return;
+	/* export cnt->value as a read-only (0444) u64 file named cnt->name */
+	cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir,
+				       &cnt->value);
+}
+
+static void amd_iommu_stats_init(void)
+{
+	stats_dir = debugfs_create_dir("amd-iommu", NULL);
+	if (stats_dir == NULL)
+		return;
+	/* NOTE(review): the u32 casts assume 32-bit flag variables - confirm */
+	de_isolate = debugfs_create_bool("isolation", 0444, stats_dir,
+					 (u32 *)&amd_iommu_isolate);
+
+	de_fflush  = debugfs_create_bool("fullflush", 0444, stats_dir,
+					 (u32 *)&amd_iommu_unmap_flush);
+	/* register the individual event counters below the same directory */
+	amd_iommu_stats_add(&compl_wait);
+	amd_iommu_stats_add(&cnt_map_single);
+	amd_iommu_stats_add(&cnt_unmap_single);
+	amd_iommu_stats_add(&cnt_map_sg);
+	amd_iommu_stats_add(&cnt_unmap_sg);
+	amd_iommu_stats_add(&cnt_alloc_coherent);
+	amd_iommu_stats_add(&cnt_free_coherent);
+	amd_iommu_stats_add(&cross_page);
+	amd_iommu_stats_add(&domain_flush_single);
+	amd_iommu_stats_add(&domain_flush_all);
+	amd_iommu_stats_add(&alloced_io_mem);
+	amd_iommu_stats_add(&total_map_requests);
+}
+
+#endif
 
 /* returns !0 if the IOMMU is caching non-present entries in its TLB */
 static int iommu_has_npcache(struct amd_iommu *iommu)
@@ -189,13 +259,55 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
 	spin_lock_irqsave(&iommu->lock, flags);
 	ret = __iommu_queue_command(iommu, cmd);
 	if (!ret)
-		iommu->need_sync = 1;
+		iommu->need_sync = true;
 	spin_unlock_irqrestore(&iommu->lock, flags);
 
 	return ret;
 }
 
 /*
+ * This function waits until an IOMMU has completed a completion
+ * wait command
+ */
+static void __iommu_wait_for_completion(struct amd_iommu *iommu)
+{
+	int ready = 0;
+	unsigned status = 0;
+	unsigned long i = 0;
+
+	INC_STATS_COUNTER(compl_wait);
+	/* poll the status register at most EXIT_LOOP_COUNT times */
+	while (!ready && (i < EXIT_LOOP_COUNT)) {
+		++i;
+		/* wait for the bit to become one */
+		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+		ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
+	}
+
+	/* set bit back to zero */
+	status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
+	writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
+	/* test the bit, not i: it may become set on the very last poll */
+	if (unlikely(!ready))
+		panic("AMD IOMMU: Completion wait loop failed\n");
+}
+
+/*
+ * Queues a completion wait command into the command buffer of an IOMMU and
+ * returns the result of __iommu_queue_command(); it does not wait itself
+ */
+static int __iommu_completion_wait(struct amd_iommu *iommu)
+{
+	struct iommu_cmd cmd;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
+	CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
+
+	return __iommu_queue_command(iommu, &cmd);
+}
+
+/*
  * This function is called whenever we need to ensure that the IOMMU has
  * completed execution of all commands we sent. It sends a
  * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
@@ -204,40 +316,22 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
  */
 static int iommu_completion_wait(struct amd_iommu *iommu)
 {
-	int ret = 0, ready = 0;
-	unsigned status = 0;
-	struct iommu_cmd cmd;
-	unsigned long flags, i = 0;
-
-	memset(&cmd, 0, sizeof(cmd));
-	cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
-	CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
+	int ret = 0;
+	unsigned long flags;
 
 	spin_lock_irqsave(&iommu->lock, flags);
 
 	if (!iommu->need_sync)
 		goto out;
 
-	iommu->need_sync = 0;
+	ret = __iommu_completion_wait(iommu);
 
-	ret = __iommu_queue_command(iommu, &cmd);
+	iommu->need_sync = false;
 
 	if (ret)
 		goto out;
 
-	while (!ready && (i < EXIT_LOOP_COUNT)) {
-		++i;
-		/* wait for the bit to become one */
-		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
-		ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
-	}
-
-	/* set bit back to zero */
-	status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
-	writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
-
-	if (unlikely(i == EXIT_LOOP_COUNT))
-		panic("AMD IOMMU: Completion wait loop failed\n");
+	__iommu_wait_for_completion(iommu);
 
 out:
 	spin_unlock_irqrestore(&iommu->lock, flags);
@@ -264,6 +358,21 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
 	return ret;
 }
 
+static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
+					  u16 domid, int pde, int s)
+{
+	memset(cmd, 0, sizeof(*cmd));	/* only builds *cmd, queues nothing */
+	address &= PAGE_MASK;
+	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
+	cmd->data[1] |= domid;
+	cmd->data[2] = lower_32_bits(address);
+	cmd->data[3] = upper_32_bits(address);
+	if (s) /* size bit - we flush more than one 4kb page */
+		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+	if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
+		cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+}
+
 /*
  * Generic command send function for invalidaing TLB entries
  */
@@ -273,16 +382,7 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
 	struct iommu_cmd cmd;
 	int ret;
 
-	memset(&cmd, 0, sizeof(cmd));
-	address &= PAGE_MASK;
-	CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
-	cmd.data[1] |= domid;
-	cmd.data[2] = lower_32_bits(address);
-	cmd.data[3] = upper_32_bits(address);
-	if (s) /* size bit - we flush more than one 4kb page */
-		cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
-	if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
-		cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+	__iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s);
 
 	ret = iommu_queue_command(iommu, &cmd);
 
@@ -321,9 +421,35 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
 {
 	u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
 
+	INC_STATS_COUNTER(domain_flush_single);
+
 	iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
 }
 
+/*
+ * This function is used to flush the IO/TLB for a given protection domain
+ * on every IOMMU in the system
+ */
+static void iommu_flush_domain(u16 domid)
+{
+	unsigned long flags;
+	struct amd_iommu *iommu;
+	struct iommu_cmd cmd;
+
+	INC_STATS_COUNTER(domain_flush_all);
+	/* built once with pde=1 and s=1, then queued to each IOMMU in turn */
+	__iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
+				      domid, 1, 1);
+	/* NOTE(review): return values of the queueing calls are ignored */
+	list_for_each_entry(iommu, &amd_iommu_list, list) {
+		spin_lock_irqsave(&iommu->lock, flags);
+		__iommu_queue_command(iommu, &cmd);
+		__iommu_completion_wait(iommu);
+		__iommu_wait_for_completion(iommu);
+		spin_unlock_irqrestore(&iommu->lock, flags);
+	}
+}
+
 /****************************************************************************
  *
  * The functions below are used the create the page table mappings for
@@ -338,10 +464,10 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
  * supporting all features of AMD IOMMU page tables like level skipping
  * and full 64 bit address spaces.
  */
-static int iommu_map(struct protection_domain *dom,
-		     unsigned long bus_addr,
-		     unsigned long phys_addr,
-		     int prot)
+static int iommu_map_page(struct protection_domain *dom,
+			  unsigned long bus_addr,
+			  unsigned long phys_addr,
+			  int prot)
 {
 	u64 __pte, *pte, *page;
 
@@ -388,6 +514,28 @@ static int iommu_map(struct protection_domain *dom,
 	return 0;
 }
 
+static void iommu_unmap_page(struct protection_domain *dom,
+			     unsigned long bus_addr)
+{
+	u64 *pte;
+	/* walk L2 -> L1 -> L0; nothing to do if a level is not present */
+	pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
+
+	if (!IOMMU_PTE_PRESENT(*pte))
+		return;
+
+	pte = IOMMU_PTE_PAGE(*pte);
+	pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
+
+	if (!IOMMU_PTE_PRESENT(*pte))
+		return;
+
+	pte = IOMMU_PTE_PAGE(*pte);
+	pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
+	/* clear the leaf entry; no IO/TLB flush is issued from here */
+	*pte = 0;
+}
+
 /*
  * This function checks if a specific unity mapping entry is needed for
  * this specific IOMMU.
@@ -440,7 +588,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 
 	for (addr = e->address_start; addr < e->address_end;
 	     addr += PAGE_SIZE) {
-		ret = iommu_map(&dma_dom->domain, addr, addr, e->prot);
+		ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot);
 		if (ret)
 			return ret;
 		/*
@@ -571,6 +719,16 @@ static u16 domain_id_alloc(void)
 	return id;
 }
 
+static void domain_id_free(int id)
+{
+	unsigned long flags;
+	/* ids outside 1..MAX_DOMAIN_ID-1 are silently ignored */
+	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+	if (id > 0 && id < MAX_DOMAIN_ID)
+		__clear_bit(id, amd_iommu_pd_alloc_bitmap);
+	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+}
+
 /*
  * Used to reserve address ranges in the aperture (e.g. for exclusion
  * ranges.
@@ -587,12 +745,12 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
 	iommu_area_reserve(dom->bitmap, start_page, pages);
 }
 
-static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
+static void free_pagetable(struct protection_domain *domain)
 {
 	int i, j;
 	u64 *p1, *p2, *p3;
 
-	p1 = dma_dom->domain.pt_root;
+	p1 = domain->pt_root;
 
 	if (!p1)
 		return;
@@ -613,6 +771,8 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
 	}
 
 	free_page((unsigned long)p1);
+
+	domain->pt_root = NULL;
 }
 
 /*
@@ -624,7 +784,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
 	if (!dom)
 		return;
 
-	dma_ops_free_pagetable(dom);
+	free_pagetable(&dom->domain);
 
 	kfree(dom->pte_pages);
 
@@ -663,6 +823,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 		goto free_dma_dom;
 	dma_dom->domain.mode = PAGE_MODE_3_LEVEL;
 	dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
+	dma_dom->domain.flags = PD_DMA_OPS_MASK;
 	dma_dom->domain.priv = dma_dom;
 	if (!dma_dom->domain.pt_root)
 		goto free_dma_dom;
@@ -725,6 +886,15 @@ free_dma_dom:
 }
 
 /*
+ * Little helper: returns true if the given protection domain has the
+ * PD_DMA_OPS_MASK flag set, i.e. it is a dma_ops domain
+ */
+static bool dma_ops_domain(struct protection_domain *domain)
+{
+	return domain->flags & PD_DMA_OPS_MASK;
+}
+
+/*
  * Find out the protection domain structure for a given PCI device. This
  * will give us the pointer to the page table root for example.
  */
@@ -744,14 +914,15 @@ static struct protection_domain *domain_for_device(u16 devid)
  * If a device is not yet associated with a domain, this function does
  * assigns it visible for the hardware
  */
-static void set_device_domain(struct amd_iommu *iommu,
-			      struct protection_domain *domain,
-			      u16 devid)
+static void attach_device(struct amd_iommu *iommu,
+			  struct protection_domain *domain,
+			  u16 devid)
 {
 	unsigned long flags;
-
 	u64 pte_root = virt_to_phys(domain->pt_root);
 
+	domain->dev_cnt += 1;
+
 	pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
 		    << DEV_ENTRY_MODE_SHIFT;
 	pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
@@ -767,6 +938,116 @@ static void set_device_domain(struct amd_iommu *iommu,
 	iommu_queue_inv_dev_entry(iommu, devid);
 }
 
+/*
+ * Removes a device from a protection domain. Takes domain->lock itself but
+ * not amd_iommu_devtable_lock; detach_device() and cleanup_domain() take
+ * that lock before calling
+ */
+static void __detach_device(struct protection_domain *domain, u16 devid)
+{
+	/* lock domain */
+	spin_lock(&domain->lock);
+
+	/* remove domain from the lookup table */
+	amd_iommu_pd_table[devid] = NULL;
+
+	/* remove entry from the device table seen by the hardware */
+	amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
+	amd_iommu_dev_table[devid].data[1] = 0;
+	amd_iommu_dev_table[devid].data[2] = 0;
+
+	/* decrease reference counter */
+	domain->dev_cnt -= 1;
+
+	/* ready */
+	spin_unlock(&domain->lock);
+}
+
+/*
+ * Removes a device from a protection domain; takes the devtable lock itself
+ */
+static void detach_device(struct protection_domain *domain, u16 devid)
+{
+	unsigned long flags;
+
+	/* lock device table */
+	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+	__detach_device(domain, devid);
+	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+}
+
+static int device_change_notifier(struct notifier_block *nb,
+				  unsigned long action, void *data)
+{
+	struct device *dev = data;
+	struct pci_dev *pdev = to_pci_dev(dev);
+	u16 devid = calc_devid(pdev->bus->number, pdev->devfn);
+	struct protection_domain *domain;
+	struct dma_ops_domain *dma_domain;
+	struct amd_iommu *iommu;
+	int order = amd_iommu_aperture_order;
+	unsigned long flags;
+	/* attach/detach the device or preallocate its domain; returns 0 */
+	if (devid > amd_iommu_last_bdf)
+		goto out;
+
+	devid = amd_iommu_alias_table[devid];
+	/* devices without an entry in the rlookup table are ignored */
+	iommu = amd_iommu_rlookup_table[devid];
+	if (iommu == NULL)
+		goto out;
+
+	domain = domain_for_device(devid);
+
+	if (domain && !dma_ops_domain(domain))
+		WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound "
+			  "to a non-dma-ops domain\n", dev_name(dev));
+
+	switch (action) {
+	case BUS_NOTIFY_BOUND_DRIVER:
+		if (domain)
+			goto out;
+		dma_domain = find_protection_domain(devid);
+		if (!dma_domain)
+			dma_domain = iommu->default_dom;
+		attach_device(iommu, &dma_domain->domain, devid);
+		printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
+		       "device %s\n", dma_domain->domain.id, dev_name(dev));
+		break;
+	case BUS_NOTIFY_UNBIND_DRIVER:
+		if (!domain)
+			goto out;
+		detach_device(domain, devid);
+		break;
+	case BUS_NOTIFY_ADD_DEVICE:
+		/* allocate a protection domain if a device is added */
+		dma_domain = find_protection_domain(devid);
+		if (dma_domain)
+			goto out;
+		dma_domain = dma_ops_domain_alloc(iommu, order);
+		if (!dma_domain)
+			goto out;
+		dma_domain->target_dev = devid;
+
+		spin_lock_irqsave(&iommu_pd_list_lock, flags);
+		list_add_tail(&dma_domain->list, &iommu_pd_list);
+		spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
+
+		break;
+	default:
+		goto out;
+	}
+	/* reached via break only: invalidate the device entry and wait */
+	iommu_queue_inv_dev_entry(iommu, devid);
+	iommu_completion_wait(iommu);
+
+out:
+	return 0;
+}
+
+struct notifier_block device_nb = {
+	.notifier_call = device_change_notifier, /* bus event callback */
+};
+
 /*****************************************************************************
  *
  * The next functions belong to the dma_ops mapping/unmapping code.
@@ -802,7 +1083,6 @@ static struct dma_ops_domain *find_protection_domain(u16 devid)
 	list_for_each_entry(entry, &iommu_pd_list, list) {
 		if (entry->target_dev == devid) {
 			ret = entry;
-			list_del(&ret->list);
 			break;
 		}
 	}
@@ -853,14 +1133,13 @@ static int get_device_resources(struct device *dev,
 		if (!dma_dom)
 			dma_dom = (*iommu)->default_dom;
 		*domain = &dma_dom->domain;
-		set_device_domain(*iommu, *domain, *bdf);
+		attach_device(*iommu, *domain, *bdf);
 		printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
-				"device ", (*domain)->id);
-		print_devid(_bdf, 1);
+				"device %s\n", (*domain)->id, dev_name(dev));
 	}
 
 	if (domain_for_device(_bdf) == NULL)
-		set_device_domain(*iommu, *domain, _bdf);
+		attach_device(*iommu, *domain, _bdf);
 
 	return 1;
 }
@@ -946,6 +1225,11 @@ static dma_addr_t __map_single(struct device *dev,
 	pages = iommu_num_pages(paddr, size, PAGE_SIZE);
 	paddr &= PAGE_MASK;
 
+	INC_STATS_COUNTER(total_map_requests);
+
+	if (pages > 1)
+		INC_STATS_COUNTER(cross_page);
+
 	if (align)
 		align_mask = (1UL << get_order(size)) - 1;
 
@@ -962,6 +1246,8 @@ static dma_addr_t __map_single(struct device *dev,
 	}
 	address += offset;
 
+	ADD_STATS_COUNTER(alloced_io_mem, size);
+
 	if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
 		iommu_flush_tlb(iommu, dma_dom->domain.id);
 		dma_dom->need_flush = false;
@@ -998,6 +1284,8 @@ static void __unmap_single(struct amd_iommu *iommu,
 		start += PAGE_SIZE;
 	}
 
+	SUB_STATS_COUNTER(alloced_io_mem, size);
+
 	dma_ops_free_addresses(dma_dom, dma_addr, pages);
 
 	if (amd_iommu_unmap_flush || dma_dom->need_flush) {
@@ -1019,6 +1307,8 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
 	dma_addr_t addr;
 	u64 dma_mask;
 
+	INC_STATS_COUNTER(cnt_map_single);
+
 	if (!check_device(dev))
 		return bad_dma_address;
 
@@ -1030,6 +1320,9 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
 		/* device not handled by any AMD IOMMU */
 		return (dma_addr_t)paddr;
 
+	if (!dma_ops_domain(domain))
+		return bad_dma_address;
+
 	spin_lock_irqsave(&domain->lock, flags);
 	addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
 			    dma_mask);
@@ -1055,11 +1348,16 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
 	struct protection_domain *domain;
 	u16 devid;
 
+	INC_STATS_COUNTER(cnt_unmap_single);
+
 	if (!check_device(dev) ||
 	    !get_device_resources(dev, &iommu, &domain, &devid))
 		/* device not handled by any AMD IOMMU */
 		return;
 
+	if (!dma_ops_domain(domain))
+		return;
+
 	spin_lock_irqsave(&domain->lock, flags);
 
 	__unmap_single(iommu, domain->priv, dma_addr, size, dir);
@@ -1104,6 +1402,8 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
 	int mapped_elems = 0;
 	u64 dma_mask;
 
+	INC_STATS_COUNTER(cnt_map_sg);
+
 	if (!check_device(dev))
 		return 0;
 
@@ -1114,6 +1414,9 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
 	if (!iommu || !domain)
 		return map_sg_no_iommu(dev, sglist, nelems, dir);
 
+	if (!dma_ops_domain(domain))
+		return 0;
+
 	spin_lock_irqsave(&domain->lock, flags);
 
 	for_each_sg(sglist, s, nelems, i) {
@@ -1163,10 +1466,15 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
 	u16 devid;
 	int i;
 
+	INC_STATS_COUNTER(cnt_unmap_sg);
+
 	if (!check_device(dev) ||
 	    !get_device_resources(dev, &iommu, &domain, &devid))
 		return;
 
+	if (!dma_ops_domain(domain))
+		return;
+
 	spin_lock_irqsave(&domain->lock, flags);
 
 	for_each_sg(sglist, s, nelems, i) {
@@ -1194,6 +1502,8 @@ static void *alloc_coherent(struct device *dev, size_t size,
 	phys_addr_t paddr;
 	u64 dma_mask = dev->coherent_dma_mask;
 
+	INC_STATS_COUNTER(cnt_alloc_coherent);
+
 	if (!check_device(dev))
 		return NULL;
 
@@ -1212,6 +1522,9 @@ static void *alloc_coherent(struct device *dev, size_t size,
 		return virt_addr;
 	}
 
+	if (!dma_ops_domain(domain))
+		goto out_free;
+
 	if (!dma_mask)
 		dma_mask = *dev->dma_mask;
 
@@ -1220,18 +1533,20 @@ static void *alloc_coherent(struct device *dev, size_t size,
 	*dma_addr = __map_single(dev, iommu, domain->priv, paddr,
 				 size, DMA_BIDIRECTIONAL, true, dma_mask);
 
-	if (*dma_addr == bad_dma_address) {
-		free_pages((unsigned long)virt_addr, get_order(size));
-		virt_addr = NULL;
-		goto out;
-	}
+	if (*dma_addr == bad_dma_address)
+		goto out_free;
 
 	iommu_completion_wait(iommu);
 
-out:
 	spin_unlock_irqrestore(&domain->lock, flags);
 
 	return virt_addr;
+
+out_free:
+
+	free_pages((unsigned long)virt_addr, get_order(size));
+
+	return NULL;
 }
 
 /*
@@ -1245,6 +1560,8 @@ static void free_coherent(struct device *dev, size_t size,
 	struct protection_domain *domain;
 	u16 devid;
 
+	INC_STATS_COUNTER(cnt_free_coherent);
+
 	if (!check_device(dev))
 		return;
 
@@ -1253,6 +1570,9 @@ static void free_coherent(struct device *dev, size_t size,
 	if (!iommu || !domain)
 		goto free_mem;
 
+	if (!dma_ops_domain(domain))
+		goto free_mem;
+
 	spin_lock_irqsave(&domain->lock, flags);
 
 	__unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
@@ -1296,7 +1616,7 @@ static int amd_iommu_dma_supported(struct device *dev, u64 mask)
  * we don't need to preallocate the protection domains anymore.
  * For now we have to.
  */
-void prealloc_protection_domains(void)
+static void prealloc_protection_domains(void)
 {
 	struct pci_dev *dev = NULL;
 	struct dma_ops_domain *dma_dom;
@@ -1305,7 +1625,7 @@ void prealloc_protection_domains(void)
 	u16 devid;
 
 	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
-		devid = (dev->bus->number << 8) | dev->devfn;
+		devid = calc_devid(dev->bus->number, dev->devfn);
 		if (devid > amd_iommu_last_bdf)
 			continue;
 		devid = amd_iommu_alias_table[devid];
@@ -1352,6 +1672,7 @@ int __init amd_iommu_init_dma_ops(void)
 		iommu->default_dom = dma_ops_domain_alloc(iommu, order);
 		if (iommu->default_dom == NULL)
 			return -ENOMEM;
+		iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
 		ret = iommu_init_unity_mappings(iommu);
 		if (ret)
 			goto free_domains;
@@ -1375,6 +1696,12 @@ int __init amd_iommu_init_dma_ops(void)
 	/* Make the driver finally visible to the drivers */
 	dma_ops = &amd_iommu_dma_ops;
 
+	register_iommu(&amd_iommu_ops);
+
+	bus_register_notifier(&pci_bus_type, &device_nb);
+
+	amd_iommu_stats_init();
+
 	return 0;
 
 free_domains:
@@ -1386,3 +1713,224 @@ free_domains:
 
 	return ret;
 }
+
+/*****************************************************************************
+ *
+ * The following functions belong to the exported interface of AMD IOMMU
+ *
+ * This interface allows access to lower level functions of the IOMMU
+ * like protection domain handling and assignment of devices to domains
+ * which is not possible with the dma_ops interface.
+ *
+ *****************************************************************************/
+
+static void cleanup_domain(struct protection_domain *domain)
+{
+	unsigned long flags;
+	unsigned int devid;	/* wider than u16 so the loop can pass 0xffff */
+	/* Detach every device whose amd_iommu_pd_table entry is @domain. */
+	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+	/* Inclusive bound: amd_iommu_last_bdf is the largest valid device id. */
+	for (devid = 0; devid <= amd_iommu_last_bdf; ++devid)
+		if (amd_iommu_pd_table[devid] == domain)
+			__detach_device(domain, devid);
+
+	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+}
+
+static int amd_iommu_domain_init(struct iommu_domain *dom)
+{
+	struct protection_domain *domain;
+	/* Set up dom->priv; returns 0 on success, -ENOMEM on any failure. */
+	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+	if (!domain)
+		return -ENOMEM;
+
+	spin_lock_init(&domain->lock);
+	domain->mode = PAGE_MODE_3_LEVEL;
+	domain->id = domain_id_alloc();
+	if (!domain->id)	/* an id of 0 is handled as allocation failure */
+		goto out_free;
+	domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!domain->pt_root)
+		goto out_free_id;	/* the id must be handed back as well */
+
+	dom->priv = domain;
+	return 0;
+
+out_free_id:	/* unwind in reverse order of acquisition */
+	domain_id_free(domain->id);
+out_free:
+	kfree(domain);
+	return -ENOMEM;
+}
+
+static void amd_iommu_domain_destroy(struct iommu_domain *dom)
+{
+	struct protection_domain *domain = dom->priv;
+	/* Tear down the protection domain stored in dom->priv, if any. */
+	if (!domain)
+		return;
+	/* Detach devices that are still counted as attached. */
+	if (domain->dev_cnt > 0)
+		cleanup_domain(domain);
+	/* Freeing the domain with devices still attached would be fatal. */
+	BUG_ON(domain->dev_cnt != 0);
+
+	free_pagetable(domain);
+
+	domain_id_free(domain->id);
+
+	kfree(domain);
+	/* Clear the stale pointer; a second destroy then returns early. */
+	dom->priv = NULL;
+}
+
+static void amd_iommu_detach_device(struct iommu_domain *dom,
+				    struct device *dev)
+{
+	struct protection_domain *domain = dom->priv;
+	struct amd_iommu *iommu;
+	struct pci_dev *pdev;
+	u16 devid;
+	/* Detach @dev from @dom; non-PCI and unknown devices are ignored. */
+	if (dev->bus != &pci_bus_type)
+		return;
+
+	pdev = to_pci_dev(dev);
+	devid = calc_devid(pdev->bus->number, pdev->devfn);
+	if (devid > amd_iommu_last_bdf)	/* beyond the largest known id */
+		return;
+	/* NOTE(review): devid 0 is never detached here - confirm intent. */
+	if (devid > 0)
+		detach_device(domain, devid);
+
+	iommu = amd_iommu_rlookup_table[devid];
+	if (!iommu)
+		return;
+	iommu_queue_inv_dev_entry(iommu, devid);
+	iommu_completion_wait(iommu);	/* wait for the queued invalidation */
+}
+
+static int amd_iommu_attach_device(struct iommu_domain *dom,
+				   struct device *dev)
+{
+	struct protection_domain *domain = dom->priv;
+	struct protection_domain *old_domain;
+	struct amd_iommu *iommu;
+	struct pci_dev *pdev;
+	u16 devid;
+	/* Returns 0 on success, -EINVAL if unsupported, -EBUSY if in use. */
+	if (dev->bus != &pci_bus_type)
+		return -EINVAL;
+
+	pdev = to_pci_dev(dev);
+
+	devid = calc_devid(pdev->bus->number, pdev->devfn);
+	/* amd_iommu_last_bdf itself is valid; aliased ids are refused. */
+	if (devid > amd_iommu_last_bdf ||
+			devid != amd_iommu_alias_table[devid])
+		return -EINVAL;
+
+	iommu = amd_iommu_rlookup_table[devid];
+	if (!iommu)
+		return -EINVAL;
+	/* A device that already has a protection domain is not taken over. */
+	old_domain = domain_for_device(devid);
+	if (old_domain)
+		return -EBUSY;
+
+	attach_device(iommu, domain, devid);
+
+	iommu_completion_wait(iommu);
+
+	return 0;
+}
+
+static int amd_iommu_map_range(struct iommu_domain *dom,
+			       unsigned long iova, phys_addr_t paddr,
+			       size_t size, int iommu_prot)
+{
+	struct protection_domain *domain = dom->priv;
+	unsigned long i,  npages = iommu_num_pages(paddr, size, PAGE_SIZE);
+	int prot = 0;
+	int ret;
+	/* Translate the generic IOMMU_READ/IOMMU_WRITE flags to IOMMU_PROT_*. */
+	if (iommu_prot & IOMMU_READ)
+		prot |= IOMMU_PROT_IR;
+	if (iommu_prot & IOMMU_WRITE)
+		prot |= IOMMU_PROT_IW;
+	/* Mapping is done page by page, so drop the in-page offsets. */
+	iova  &= PAGE_MASK;
+	paddr &= PAGE_MASK;
+	/* NOTE(review): pages mapped before a failure are not rolled back. */
+	for (i = 0; i < npages; ++i) {
+		ret = iommu_map_page(domain, iova, paddr, prot);
+		if (ret)
+			return ret;
+
+		iova  += PAGE_SIZE;
+		paddr += PAGE_SIZE;
+	}
+
+	return 0;
+}
+
+static void amd_iommu_unmap_range(struct iommu_domain *dom,
+				  unsigned long iova, size_t size)
+{
+	/* Unmap npages pages starting at the page containing @iova. */
+	struct protection_domain *domain = dom->priv;
+	unsigned long i,  npages = iommu_num_pages(iova, size, PAGE_SIZE);
+
+	iova  &= PAGE_MASK;
+	/* Remove the translations one page at a time. */
+	for (i = 0; i < npages; ++i) {
+		iommu_unmap_page(domain, iova);
+		iova  += PAGE_SIZE;
+	}
+	/* One flush by domain id after all pages have been unmapped. */
+	iommu_flush_domain(domain->id);
+}
+
+static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
+					  unsigned long iova)
+{
+	struct protection_domain *domain = dom->priv;
+	unsigned long offset = iova & ~PAGE_MASK;
+	phys_addr_t paddr;
+	u64 *pte;
+	/* Walk L2 -> L1 -> L0; return 0 as soon as an entry is not present. */
+	pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(iova)];
+
+	if (!IOMMU_PTE_PRESENT(*pte))
+		return 0;
+
+	pte = IOMMU_PTE_PAGE(*pte);
+	pte = &pte[IOMMU_PTE_L1_INDEX(iova)];
+
+	if (!IOMMU_PTE_PRESENT(*pte))
+		return 0;
+
+	pte = IOMMU_PTE_PAGE(*pte);
+	pte = &pte[IOMMU_PTE_L0_INDEX(iova)];
+
+	if (!IOMMU_PTE_PRESENT(*pte))
+		return 0;
+	/* Combine the masked L0 entry with the in-page offset of @iova. */
+	paddr  = *pte & IOMMU_PAGE_MASK;
+	paddr |= offset;
+
+	return paddr;
+}
+
+static struct iommu_ops amd_iommu_ops = {	/* handed to register_iommu() */
+	.domain_init = amd_iommu_domain_init,
+	.domain_destroy = amd_iommu_domain_destroy,
+	.attach_dev = amd_iommu_attach_device,
+	.detach_dev = amd_iommu_detach_device,
+	.map = amd_iommu_map_range,
+	.unmap = amd_iommu_unmap_range,
+	.iova_to_phys = amd_iommu_iova_to_phys,
+};
+
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index c625800c55ca..42c33cebf00f 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -122,7 +122,8 @@ u16 amd_iommu_last_bdf;			/* largest PCI device id we have
 LIST_HEAD(amd_iommu_unity_map);		/* a list of required unity mappings
 					   we find in ACPI */
 unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
-int amd_iommu_isolate = 1;		/* if 1, device isolation is enabled */
+bool amd_iommu_isolate = true;		/* if true, device isolation is
+					   enabled */
 bool amd_iommu_unmap_flush;		/* if true, flush on every unmap */
 
 LIST_HEAD(amd_iommu_list);		/* list of all AMD IOMMUs in the
@@ -243,20 +244,16 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
 }
 
 /* Function to enable the hardware */
-void __init iommu_enable(struct amd_iommu *iommu)
+static void __init iommu_enable(struct amd_iommu *iommu)
 {
-	printk(KERN_INFO "AMD IOMMU: Enabling IOMMU "
-	       "at %02x:%02x.%x cap 0x%hx\n",
-	       iommu->dev->bus->number,
-	       PCI_SLOT(iommu->dev->devfn),
-	       PCI_FUNC(iommu->dev->devfn),
-	       iommu->cap_ptr);
+	printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
+	       dev_name(&iommu->dev->dev), iommu->cap_ptr);
 
 	iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
 }
 
 /* Function to enable IOMMU event logging and event interrupts */
-void __init iommu_enable_event_logging(struct amd_iommu *iommu)
+static void __init iommu_enable_event_logging(struct amd_iommu *iommu)
 {
 	iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
 	iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
@@ -1218,9 +1215,9 @@ static int __init parse_amd_iommu_options(char *str)
 {
 	for (; *str; ++str) {
 		if (strncmp(str, "isolate", 7) == 0)
-			amd_iommu_isolate = 1;
+			amd_iommu_isolate = true;
 		if (strncmp(str, "share", 5) == 0)
-			amd_iommu_isolate = 0;
+			amd_iommu_isolate = false;
 		if (strncmp(str, "fullflush", 9) == 0)
 			amd_iommu_unmap_flush = true;
 	}
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 6b7f824db160..b13d3c4dbd42 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -98,8 +98,8 @@ __setup("apicpmtimer", setup_apicpmtimer);
 #ifdef HAVE_X2APIC
 int x2apic;
 /* x2apic enabled before OS handover */
-int x2apic_preenabled;
-int disable_x2apic;
+static int x2apic_preenabled;
+static int disable_x2apic;
 static __init int setup_nox2apic(char *str)
 {
 	disable_x2apic = 1;
@@ -140,7 +140,7 @@ static int lapic_next_event(unsigned long delta,
 			    struct clock_event_device *evt);
 static void lapic_timer_setup(enum clock_event_mode mode,
 			      struct clock_event_device *evt);
-static void lapic_timer_broadcast(const cpumask_t *mask);
+static void lapic_timer_broadcast(const struct cpumask *mask);
 static void apic_pm_activate(void);
 
 /*
@@ -226,7 +226,7 @@ void xapic_icr_write(u32 low, u32 id)
 	apic_write(APIC_ICR, low);
 }
 
-u64 xapic_icr_read(void)
+static u64 xapic_icr_read(void)
 {
 	u32 icr1, icr2;
 
@@ -266,7 +266,7 @@ void x2apic_icr_write(u32 low, u32 id)
 	wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
 }
 
-u64 x2apic_icr_read(void)
+static u64 x2apic_icr_read(void)
 {
 	unsigned long val;
 
@@ -453,7 +453,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
 /*
  * Local APIC timer broadcast function
  */
-static void lapic_timer_broadcast(const cpumask_t *mask)
+static void lapic_timer_broadcast(const struct cpumask *mask)
 {
 #ifdef CONFIG_SMP
 	send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index 2a0a2a3cac26..f63882728d91 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -25,7 +25,7 @@
 #include <asm/uv/bios.h>
 #include <asm/uv/uv_hub.h>
 
-struct uv_systab uv_systab;
+static struct uv_systab uv_systab;
 
 s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
 {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 42e0853030cb..3f95a40f718a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -355,7 +355,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 		printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
 	} else if (smp_num_siblings > 1) {
 
-		if (smp_num_siblings > NR_CPUS) {
+		if (smp_num_siblings > nr_cpu_ids) {
 			printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
 					smp_num_siblings);
 			smp_num_siblings = 1;
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 88ea02dcb622..28102ad1a363 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -517,6 +517,17 @@ acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
 	}
 }
 
+static void free_acpi_perf_data(void)
+{
+	unsigned int i;
+	/* Release each CPU's shared_cpu_map, then the per-cpu area itself. */
+	/* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */
+	for_each_possible_cpu(i)
+		free_cpumask_var(per_cpu_ptr(acpi_perf_data, i)
+				 ->shared_cpu_map);
+	free_percpu(acpi_perf_data);
+}
+
 /*
  * acpi_cpufreq_early_init - initialize ACPI P-States library
  *
@@ -527,6 +538,7 @@ acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
  */
 static int __init acpi_cpufreq_early_init(void)
 {
+	unsigned int i;
 	dprintk("acpi_cpufreq_early_init\n");
 
 	acpi_perf_data = alloc_percpu(struct acpi_processor_performance);
@@ -534,6 +546,16 @@ static int __init acpi_cpufreq_early_init(void)
 		dprintk("Memory allocation error for acpi_perf_data.\n");
 		return -ENOMEM;
 	}
+	for_each_possible_cpu(i) {
+		if (!alloc_cpumask_var_node(
+			&per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
+			GFP_KERNEL, cpu_to_node(i))) {
+
+			/* Freeing a NULL pointer is OK: alloc_percpu zeroes. */
+			free_acpi_perf_data();
+			return -ENOMEM;
+		}
+	}
 
 	/* Do initialization in ACPI core */
 	acpi_processor_preregister_performance(acpi_perf_data);
@@ -604,9 +626,9 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	 */
 	if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
 	    policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
-		policy->cpus = perf->shared_cpu_map;
+		cpumask_copy(&policy->cpus, perf->shared_cpu_map);
 	}
-	policy->related_cpus = perf->shared_cpu_map;
+	cpumask_copy(&policy->related_cpus, perf->shared_cpu_map);
 
 #ifdef CONFIG_SMP
 	dmi_check_system(sw_any_bug_dmi_table);
@@ -795,7 +817,7 @@ static int __init acpi_cpufreq_init(void)
 
 	ret = cpufreq_register_driver(&acpi_cpufreq_driver);
 	if (ret)
-		free_percpu(acpi_perf_data);
+		free_acpi_perf_data();
 
 	return ret;
 }
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 7c7d56b43136..1b446d79a8fd 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -310,6 +310,12 @@ static int powernow_acpi_init(void)
 		goto err0;
 	}
 
+	if (!alloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
+								GFP_KERNEL)) {
+		retval = -ENOMEM;
+		goto err05;
+	}
+
 	if (acpi_processor_register_performance(acpi_processor_perf, 0)) {
 		retval = -EIO;
 		goto err1;
@@ -412,6 +418,8 @@ static int powernow_acpi_init(void)
 err2:
 	acpi_processor_unregister_performance(acpi_processor_perf, 0);
 err1:
+	free_cpumask_var(acpi_processor_perf->shared_cpu_map);
+err05:
 	kfree(acpi_processor_perf);
 err0:
 	printk(KERN_WARNING PFX "ACPI perflib can not be used in this platform\n");
@@ -652,6 +660,7 @@ static int powernow_cpu_exit (struct cpufreq_policy *policy) {
 #ifdef CONFIG_X86_POWERNOW_K7_ACPI
 	if (acpi_processor_perf) {
 		acpi_processor_unregister_performance(acpi_processor_perf, 0);
+		free_cpumask_var(acpi_processor_perf->shared_cpu_map);
 		kfree(acpi_processor_perf);
 	}
 #endif
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 7f05f44b97e9..c3c9adbaa26f 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -766,7 +766,7 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned
 static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
 {
 	struct cpufreq_frequency_table *powernow_table;
-	int ret_val;
+	int ret_val = -ENODEV;
 
 	if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
 		dprintk("register performance failed: bad ACPI data\n");
@@ -815,6 +815,13 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
 	/* notify BIOS that we exist */
 	acpi_processor_notify_smm(THIS_MODULE);
 
+	if (!alloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
+		printk(KERN_ERR PFX
+				"unable to alloc powernow_k8_data cpumask\n");
+		ret_val = -ENOMEM;
+		goto err_out_mem;
+	}
+
 	return 0;
 
 err_out_mem:
@@ -826,7 +833,7 @@ err_out:
 	/* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */
 	data->acpi_data.state_count = 0;
 
-	return -ENODEV;
+	return ret_val;
 }
 
 static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table)
@@ -929,6 +936,7 @@ static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
 {
 	if (data->acpi_data.state_count)
 		acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
+	free_cpumask_var(data->acpi_data.shared_cpu_map);
 }
 
 #else
@@ -1134,7 +1142,8 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 	data->cpu = pol->cpu;
 	data->currpstate = HW_PSTATE_INVALID;
 
-	if (powernow_k8_cpu_init_acpi(data)) {
+	rc = powernow_k8_cpu_init_acpi(data);
+	if (rc) {
 		/*
 		 * Use the PSB BIOS structure. This is only availabe on
 		 * an UP version, and is deprecated by AMD.
@@ -1152,20 +1161,17 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 			       "ACPI maintainers and complain to your BIOS "
 			       "vendor.\n");
 #endif
-			kfree(data);
-			return -ENODEV;
+			goto err_out;
 		}
 		if (pol->cpu != 0) {
 			printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
 			       "CPU other than CPU0. Complain to your BIOS "
 			       "vendor.\n");
-			kfree(data);
-			return -ENODEV;
+			goto err_out;
 		}
 		rc = find_psb_table(data);
 		if (rc) {
-			kfree(data);
-			return -ENODEV;
+			goto err_out;
 		}
 	}
 
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index c6ecda64f5f1..48533d77be78 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -534,7 +534,7 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
 	per_cpu(cpuid4_info, cpu) = NULL;
 }
 
-static void get_cpu_leaves(void *_retval)
+static void __cpuinit get_cpu_leaves(void *_retval)
 {
 	int j, *retval = _retval, cpu = smp_processor_id();
 
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index d6ec7ec30274..d259e5d2e054 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -824,16 +824,14 @@ static int enable_mtrr_cleanup __initdata =
 
 static int __init disable_mtrr_cleanup_setup(char *str)
 {
-	if (enable_mtrr_cleanup != -1)
-		enable_mtrr_cleanup = 0;
+	enable_mtrr_cleanup = 0;
 	return 0;
 }
 early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
 
 static int __init enable_mtrr_cleanup_setup(char *str)
 {
-	if (enable_mtrr_cleanup != -1)
-		enable_mtrr_cleanup = 1;
+	enable_mtrr_cleanup = 1;
 	return 0;
 }
 early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 72cefd1e649b..2ac1f0c2beb3 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -39,10 +39,10 @@
 #include <linux/device.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
+#include <linux/uaccess.h>
 
 #include <asm/processor.h>
 #include <asm/msr.h>
-#include <asm/uaccess.h>
 #include <asm/system.h>
 
 static struct class *cpuid_class;
@@ -82,7 +82,7 @@ static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
 }
 
 static ssize_t cpuid_read(struct file *file, char __user *buf,
-			  size_t count, loff_t * ppos)
+			  size_t count, loff_t *ppos)
 {
 	char __user *tmp = buf;
 	struct cpuid_regs cmd;
@@ -117,11 +117,11 @@ static int cpuid_open(struct inode *inode, struct file *file)
 	unsigned int cpu;
 	struct cpuinfo_x86 *c;
 	int ret = 0;
-	
+
 	lock_kernel();
 
 	cpu = iminor(file->f_path.dentry->d_inode);
-	if (cpu >= NR_CPUS || !cpu_online(cpu)) {
+	if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
 		ret = -ENXIO;	/* No such CPU */
 		goto out;
 	}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 23b138e31e9c..504ad198e4ad 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -886,7 +886,7 @@ asmlinkage void early_printk(const char *fmt, ...)
 	va_list ap;
 
 	va_start(ap, fmt);
-	n = vscnprintf(buf, 512, fmt, ap);
+	n = vscnprintf(buf, sizeof(buf), fmt, ap);
 	early_console->write(early_console, buf, n);
 	va_end(ap);
 }
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c
index 62895cf315ff..21bcc0e098ba 100644
--- a/arch/x86/kernel/genx2apic_phys.c
+++ b/arch/x86/kernel/genx2apic_phys.c
@@ -161,12 +161,12 @@ static unsigned int phys_pkg_id(int index_msb)
 	return current_cpu_data.initial_apicid >> index_msb;
 }
 
-void x2apic_send_IPI_self(int vector)
+static void x2apic_send_IPI_self(int vector)
 {
 	apic_write(APIC_SELF_IPI, vector);
 }
 
-void init_x2apic_ldr(void)
+static void init_x2apic_ldr(void)
 {
 	return;
 }
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 388e05a5fc17..b9a4d8c4b935 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -27,7 +27,7 @@
 #include <asm/trampoline.h>
 
 /* boot cpu pda */
-static struct x8664_pda _boot_cpu_pda __read_mostly;
+static struct x8664_pda _boot_cpu_pda;
 
 #ifdef CONFIG_SMP
 /*
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 62ecfc991e1e..3639442aa7a4 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -214,11 +214,11 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu)
 
 	cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
 	if (cfg) {
-		/* FIXME: needs alloc_cpumask_var_node() */
-		if (!alloc_cpumask_var(&cfg->domain, GFP_ATOMIC)) {
+		if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
 			kfree(cfg);
 			cfg = NULL;
-		} else if (!alloc_cpumask_var(&cfg->old_domain, GFP_ATOMIC)) {
+		} else if (!alloc_cpumask_var_node(&cfg->old_domain,
+							  GFP_ATOMIC, node)) {
 			free_cpumask_var(cfg->domain);
 			kfree(cfg);
 			cfg = NULL;
@@ -706,7 +706,7 @@ static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
 }
 
 #ifdef CONFIG_X86_64
-void io_apic_sync(struct irq_pin_list *entry)
+static void io_apic_sync(struct irq_pin_list *entry)
 {
 	/*
 	 * Synchronize the IO-APIC and the CPU by doing
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index eee32b43fee3..71f1d99a635d 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -12,8 +12,8 @@
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/vmalloc.h>
+#include <linux/uaccess.h>
 
-#include <asm/uaccess.h>
 #include <asm/system.h>
 #include <asm/ldt.h>
 #include <asm/desc.h>
@@ -93,7 +93,7 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
 	if (err < 0)
 		return err;
 
-	for(i = 0; i < old->size; i++)
+	for (i = 0; i < old->size; i++)
 		write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
 	return 0;
 }
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index efc2f361fe85..666e43df51f9 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -13,8 +13,7 @@
 #include <asm/msr.h>
 #include <asm/acpi.h>
 #include <asm/mmconfig.h>
-
-#include "../pci/pci.h"
+#include <asm/pci_x86.h>
 
 struct pci_hostbridge_probe {
 	u32 bus;
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 45e3b69808ba..c5c5b8df1dbc 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -16,14 +16,14 @@
 #include <linux/bitops.h>
 #include <linux/acpi.h>
 #include <linux/module.h>
+#include <linux/smp.h>
+#include <linux/acpi.h>
 
-#include <asm/smp.h>
 #include <asm/mtrr.h>
 #include <asm/mpspec.h>
 #include <asm/pgalloc.h>
 #include <asm/io_apic.h>
 #include <asm/proto.h>
-#include <asm/acpi.h>
 #include <asm/bios_ebda.h>
 #include <asm/e820.h>
 #include <asm/trampoline.h>
@@ -95,8 +95,8 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
 #endif
 
 	if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
-		 set_bit(m->mpc_busid, mp_bus_not_pci);
-#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
+		set_bit(m->mpc_busid, mp_bus_not_pci);
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
 		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
 #endif
 	} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
@@ -104,7 +104,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
 			x86_quirks->mpc_oem_pci_bus(m);
 
 		clear_bit(m->mpc_busid, mp_bus_not_pci);
-#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
 		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
 	} else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
 		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 82a7c7ed6d45..726266695b2c 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -136,7 +136,7 @@ static int msr_open(struct inode *inode, struct file *file)
 	lock_kernel();
 	cpu = iminor(file->f_path.dentry->d_inode);
 
-	if (cpu >= NR_CPUS || !cpu_online(cpu)) {
+	if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
 		ret = -ENXIO;	/* No such CPU */
 		goto out;
 	}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 8bd1bf9622a7..45a09ccdc214 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -26,11 +26,10 @@
 #include <linux/kernel_stat.h>
 #include <linux/kdebug.h>
 #include <linux/smp.h>
+#include <linux/nmi.h>
 
 #include <asm/i8259.h>
 #include <asm/io_apic.h>
-#include <asm/smp.h>
-#include <asm/nmi.h>
 #include <asm/proto.h>
 #include <asm/timer.h>
 
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index a35eaa379ff6..00c2bcd41463 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -52,7 +52,7 @@ static u32 *iommu_gatt_base;		/* Remapping table */
  * to trigger bugs with some popular PCI cards, in particular 3ware (but
  * has been also also seen with Qlogic at least).
  */
-int iommu_fullflush = 1;
+static int iommu_fullflush = 1;
 
 /* Allocation bitmap for the remapping area: */
 static DEFINE_SPINLOCK(iommu_bitmap_lock);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 39643b1df061..2b46eb41643b 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -12,6 +12,7 @@
 #include <asm/proto.h>
 #include <asm/reboot_fixups.h>
 #include <asm/reboot.h>
+#include <asm/pci_x86.h>
 #include <asm/virtext.h>
 
 #ifdef CONFIG_X86_32
@@ -24,7 +25,6 @@
 
 #include <mach_ipi.h>
 
-
 /*
  * Power off function, if any
  */
@@ -501,7 +501,7 @@ void native_machine_shutdown(void)
 
 #ifdef CONFIG_X86_32
 	/* See if there has been given a command line override */
-	if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) &&
+	if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) &&
 		cpu_online(reboot_cpu))
 		reboot_cpu_id = reboot_cpu;
 #endif
@@ -511,7 +511,7 @@ void native_machine_shutdown(void)
 		reboot_cpu_id = smp_processor_id();
 
 	/* Make certain I only run on the appropriate processor */
-	set_cpus_allowed_ptr(current, &cpumask_of_cpu(reboot_cpu_id));
+	set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id));
 
 	/* O.K Now that I'm on the appropriate processor,
 	 * stop all of the others.
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 0b63b08e7530..a4b619c33106 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -153,12 +153,10 @@ void __init setup_per_cpu_areas(void)
 	align = max_t(unsigned long, PAGE_SIZE, align);
 	size = roundup(old_size, align);
 
-	printk(KERN_INFO
-		"NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
+	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
 		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
 
-	printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
-			  size);
+	pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size);
 
 	for_each_possible_cpu(cpu) {
 #ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -169,22 +167,15 @@ void __init setup_per_cpu_areas(void)
 		if (!node_online(node) || !NODE_DATA(node)) {
 			ptr = __alloc_bootmem(size, align,
 					 __pa(MAX_DMA_ADDRESS));
-			printk(KERN_INFO
-			       "cpu %d has no node %d or node-local memory\n",
+			pr_info("cpu %d has no node %d or node-local memory\n",
 				cpu, node);
-			if (ptr)
-				printk(KERN_DEBUG
-					"per cpu data for cpu%d at %016lx\n",
-					 cpu, __pa(ptr));
-		}
-		else {
+			pr_debug("per cpu data for cpu%d at %016lx\n",
+				 cpu, __pa(ptr));
+		} else {
 			ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
 							__pa(MAX_DMA_ADDRESS));
-			if (ptr)
-				printk(KERN_DEBUG
-					"per cpu data for cpu%d on node%d "
-					"at %016lx\n",
-					cpu, node, __pa(ptr));
+			pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
+				cpu, node, __pa(ptr));
 		}
 #endif
 		per_cpu_offset(cpu) = ptr - __per_cpu_start;
@@ -339,25 +330,25 @@ static const cpumask_t cpu_mask_none;
 /*
  * Returns a pointer to the bitmask of CPUs on Node 'node'.
  */
-const cpumask_t *_node_to_cpumask_ptr(int node)
+const cpumask_t *cpumask_of_node(int node)
 {
 	if (node_to_cpumask_map == NULL) {
 		printk(KERN_WARNING
-			"_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n",
+			"cpumask_of_node(%d): no node_to_cpumask_map!\n",
 			node);
 		dump_stack();
 		return (const cpumask_t *)&cpu_online_map;
 	}
 	if (node >= nr_node_ids) {
 		printk(KERN_WARNING
-			"_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n",
+			"cpumask_of_node(%d): node > nr_node_ids(%d)\n",
 			node, nr_node_ids);
 		dump_stack();
 		return &cpu_mask_none;
 	}
 	return &node_to_cpumask_map[node];
 }
-EXPORT_SYMBOL(_node_to_cpumask_ptr);
+EXPORT_SYMBOL(cpumask_of_node);
 
 /*
  * Returns a bitmask of CPUs on Node 'node'.
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 31869bf5fabd..6bd4d9b73870 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -496,7 +496,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 }
 
 /* maps the cpu to the sched domain representing multi-core */
-cpumask_t cpu_coregroup_map(int cpu)
+const struct cpumask *cpu_coregroup_mask(int cpu)
 {
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 	/*
@@ -504,9 +504,14 @@ cpumask_t cpu_coregroup_map(int cpu)
 	 * And for power savings, we return cpu_core_map
 	 */
 	if (sched_mc_power_savings || sched_smt_power_savings)
-		return per_cpu(cpu_core_map, cpu);
+		return &per_cpu(cpu_core_map, cpu);
 	else
-		return c->llc_shared_map;
+		return &c->llc_shared_map;
+}
+
+cpumask_t cpu_coregroup_map(int cpu)
+{
+	return *cpu_coregroup_mask(cpu);	/* by-value copy of that mask */
 }
 
 static void impress_friends(void)
@@ -1149,7 +1154,7 @@ static void __init smp_cpu_index_default(void)
 	for_each_possible_cpu(i) {
 		c = &cpu_data(i);
 		/* mark all to hotplug */
-		c->cpu_index = NR_CPUS;
+		c->cpu_index = nr_cpu_ids;
 	}
 }
 
@@ -1293,6 +1298,8 @@ __init void prefill_possible_map(void)
 	else
 		possible = setup_possible_cpus;
 
+	total_cpus = max_t(int, possible, num_processors + disabled_cpus);
+
 	if (possible > CONFIG_NR_CPUS) {
 		printk(KERN_WARNING
 			"%d Processors exceeds NR_CPUS limit of %d\n",
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 6a00e5faaa74..f885023167e0 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -582,7 +582,6 @@ static int __init uv_ptc_init(void)
 static struct bau_control * __init uv_table_bases_init(int blade, int node)
 {
 	int i;
-	int *ip;
 	struct bau_msg_status *msp;
 	struct bau_control *bau_tabp;
 
@@ -599,13 +598,6 @@ static struct bau_control * __init uv_table_bases_init(int blade, int node)
 		bau_cpubits_clear(&msp->seen_by, (int)
 				  uv_blade_nr_possible_cpus(blade));
 
-	bau_tabp->watching =
-	    kmalloc_node(sizeof(int) * DEST_NUM_RESOURCES, GFP_KERNEL, node);
-	BUG_ON(!bau_tabp->watching);
-
-	for (i = 0, ip = bau_tabp->watching; i < DEST_Q_SIZE; i++, ip++)
-		*ip = 0;
-
 	uv_bau_table_bases[blade] = bau_tabp;
 
 	return bau_tabp;
@@ -628,7 +620,6 @@ uv_table_bases_finish(int blade, int node, int cur_cpu,
 		bcp->bau_msg_head	= bau_tablesp->va_queue_first;
 		bcp->va_queue_first	= bau_tablesp->va_queue_first;
 		bcp->va_queue_last	= bau_tablesp->va_queue_last;
-		bcp->watching		= bau_tablesp->watching;
 		bcp->msg_statuses	= bau_tablesp->msg_statuses;
 		bcp->descriptor_base	= adp;
 	}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 2d1f4c7e4052..ce6650eb64e9 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -292,8 +292,10 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 	tsk->thread.error_code = error_code;
 	tsk->thread.trap_no = 8;
 
-	/* This is always a kernel trap and never fixable (and thus must
-	   never return). */
+	/*
+	 * This is always a kernel trap and never fixable (and thus must
+	 * never return).
+	 */
 	for (;;)
 		die(str, regs, error_code);
 }
@@ -520,9 +522,11 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
 }
 
 #ifdef CONFIG_X86_64
-/* Help handler running on IST stack to switch back to user stack
-   for scheduling or signal handling. The actual stack switch is done in
-   entry.S */
+/*
+ * Help handler running on IST stack to switch back to user stack
+ * for scheduling or signal handling. The actual stack switch is done in
+ * entry.S
+ */
 asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
 {
 	struct pt_regs *regs = eregs;
@@ -532,8 +536,10 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
 	/* Exception from user space */
 	else if (user_mode(eregs))
 		regs = task_pt_regs(current);
-	/* Exception from kernel and interrupts are enabled. Move to
-	   kernel process stack. */
+	/*
+	 * Exception from kernel and interrupts are enabled. Move to
+	 * kernel process stack.
+	 */
 	else if (eregs->flags & X86_EFLAGS_IF)
 		regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
 	if (eregs != regs)
@@ -685,12 +691,7 @@ void math_error(void __user *ip)
 	cwd = get_fpu_cwd(task);
 	swd = get_fpu_swd(task);
 
-	err = swd & ~cwd & 0x3f;
-
-#ifdef CONFIG_X86_32
-	if (!err)
-		return;
-#endif
+	err = swd & ~cwd;
 
 	if (err & 0x001) {	/* Invalid op */
 		/*
@@ -708,7 +709,11 @@ void math_error(void __user *ip)
 	} else if (err & 0x020) { /* Precision */
 		info.si_code = FPE_FLTRES;
 	} else {
-		info.si_code = __SI_FAULT|SI_KERNEL; /* WTF? */
+		/*
+		 * If we're using IRQ 13, or supposedly even some trap 16
+		 * implementations, it's possible we get a spurious trap...
+		 */
+		return;		/* Spurious trap, no error */
 	}
 	force_sig_info(SIGFPE, &info, task);
 }
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 15c3e6999182..2b54fe002e94 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -159,7 +159,7 @@ int save_i387_xstate(void __user *buf)
  * Restore the extended state if present. Otherwise, restore the FP/SSE
  * state.
  */
-int restore_user_xstate(void __user *buf)
+static int restore_user_xstate(void __user *buf)
 {
 	struct _fpx_sw_bytes fx_sw_user;
 	u64 mask;
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index c02343594b4d..d3ec292f00f2 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,8 +7,8 @@ common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 ifeq ($(CONFIG_KVM_TRACE),y)
 common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
 endif
-ifeq ($(CONFIG_DMAR),y)
-common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
+ifeq ($(CONFIG_IOMMU_API),y)
+common-objs += $(addprefix ../../../virt/kvm/, iommu.o)
 endif
 
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0e6aa8141dcd..cc17546a2406 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -34,6 +34,7 @@
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/highmem.h>
+#include <linux/iommu.h>
 #include <linux/intel-iommu.h>
 
 #include <asm/uaccess.h>
@@ -989,7 +990,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 		r = !tdp_enabled;
 		break;
 	case KVM_CAP_IOMMU:
-		r = intel_iommu_found();
+		r = iommu_found();
 		break;
 	default:
 		r = 0;
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
index 37b9ae4d44c5..df167f265622 100644
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -133,29 +133,28 @@ void __init time_init_hook(void)
  **/
 void mca_nmi_hook(void)
 {
-	/* If I recall correctly, there's a whole bunch of other things that
+	/*
+	 * If I recall correctly, there's a whole bunch of other things that
 	 * we can do to check for NMI problems, but that's all I know about
 	 * at the moment.
 	 */
-
-	printk("NMI generated from unknown source!\n");
+	pr_warning("NMI generated from unknown source!\n");
 }
 #endif
 
 static __init int no_ipi_broadcast(char *str)
 {
 	get_option(&str, &no_broadcast);
-	printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
-											"IPI Broadcast");
+	pr_info("Using %s mode\n",
+		no_broadcast ? "No IPI Broadcast" : "IPI Broadcast");
 	return 1;
 }
-
 __setup("no_ipi_broadcast=", no_ipi_broadcast);
 
 static int __init print_ipi_mode(void)
 {
-	printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
-											"Shortcut");
+	pr_info("Using IPI %s mode\n",
+		no_broadcast ? "No-Shortcut" : "Shortcut");
 	return 0;
 }
 
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index a5bc05492b1e..9840b7ec749a 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -357,9 +357,8 @@ void __init find_smp_config(void)
 	printk("VOYAGER SMP: Boot cpu is %d\n", boot_cpu_id);
 
 	/* initialize the CPU structures (moved from smp_boot_cpus) */
-	for (i = 0; i < NR_CPUS; i++) {
+	for (i = 0; i < nr_cpu_ids; i++)
 		cpu_irq_affinity[i] = ~0;
-	}
 	cpu_online_map = cpumask_of_cpu(boot_cpu_id);
 
 	/* The boot CPU must be extended */
@@ -1227,7 +1226,7 @@ int setup_profiling_timer(unsigned int multiplier)
 	 * new values until the next timer interrupt in which they do process
 	 * accounting.
 	 */
-	for (i = 0; i < NR_CPUS; ++i)
+	for (i = 0; i < nr_cpu_ids; ++i)
 		per_cpu(prof_multiplier, i) = multiplier;
 
 	return 0;
@@ -1257,7 +1256,7 @@ void __init voyager_smp_intr_init(void)
 	int i;
 
 	/* initialize the per cpu irq mask to all disabled */
-	for (i = 0; i < NR_CPUS; i++)
+	for (i = 0; i < nr_cpu_ids; i++)
 		vic_irq_mask[i] = 0xFFFF;
 
 	VIC_SET_GATE(VIC_CPI_LEVEL0, vic_cpi_interrupt);
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 1d88d2b39771..9e5752fe4d15 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -4,7 +4,7 @@
 #include <linux/irq.h>
 #include <linux/dmi.h>
 #include <asm/numa.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 struct pci_root_info {
 	char *name;
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 22e057665e55..9bb09823b362 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -2,7 +2,7 @@
 #include <linux/pci.h>
 #include <linux/topology.h>
 #include <linux/cpu.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/pci-direct.h>
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index bb1a01f089e2..62ddb73e09ed 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -14,8 +14,7 @@
 #include <asm/segment.h>
 #include <asm/io.h>
 #include <asm/smp.h>
-
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
 				PCI_PROBE_MMCONF;
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index 9a5af6c8fbe9..bd13c3e4c6db 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -5,7 +5,7 @@
 #include <linux/pci.h>
 #include <linux/init.h>
 #include <linux/dmi.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 /*
  * Functions for accessing PCI base (first 256 bytes) and extended
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
index 86631ccbc25a..f6adf2c6d751 100644
--- a/arch/x86/pci/early.c
+++ b/arch/x86/pci/early.c
@@ -2,7 +2,7 @@
 #include <linux/pci.h>
 #include <asm/pci-direct.h>
 #include <asm/io.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 /* Direct PCI access. This is used for PCI accesses in early boot before
    the PCI subsystem works. */
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 2051dc96b8e9..7d388d5cf548 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -6,8 +6,7 @@
 #include <linux/dmi.h>
 #include <linux/pci.h>
 #include <linux/init.h>
-#include "pci.h"
-
+#include <asm/pci_x86.h>
 
 static void __devinit pci_fixup_i450nx(struct pci_dev *d)
 {
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 844df0cbbd3e..e51bf2cda4b0 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -34,8 +34,8 @@
 
 #include <asm/pat.h>
 #include <asm/e820.h>
+#include <asm/pci_x86.h>
 
-#include "pci.h"
 
 static int
 skip_isa_ioresource_align(struct pci_dev *dev) {
diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
index d6c950f81858..bec3b048e72b 100644
--- a/arch/x86/pci/init.c
+++ b/arch/x86/pci/init.c
@@ -1,6 +1,6 @@
 #include <linux/pci.h>
 #include <linux/init.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 /* arch_initcall has too random ordering, so call the initializers
    in the right sequence from here. */
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index bf69dbe08bff..373b9afe6d44 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -16,8 +16,7 @@
 #include <asm/io_apic.h>
 #include <linux/irq.h>
 #include <linux/acpi.h>
-
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 #define PIRQ_SIGNATURE	(('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
 #define PIRQ_VERSION 0x0100
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index b722dd481b39..f1065b129e9c 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -3,7 +3,7 @@
  */
 #include <linux/init.h>
 #include <linux/pci.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 /*
  * Discover remaining PCI buses in case there are peer host bridges.
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 654a2234f8f3..89bf9242c80a 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -15,8 +15,7 @@
 #include <linux/acpi.h>
 #include <linux/bitmap.h>
 #include <asm/e820.h>
-
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 /* aperture is up to 256MB but BIOS may reserve less */
 #define MMCONFIG_APER_MIN	(2 * 1024*1024)
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index f3c761dce695..8b2d561046a3 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -13,7 +13,7 @@
 #include <linux/init.h>
 #include <linux/acpi.h>
 #include <asm/e820.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 /* Assume systems with more busses have correct MCFG */
 #define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))
diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
index a1994163c99d..30007ffc8e11 100644
--- a/arch/x86/pci/mmconfig_64.c
+++ b/arch/x86/pci/mmconfig_64.c
@@ -10,8 +10,7 @@
 #include <linux/acpi.h>
 #include <linux/bitmap.h>
 #include <asm/e820.h>
-
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 /* Static virtual mapping of the MMCONFIG aperture */
 struct mmcfg_virt {
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 1177845d3186..2089354968a2 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -7,7 +7,7 @@
 #include <linux/nodemask.h>
 #include <mach_apic.h>
 #include <asm/mpspec.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 #define XQUAD_PORTIO_BASE 0xfe400000
 #define XQUAD_PORTIO_QUAD 0x40000  /* 256k per quad. */
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c
index e11e9e803d5f..b889d824f7c6 100644
--- a/arch/x86/pci/olpc.c
+++ b/arch/x86/pci/olpc.c
@@ -29,7 +29,7 @@
 #include <linux/init.h>
 #include <asm/olpc.h>
 #include <asm/geode.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 /*
  * In the tables below, the first two line (8 longwords) are the
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 37472fc6f729..b82cae970dfd 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -6,9 +6,8 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/uaccess.h>
-#include "pci.h"
-#include "pci-functions.h"
-
+#include <asm/pci_x86.h>
+#include <asm/mach-default/pci-functions.h>
 
 /* BIOS32 signature: "_32_" */
 #define BIOS32_SIGNATURE	(('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
index 42f4cb19faca..16d0c0eb0d19 100644
--- a/arch/x86/pci/visws.c
+++ b/arch/x86/pci/visws.c
@@ -9,11 +9,10 @@
 #include <linux/init.h>
 
 #include <asm/setup.h>
+#include <asm/pci_x86.h>
 #include <asm/visws/cobalt.h>
 #include <asm/visws/lithium.h>
 
-#include "pci.h"
-
 static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; }
 static void pci_visws_disable_irq(struct pci_dev *dev) { }
 
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 65d75a6be0ba..14f240623497 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -132,8 +132,7 @@ static void do_stolen_accounting(void)
 	*snap = state;
 
 	/* Add the appropriate number of ticks of stolen time,
-	   including any left-overs from last time.  Passing NULL to
-	   account_steal_time accounts the time as stolen. */
+	   including any left-overs from last time. */
 	stolen = runnable + offline + __get_cpu_var(residual_stolen);
 
 	if (stolen < 0)
@@ -141,11 +140,10 @@ static void do_stolen_accounting(void)
 
 	ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
 	__get_cpu_var(residual_stolen) = stolen;
-	account_steal_time(NULL, ticks);
+	account_steal_ticks(ticks);
 
 	/* Add the appropriate number of ticks of blocked time,
-	   including any left-overs from last time.  Passing idle to
-	   account_steal_time accounts the time as idle/wait. */
+	   including any left-overs from last time. */
 	blocked += __get_cpu_var(residual_blocked);
 
 	if (blocked < 0)
@@ -153,7 +151,7 @@ static void do_stolen_accounting(void)
 
 	ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
 	__get_cpu_var(residual_blocked) = blocked;
-	account_steal_time(idle_task(smp_processor_id()), ticks);
+	account_idle_ticks(ticks);
 }
 
 /*
author	Ingo Molnar <mingo@elte.hu>	2009-01-04 12:59:36 +0300
committer	Ingo Molnar <mingo@elte.hu>	2009-01-04 12:59:36 +0300
commit	4010b0192ddf6ec7ec1b9feb9b0953692aeb7329 (patch)
tree	188a36186f6ce580b479a9f90404fa7bfd8b22d7 /arch
parent	79ff56ebd3edfb16f8badc558cb439b203a3298f (diff)
parent	7d3b56ba37a95f1f370f50258ed3954c304c524b (diff)
download	linux-4010b0192ddf6ec7ec1b9feb9b0953692aeb7329.tar.xz