Diffstat (limited to 'arch/i386')
101 files changed, 5693 insertions, 640 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index b1b2b30b1b8e..abb582bc218f 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -18,6 +18,10 @@ config GENERIC_TIME bool default y +config GENERIC_CMOS_UPDATE + bool + default y + config CLOCKSOURCE_WATCHDOG bool default y @@ -222,6 +226,8 @@ config PARAVIRT However, when run without a hypervisor the kernel is theoretically slower. If in doubt, say N. +source "arch/i386/xen/Kconfig" + config VMI bool "VMI Paravirt-ops support" depends on PARAVIRT @@ -542,6 +548,7 @@ config HIGHMEM4G config HIGHMEM64G bool "64GB" depends on !M386 && !M486 + select X86_PAE help Select this if you have a 32-bit processor and more than 4 gigabytes of physical RAM. @@ -571,12 +578,12 @@ choice config VMSPLIT_3G bool "3G/1G user/kernel split" config VMSPLIT_3G_OPT - depends on !HIGHMEM + depends on !X86_PAE bool "3G/1G user/kernel split (for full 1G low memory)" config VMSPLIT_2G bool "2G/2G user/kernel split" config VMSPLIT_2G_OPT - depends on !HIGHMEM + depends on !X86_PAE bool "2G/2G user/kernel split (for full 2G low memory)" config VMSPLIT_1G bool "1G/3G user/kernel split" @@ -596,10 +603,15 @@ config HIGHMEM default y config X86_PAE - bool - depends on HIGHMEM64G - default y + bool "PAE (Physical Address Extension) Support" + default n + depends on !HIGHMEM4G select RESOURCES_64BIT + help + PAE is required for NX support, and furthermore enables + larger swapspace support for non-overcommit purposes. It + has the cost of more pagetable lookup overhead, and also + consumes more pagetable space per process. # Common NUMA Features config NUMA @@ -815,6 +827,7 @@ config CRASH_DUMP config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) + default "0x1000000" if X86_NUMAQ default "0x100000" help This gives the physical address where the kernel is loaded. @@ -1212,21 +1225,26 @@ source "drivers/Kconfig" source "fs/Kconfig" -menu "Instrumentation Support" +menuconfig INSTRUMENTATION + bool "Instrumentation Support" depends on EXPERIMENTAL + default y + +if INSTRUMENTATION source "arch/i386/oprofile/Kconfig" config KPROBES - bool "Kprobes (EXPERIMENTAL)" - depends on KALLSYMS && EXPERIMENTAL && MODULES + bool "Kprobes" + depends on KALLSYMS && MODULES help Kprobes allows you to trap at almost any kernel address and execute a callback function. register_kprobe() establishes a probepoint and specifies the callback. Kprobes is useful for kernel debugging, non-intrusive instrumentation and testing. If in doubt, say "N". 
-endmenu + +endif # INSTRUMENTATION source "arch/i386/Kconfig.debug" diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu index 9cbe76c3aa35..11a24d54f27b 100644 --- a/arch/i386/Kconfig.cpu +++ b/arch/i386/Kconfig.cpu @@ -297,11 +297,6 @@ config X86_POPAD_OK depends on !M386 default y -config X86_CMPXCHG64 - bool - depends on X86_PAE - default y - config X86_ALIGNMENT_16 bool depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 diff --git a/arch/i386/Makefile b/arch/i386/Makefile index bd28f9f9b4b7..01f0ff0daaf4 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -93,6 +93,9 @@ mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-i386/mach-es7000 mcore-$(CONFIG_X86_ES7000) := mach-default core-$(CONFIG_X86_ES7000) := arch/i386/mach-es7000/ +# Xen paravirtualization support +core-$(CONFIG_XEN) += arch/i386/xen/ + # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default @@ -108,6 +111,7 @@ drivers-$(CONFIG_PCI) += arch/i386/pci/ # must be linked after kernel/ drivers-$(CONFIG_OPROFILE) += arch/i386/oprofile/ drivers-$(CONFIG_PM) += arch/i386/power/ +drivers-$(CONFIG_FB) += arch/i386/video/ CFLAGS += $(mflags-y) AFLAGS += $(mflags-y) diff --git a/arch/i386/boot/.gitignore b/arch/i386/boot/.gitignore index 495f20c085de..18465143cfa2 100644 --- a/arch/i386/boot/.gitignore +++ b/arch/i386/boot/.gitignore @@ -1,3 +1,5 @@ bootsect bzImage setup +setup.bin +setup.elf diff --git a/arch/i386/boot/Makefile b/arch/i386/boot/Makefile index 08678a0a3d19..93386a4e40b4 100644 --- a/arch/i386/boot/Makefile +++ b/arch/i386/boot/Makefile @@ -39,7 +39,7 @@ setup-y += printf.o string.o tty.o video.o version.o voyager.o setup-y += video-vga.o setup-y += video-vesa.o setup-y += video-bios.o - +targets += $(setup-y) hostprogs-y := tools/build HOSTCFLAGS_build.o := $(LINUXINCLUDE) diff --git a/arch/i386/boot/boot.h b/arch/i386/boot/boot.h index 0329c4fe4f88..dec70c9b6050 100644 --- a/arch/i386/boot/boot.h +++ b/arch/i386/boot/boot.h @@ -56,7 +56,7 @@ static inline u16 inw(u16 port) static inline void outl(u32 v, u16 port) { - asm volatile("outl %0,%1" : : "a" (v), "dn" (port)); + asm volatile("outl %0,%1" : : "a" (v), "dN" (port)); } static inline u32 inl(u32 port) { diff --git a/arch/i386/boot/compressed/relocs.c b/arch/i386/boot/compressed/relocs.c index ce4fda261aaf..2d77ee728f92 100644 --- a/arch/i386/boot/compressed/relocs.c +++ b/arch/i386/boot/compressed/relocs.c @@ -31,6 +31,9 @@ static const char* safe_abs_relocs[] = { "__kernel_rt_sigreturn", "__kernel_sigreturn", "SYSENTER_RETURN", + "VDSO_NOTE_MASK", + "xen_irq_disable_direct_reloc", + "xen_save_fl_direct_reloc", }; static int is_safe_abs_reloc(const char* sym_name) diff --git a/arch/i386/boot/cpucheck.c b/arch/i386/boot/cpucheck.c index 8b0f4473b083..991e8ceae1de 100644 --- a/arch/i386/boot/cpucheck.c +++ b/arch/i386/boot/cpucheck.c @@ -115,8 +115,8 @@ static int has_eflag(u32 mask) "pushfl ; " "popl %1 ; " "popfl" - : "=r" (f0), "=r" (f1) - : "g" (mask)); + : "=&r" (f0), "=&r" (f1) + : "ri" (mask)); return !!((f0^f1) & mask); } diff --git a/arch/i386/boot/mca.c b/arch/i386/boot/mca.c index 9b68bd1aef19..68222f2d4b67 100644 --- a/arch/i386/boot/mca.c +++ b/arch/i386/boot/mca.c @@ -26,7 +26,7 @@ int query_mca(void) "setc %0 ; " "movw %%es, %1 ; " "popw %%es" - : "=acdSDm" (err), "=acdSDm" (es), "=b" (bx) + : "=acd" (err), "=acdSD" (es), "=b" (bx) : "a" (0xc000)); if (err) diff --git a/arch/i386/boot/pm.c b/arch/i386/boot/pm.c index 
3fa53e15ed77..1df025c73261 100644 --- a/arch/i386/boot/pm.c +++ b/arch/i386/boot/pm.c @@ -65,7 +65,7 @@ static void move_kernel_around(void) "popw %%ds ; " "popw %%es" : "+c" (dwords) - : "rm" (dst_seg), "rm" (src_seg) + : "r" (dst_seg), "r" (src_seg) : "esi", "edi"); syssize -= paras; diff --git a/arch/i386/boot/tools/build.c b/arch/i386/boot/tools/build.c index 886f47d8a488..b4248740ff0d 100644 --- a/arch/i386/boot/tools/build.c +++ b/arch/i386/boot/tools/build.c @@ -5,7 +5,7 @@ */ /* - * This file builds a disk-image from three different files: + * This file builds a disk-image from two different files: * * - setup: 8086 machine code, sets up system parm * - system: 80386 code for actual system diff --git a/arch/i386/boot/tty.c b/arch/i386/boot/tty.c index a8db78736b02..9c668aad3515 100644 --- a/arch/i386/boot/tty.c +++ b/arch/i386/boot/tty.c @@ -31,7 +31,7 @@ void __attribute__((section(".inittext"))) putchar(int ch) /* int $0x10 is known to have bugs involving touching registers it shouldn't. Be extra conservative... */ - asm volatile("pushal; int $0x10; popal" + asm volatile("pushal; pushw %%ds; int $0x10; popw %%ds; popal" : : "b" (0x0007), "c" (0x0001), "a" (0x0e00|ch)); } diff --git a/arch/i386/boot/video.c b/arch/i386/boot/video.c index 3bb3573cd6a1..958130ef0042 100644 --- a/arch/i386/boot/video.c +++ b/arch/i386/boot/video.c @@ -195,7 +195,7 @@ static void vga_recalc_vertical(void) { unsigned int font_size, rows; u16 crtc; - u8 ov; + u8 pt, ov; set_fs(0); font_size = rdfs8(0x485); /* BIOS: font size (pixels) */ @@ -206,7 +206,12 @@ static void vga_recalc_vertical(void) crtc = vga_crtc(); + pt = in_idx(crtc, 0x11); + pt &= ~0x80; /* Unlock CR0-7 */ + out_idx(pt, crtc, 0x11); + out_idx((u8)rows, crtc, 0x12); /* Lower height register */ + ov = in_idx(crtc, 0x07); /* Overflow register */ ov &= 0xbd; ov |= (rows >> (8-1)) & 0x02; @@ -411,7 +416,7 @@ static void restore_screen(void) "1: rep;stosl ; " "popw %%es" : "+D" (dst), "+c" (npad) - : "bdSm" (video_segment), + : "bdS" (video_segment), "a" (0x07200720)); } diff --git a/arch/i386/boot/video.h b/arch/i386/boot/video.h index 29eca1710b2c..b92447d51213 100644 --- a/arch/i386/boot/video.h +++ b/arch/i386/boot/video.h @@ -117,8 +117,15 @@ extern int graphic_mode; /* Graphics mode with linear frame buffer */ * int $0x10 is notorious for touching registers it shouldn't. * gcc doesn't like %ebp being clobbered, so define it as a push/pop * sequence here. + * + * A number of systems, including the original PC can clobber %bp in + * certain circumstances, like when scrolling. There exists at least + * one Trident video card which could clobber DS under a set of + * circumstances that we are unlikely to encounter (scrolling when + * using an extended graphics mode of more than 800x600 pixels), but + * it's cheap insurance to deal with that here. 
*/ -#define INT10 "pushl %%ebp; int $0x10; popl %%ebp" +#define INT10 "pushl %%ebp; pushw %%ds; int $0x10; popw %%ds; popl %%ebp" /* Accessing VGA indexed registers */ static inline u8 in_idx(u16 port, u8 index) diff --git a/arch/i386/boot/voyager.c b/arch/i386/boot/voyager.c index 9221614d0db8..61c8fe0453be 100644 --- a/arch/i386/boot/voyager.c +++ b/arch/i386/boot/voyager.c @@ -32,7 +32,7 @@ int query_voyager(void) "setc %0 ; " "movw %%es, %1 ; " "popw %%es" - : "=qm" (err), "=rm" (es), "=D" (di) + : "=q" (err), "=r" (es), "=D" (di) : "a" (0xffc0)); if (err) diff --git a/arch/i386/defconfig b/arch/i386/defconfig index 1a3a2217b7c2..54ee1764fdae 100644 --- a/arch/i386/defconfig +++ b/arch/i386/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.22-rc2 -# Mon May 21 13:23:44 2007 +# Linux kernel version: 2.6.22-git14 +# Fri Jul 20 09:53:15 2007 # CONFIG_X86_32=y CONFIG_GENERIC_TIME=y @@ -37,19 +37,18 @@ CONFIG_LOCALVERSION="" CONFIG_LOCALVERSION_AUTO=y CONFIG_SWAP=y CONFIG_SYSVIPC=y -# CONFIG_IPC_NS is not set CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y # CONFIG_BSD_PROCESS_ACCT is not set # CONFIG_TASKSTATS is not set -# CONFIG_UTS_NS is not set +# CONFIG_USER_NS is not set # CONFIG_AUDIT is not set CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=18 # CONFIG_CPUSETS is not set CONFIG_SYSFS_DEPRECATED=y -# CONFIG_RELAY is not set +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" CONFIG_CC_OPTIMIZE_FOR_SIZE=y @@ -73,16 +72,13 @@ CONFIG_TIMERFD=y CONFIG_EVENTFD=y CONFIG_SHMEM=y CONFIG_VM_EVENT_COUNTERS=y -CONFIG_SLAB=y -# CONFIG_SLUB is not set +CONFIG_SLUB_DEBUG=y +# CONFIG_SLAB is not set +CONFIG_SLUB=y # CONFIG_SLOB is not set CONFIG_RT_MUTEXES=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 - -# -# Loadable module support -# CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y @@ -90,14 +86,11 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set # CONFIG_KMOD is not set CONFIG_STOP_MACHINE=y - -# -# Block layer -# CONFIG_BLOCK=y CONFIG_LBD=y # CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_LSF is not set +# CONFIG_BLK_DEV_BSG is not set # # IO Schedulers @@ -166,7 +159,6 @@ CONFIG_X86_WP_WORKS_OK=y CONFIG_X86_INVLPG=y CONFIG_X86_BSWAP=y CONFIG_X86_POPAD_OK=y -CONFIG_X86_CMPXCHG64=y CONFIG_X86_GOOD_APIC=y CONFIG_X86_INTEL_USERCOPY=y CONFIG_X86_USE_PPRO_CHECKSUM=y @@ -202,6 +194,7 @@ CONFIG_X86_CPUID=y # CONFIG_EDD is not set # CONFIG_DELL_RBU is not set # CONFIG_DCDBAS is not set +CONFIG_DMIID=y # CONFIG_NOHIGHMEM is not set CONFIG_HIGHMEM4G=y # CONFIG_HIGHMEM64G is not set @@ -218,7 +211,9 @@ CONFIG_FLAT_NODE_MEM_MAP=y CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_RESOURCES_64BIT=y CONFIG_ZONE_DMA_FLAG=1 +CONFIG_BOUNCE=y CONFIG_NR_QUICK=1 +CONFIG_VIRT_TO_BUS=y # CONFIG_HIGHPTE is not set # CONFIG_MATH_EMULATION is not set CONFIG_MTRR=y @@ -245,7 +240,6 @@ CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y CONFIG_PM=y CONFIG_PM_LEGACY=y # CONFIG_PM_DEBUG is not set -# CONFIG_PM_SYSFS_DEPRECATED is not set # # ACPI (Advanced Configuration and Power Interface) Support @@ -285,7 +279,7 @@ CONFIG_CPU_FREQ_GOV_PERFORMANCE=y # CONFIG_CPU_FREQ_GOV_POWERSAVE is not set CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y -# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y # # CPUFreq processor drivers @@ -326,7 +320,7 @@ CONFIG_PCI_MMCONFIG=y CONFIG_ARCH_SUPPORTS_MSI=y CONFIG_PCI_MSI=y # CONFIG_PCI_DEBUG is not set -CONFIG_HT_IRQ=y +# CONFIG_HT_IRQ is not 
set CONFIG_ISA_DMA_API=y # CONFIG_ISA is not set # CONFIG_MCA is not set @@ -382,7 +376,7 @@ CONFIG_IP_PNP_DHCP=y CONFIG_INET_TUNNEL=y CONFIG_INET_XFRM_MODE_TRANSPORT=y CONFIG_INET_XFRM_MODE_TUNNEL=y -CONFIG_INET_XFRM_MODE_BEET=y +# CONFIG_INET_XFRM_MODE_BEET is not set CONFIG_INET_DIAG=y CONFIG_INET_TCP_DIAG=y # CONFIG_TCP_CONG_ADVANCED is not set @@ -401,27 +395,15 @@ CONFIG_IPV6=y # CONFIG_INET6_TUNNEL is not set CONFIG_INET6_XFRM_MODE_TRANSPORT=y CONFIG_INET6_XFRM_MODE_TUNNEL=y -CONFIG_INET6_XFRM_MODE_BEET=y +# CONFIG_INET6_XFRM_MODE_BEET is not set # CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set CONFIG_IPV6_SIT=y # CONFIG_IPV6_TUNNEL is not set # CONFIG_IPV6_MULTIPLE_TABLES is not set # CONFIG_NETWORK_SECMARK is not set # CONFIG_NETFILTER is not set - -# -# DCCP Configuration (EXPERIMENTAL) -# # CONFIG_IP_DCCP is not set - -# -# SCTP Configuration (EXPERIMENTAL) -# # CONFIG_IP_SCTP is not set - -# -# TIPC Configuration (EXPERIMENTAL) -# # CONFIG_TIPC is not set # CONFIG_ATM is not set # CONFIG_BRIDGE is not set @@ -458,6 +440,7 @@ CONFIG_IPV6_SIT=y # CONFIG_MAC80211 is not set # CONFIG_IEEE80211 is not set # CONFIG_RFKILL is not set +# CONFIG_NET_9P is not set # # Device Drivers @@ -472,21 +455,9 @@ CONFIG_FW_LOADER=y # CONFIG_DEBUG_DRIVER is not set # CONFIG_DEBUG_DEVRES is not set # CONFIG_SYS_HYPERVISOR is not set - -# -# Connector - unified userspace <-> kernelspace linker -# # CONFIG_CONNECTOR is not set # CONFIG_MTD is not set - -# -# Parallel port support -# # CONFIG_PARPORT is not set - -# -# Plug and Play support -# CONFIG_PNP=y # CONFIG_PNP_DEBUG is not set @@ -494,10 +465,7 @@ CONFIG_PNP=y # Protocols # CONFIG_PNPACPI=y - -# -# Block devices -# +CONFIG_BLK_DEV=y CONFIG_BLK_DEV_FD=y # CONFIG_BLK_CPQ_DA is not set # CONFIG_BLK_CPQ_CISS_DA is not set @@ -515,17 +483,14 @@ CONFIG_BLK_DEV_RAM_SIZE=4096 CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set - -# -# Misc devices -# +CONFIG_MISC_DEVICES=y # CONFIG_IBM_ASM is not set # CONFIG_PHANTOM is not set +# CONFIG_EEPROM_93CX6 is not set # CONFIG_SGI_IOC4 is not set # CONFIG_TIFM_CORE is not set # CONFIG_SONY_LAPTOP is not set # CONFIG_THINKPAD_ACPI is not set -# CONFIG_BLINK is not set CONFIG_IDE=y CONFIG_BLK_DEV_IDE=y @@ -597,6 +562,7 @@ CONFIG_BLK_DEV_IDEDMA=y # # CONFIG_RAID_ATTRS is not set CONFIG_SCSI=y +CONFIG_SCSI_DMA=y # CONFIG_SCSI_TGT is not set CONFIG_SCSI_NETLINK=y # CONFIG_SCSI_PROC_FS is not set @@ -607,8 +573,9 @@ CONFIG_SCSI_NETLINK=y CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_ST is not set # CONFIG_CHR_DEV_OSST is not set -# CONFIG_BLK_DEV_SR is not set -# CONFIG_CHR_DEV_SG is not set +CONFIG_BLK_DEV_SR=y +# CONFIG_BLK_DEV_SR_VENDOR is not set +CONFIG_CHR_DEV_SG=y # CONFIG_CHR_DEV_SCH is not set # @@ -668,6 +635,7 @@ CONFIG_AIC79XX_DEBUG_MASK=0 # CONFIG_SCSI_INIA100 is not set # CONFIG_SCSI_STEX is not set # CONFIG_SCSI_SYM53C8XX_2 is not set +# CONFIG_SCSI_IPR is not set # CONFIG_SCSI_QLOGIC_1280 is not set # CONFIG_SCSI_QLA_FC is not set # CONFIG_SCSI_QLA_ISCSI is not set @@ -676,14 +644,73 @@ CONFIG_AIC79XX_DEBUG_MASK=0 # CONFIG_SCSI_DC390T is not set # CONFIG_SCSI_NSP32 is not set # CONFIG_SCSI_DEBUG is not set -# CONFIG_SCSI_ESP_CORE is not set # CONFIG_SCSI_SRP is not set -# CONFIG_ATA is not set - -# -# Multi-device support (RAID and LVM) -# -# CONFIG_MD is not set +CONFIG_ATA=y +# CONFIG_ATA_NONSTANDARD is not set +CONFIG_ATA_ACPI=y +CONFIG_SATA_AHCI=y +CONFIG_SATA_SVW=y +CONFIG_ATA_PIIX=y +# CONFIG_SATA_MV is not set +CONFIG_SATA_NV=y +# CONFIG_PDC_ADMA 
is not set +# CONFIG_SATA_QSTOR is not set +# CONFIG_SATA_PROMISE is not set +# CONFIG_SATA_SX4 is not set +CONFIG_SATA_SIL=y +# CONFIG_SATA_SIL24 is not set +# CONFIG_SATA_SIS is not set +# CONFIG_SATA_ULI is not set +CONFIG_SATA_VIA=y +# CONFIG_SATA_VITESSE is not set +# CONFIG_SATA_INIC162X is not set +# CONFIG_PATA_ALI is not set +# CONFIG_PATA_AMD is not set +# CONFIG_PATA_ARTOP is not set +# CONFIG_PATA_ATIIXP is not set +# CONFIG_PATA_CMD640_PCI is not set +# CONFIG_PATA_CMD64X is not set +# CONFIG_PATA_CS5520 is not set +# CONFIG_PATA_CS5530 is not set +# CONFIG_PATA_CS5535 is not set +# CONFIG_PATA_CYPRESS is not set +# CONFIG_PATA_EFAR is not set +# CONFIG_ATA_GENERIC is not set +# CONFIG_PATA_HPT366 is not set +# CONFIG_PATA_HPT37X is not set +# CONFIG_PATA_HPT3X2N is not set +# CONFIG_PATA_HPT3X3 is not set +# CONFIG_PATA_IT821X is not set +# CONFIG_PATA_IT8213 is not set +# CONFIG_PATA_JMICRON is not set +# CONFIG_PATA_TRIFLEX is not set +# CONFIG_PATA_MARVELL is not set +# CONFIG_PATA_MPIIX is not set +# CONFIG_PATA_OLDPIIX is not set +# CONFIG_PATA_NETCELL is not set +# CONFIG_PATA_NS87410 is not set +# CONFIG_PATA_OPTI is not set +# CONFIG_PATA_OPTIDMA is not set +# CONFIG_PATA_PDC_OLD is not set +# CONFIG_PATA_RADISYS is not set +# CONFIG_PATA_RZ1000 is not set +# CONFIG_PATA_SC1200 is not set +# CONFIG_PATA_SERVERWORKS is not set +# CONFIG_PATA_PDC2027X is not set +# CONFIG_PATA_SIL680 is not set +# CONFIG_PATA_SIS is not set +# CONFIG_PATA_VIA is not set +# CONFIG_PATA_WINBOND is not set +CONFIG_MD=y +# CONFIG_BLK_DEV_MD is not set +CONFIG_BLK_DEV_DM=y +# CONFIG_DM_DEBUG is not set +# CONFIG_DM_CRYPT is not set +# CONFIG_DM_SNAPSHOT is not set +# CONFIG_DM_MIRROR is not set +# CONFIG_DM_ZERO is not set +# CONFIG_DM_MULTIPATH is not set +# CONFIG_DM_DELAY is not set # # Fusion MPT device support @@ -724,42 +751,27 @@ CONFIG_IEEE1394_OHCI1394=y # CONFIG_IEEE1394_ETH1394 is not set # CONFIG_IEEE1394_DV1394 is not set CONFIG_IEEE1394_RAWIO=y - -# -# I2O device support -# # CONFIG_I2O is not set -# CONFIG_MACINTOSH_DRIVERS is not set - -# -# Network device support -# +CONFIG_MACINTOSH_DRIVERS=y +# CONFIG_MAC_EMUMOUSEBTN is not set CONFIG_NETDEVICES=y +CONFIG_NETDEVICES_MULTIQUEUE=y # CONFIG_DUMMY is not set # CONFIG_BONDING is not set +# CONFIG_MACVLAN is not set # CONFIG_EQUALIZER is not set # CONFIG_TUN is not set # CONFIG_NET_SB1000 is not set - -# -# ARCnet devices -# # CONFIG_ARCNET is not set # CONFIG_PHYLIB is not set - -# -# Ethernet (10 or 100Mbit) -# CONFIG_NET_ETHERNET=y CONFIG_MII=y # CONFIG_HAPPYMEAL is not set # CONFIG_SUNGEM is not set # CONFIG_CASSINI is not set -# CONFIG_NET_VENDOR_3COM is not set - -# -# Tulip family network device support -# +CONFIG_NET_VENDOR_3COM=y +CONFIG_VORTEX=y +# CONFIG_TYPHOON is not set CONFIG_NET_TULIP=y # CONFIG_DE2104X is not set CONFIG_TULIP=y @@ -810,7 +822,6 @@ CONFIG_R8169=y # CONFIG_SIS190 is not set # CONFIG_SKGE is not set CONFIG_SKY2=y -# CONFIG_SK98LIN is not set # CONFIG_VIA_VELOCITY is not set CONFIG_TIGON3=y CONFIG_BNX2=y @@ -824,10 +835,6 @@ CONFIG_NETDEV_10000=y # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_MLX4_CORE is not set - -# -# Token Ring devices -# # CONFIG_TR is not set # @@ -856,15 +863,7 @@ CONFIG_NETCONSOLE=y CONFIG_NETPOLL=y # CONFIG_NETPOLL_TRAP is not set CONFIG_NET_POLL_CONTROLLER=y - -# -# ISDN subsystem -# # CONFIG_ISDN is not set - -# -# Telephony Support -# # CONFIG_PHONE is not set # @@ -872,6 +871,7 @@ CONFIG_NET_POLL_CONTROLLER=y # CONFIG_INPUT=y # 
CONFIG_INPUT_FF_MEMLESS is not set +# CONFIG_INPUT_POLLDEV is not set # # Userland interfaces @@ -937,6 +937,7 @@ CONFIG_HW_CONSOLE=y # CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_FIX_EARLYCON_MEM=y CONFIG_SERIAL_8250_PCI=y CONFIG_SERIAL_8250_PNP=y CONFIG_SERIAL_8250_NR_UARTS=4 @@ -952,10 +953,6 @@ CONFIG_SERIAL_CORE_CONSOLE=y CONFIG_UNIX98_PTYS=y CONFIG_LEGACY_PTYS=y CONFIG_LEGACY_PTY_COUNT=256 - -# -# IPMI -# # CONFIG_IPMI_HANDLER is not set # CONFIG_WATCHDOG is not set CONFIG_HW_RANDOM=y @@ -989,11 +986,7 @@ CONFIG_MAX_RAW_DEVS=256 CONFIG_HPET=y # CONFIG_HPET_RTC_IRQ is not set CONFIG_HPET_MMAP=y -CONFIG_HANGCHECK_TIMER=y - -# -# TPM devices -# +# CONFIG_HANGCHECK_TIMER is not set # CONFIG_TCG_TPM is not set # CONFIG_TELCLOCK is not set CONFIG_DEVPORT=y @@ -1004,11 +997,8 @@ CONFIG_DEVPORT=y # # CONFIG_SPI is not set # CONFIG_SPI_MASTER is not set - -# -# Dallas's 1-wire bus -# # CONFIG_W1 is not set +# CONFIG_POWER_SUPPLY is not set # CONFIG_HWMON is not set # @@ -1042,7 +1032,7 @@ CONFIG_DAB=y CONFIG_VGA_CONSOLE=y CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=128 -# CONFIG_VIDEO_SELECT is not set +CONFIG_VIDEO_SELECT=y CONFIG_DUMMY_CONSOLE=y # @@ -1059,15 +1049,11 @@ CONFIG_SOUND=y # Open Sound System # CONFIG_SOUND_PRIME=y -# CONFIG_OSS_OBSOLETE is not set # CONFIG_SOUND_TRIDENT is not set # CONFIG_SOUND_MSNDCLAS is not set # CONFIG_SOUND_MSNDPIN is not set # CONFIG_SOUND_OSS is not set - -# -# HID Devices -# +CONFIG_HID_SUPPORT=y CONFIG_HID=y # CONFIG_HID_DEBUG is not set @@ -1078,10 +1064,7 @@ CONFIG_USB_HID=y # CONFIG_USB_HIDINPUT_POWERBOOK is not set # CONFIG_HID_FF is not set # CONFIG_USB_HIDDEV is not set - -# -# USB support -# +CONFIG_USB_SUPPORT=y CONFIG_USB_ARCH_HAS_HCD=y CONFIG_USB_ARCH_HAS_OHCI=y CONFIG_USB_ARCH_HAS_EHCI=y @@ -1095,6 +1078,7 @@ CONFIG_USB_DEVICEFS=y # CONFIG_USB_DEVICE_CLASS is not set # CONFIG_USB_DYNAMIC_MINORS is not set # CONFIG_USB_SUSPEND is not set +# CONFIG_USB_PERSIST is not set # CONFIG_USB_OTG is not set # @@ -1104,7 +1088,6 @@ CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_SPLIT_ISO is not set # CONFIG_USB_EHCI_ROOT_HUB_TT is not set # CONFIG_USB_EHCI_TT_NEWSCHED is not set -# CONFIG_USB_EHCI_BIG_ENDIAN_MMIO is not set # CONFIG_USB_ISP116X_HCD is not set CONFIG_USB_OHCI_HCD=y # CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set @@ -1112,6 +1095,7 @@ CONFIG_USB_OHCI_HCD=y CONFIG_USB_OHCI_LITTLE_ENDIAN=y CONFIG_USB_UHCI_HCD=y # CONFIG_USB_SL811_HCD is not set +# CONFIG_USB_R8A66597_HCD is not set # # USB Device Class drivers @@ -1202,15 +1186,7 @@ CONFIG_USB_MON=y # # LED Triggers # - -# -# InfiniBand support -# # CONFIG_INFINIBAND is not set - -# -# EDAC - error detection and reporting (RAS) (EXPERIMENTAL) -# # CONFIG_EDAC is not set # @@ -1230,11 +1206,13 @@ CONFIG_USB_MON=y # # DMA Devices # +CONFIG_VIRTUALIZATION=y +# CONFIG_KVM is not set # -# Virtualization +# Userspace I/O # -# CONFIG_KVM is not set +# CONFIG_UIO is not set # # File systems @@ -1272,6 +1250,7 @@ CONFIG_DNOTIFY=y # CONFIG_AUTOFS_FS is not set CONFIG_AUTOFS4_FS=y # CONFIG_FUSE_FS is not set +CONFIG_GENERIC_ACL=y # # CD-ROM/DVD Filesystems @@ -1299,7 +1278,7 @@ CONFIG_PROC_KCORE=y CONFIG_PROC_SYSCTL=y CONFIG_SYSFS=y CONFIG_TMPFS=y -# CONFIG_TMPFS_POSIX_ACL is not set +CONFIG_TMPFS_POSIX_ACL=y CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y CONFIG_RAMFS=y @@ -1349,7 +1328,6 @@ CONFIG_SUNRPC=y # CONFIG_NCP_FS is not set # CONFIG_CODA_FS is not set # CONFIG_AFS_FS is not set -# CONFIG_9P_FS is not set # # Partition Types @@ -1405,10 +1383,7 @@ CONFIG_NLS_UTF8=y # 
Distributed Lock Manager # # CONFIG_DLM is not set - -# -# Instrumentation Support -# +CONFIG_INSTRUMENTATION=y CONFIG_PROFILING=y CONFIG_OPROFILE=y CONFIG_KPROBES=y @@ -1418,7 +1393,7 @@ CONFIG_KPROBES=y # CONFIG_TRACE_IRQFLAGS_SUPPORT=y # CONFIG_PRINTK_TIME is not set -CONFIG_ENABLE_MUST_CHECK=y +# CONFIG_ENABLE_MUST_CHECK is not set CONFIG_MAGIC_SYSRQ=y CONFIG_UNUSED_SYMBOLS=y # CONFIG_DEBUG_FS is not set @@ -1426,15 +1401,17 @@ CONFIG_UNUSED_SYMBOLS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set CONFIG_DETECT_SOFTLOCKUP=y +# CONFIG_SCHED_DEBUG is not set # CONFIG_SCHEDSTATS is not set -# CONFIG_TIMER_STATS is not set -# CONFIG_DEBUG_SLAB is not set +CONFIG_TIMER_STATS=y +# CONFIG_SLUB_DEBUG_ON is not set # CONFIG_DEBUG_RT_MUTEXES is not set # CONFIG_RT_MUTEX_TESTER is not set # CONFIG_DEBUG_SPINLOCK is not set # CONFIG_DEBUG_MUTEXES is not set # CONFIG_DEBUG_LOCK_ALLOC is not set # CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set # CONFIG_DEBUG_KOBJECT is not set @@ -1444,7 +1421,6 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_LIST is not set # CONFIG_FRAME_POINTER is not set -# CONFIG_UNWIND_INFO is not set # CONFIG_FORCED_INLINING is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set @@ -1463,10 +1439,6 @@ CONFIG_DOUBLEFAULT=y # # CONFIG_KEYS is not set # CONFIG_SECURITY is not set - -# -# Cryptographic options -# # CONFIG_CRYPTO is not set # @@ -1477,6 +1449,7 @@ CONFIG_BITREVERSE=y # CONFIG_CRC16 is not set # CONFIG_CRC_ITU_T is not set CONFIG_CRC32=y +# CONFIG_CRC7 is not set # CONFIG_LIBCRC32C is not set CONFIG_ZLIB_INFLATE=y CONFIG_PLIST=y diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 06da59f6f837..dbe5e87e0d66 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_VM86) += vm86.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o +obj-$(CONFIG_MGEODE_LX) += geode.o obj-$(CONFIG_VMI) += vmi.o vmiclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index a574cd2c8b61..cacdd883bf2b 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -618,6 +618,8 @@ static int __init acpi_parse_sbf(struct acpi_table_header *table) #ifdef CONFIG_HPET_TIMER #include <asm/hpet.h> +static struct __initdata resource *hpet_res; + static int __init acpi_parse_hpet(struct acpi_table_header *table) { struct acpi_table_hpet *hpet_tbl; @@ -638,8 +640,42 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", hpet_tbl->id, hpet_address); + /* + * Allocate and initialize the HPET firmware resource for adding into + * the resource tree during the lateinit timeframe. + */ +#define HPET_RESOURCE_NAME_SIZE 9 + hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE); + + if (!hpet_res) + return 0; + + memset(hpet_res, 0, sizeof(*hpet_res)); + hpet_res->name = (void *)&hpet_res[1]; + hpet_res->flags = IORESOURCE_MEM; + snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE, "HPET %u", + hpet_tbl->sequence); + + hpet_res->start = hpet_address; + hpet_res->end = hpet_address + (1 * 1024) - 1; + return 0; } + +/* + * hpet_insert_resource inserts the HPET resources used into the resource + * tree. 
+ */ +static __init int hpet_insert_resource(void) +{ + if (!hpet_res) + return 1; + + return insert_resource(&iomem_resource, hpet_res); +} + +late_initcall(hpet_insert_resource); + #else #define acpi_parse_hpet NULL #endif @@ -950,14 +986,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { }, { .callback = force_acpi_ht, - .ident = "DELL GX240", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Dell Computer Corporation"), - DMI_MATCH(DMI_BOARD_NAME, "OptiPlex GX240"), - }, - }, - { - .callback = force_acpi_ht, .ident = "HP VISUALIZE NT Workstation", .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), diff --git a/arch/i386/kernel/acpi/sleep.c b/arch/i386/kernel/acpi/sleep.c index 4ee83577bf61..c42b5ab49deb 100644 --- a/arch/i386/kernel/acpi/sleep.c +++ b/arch/i386/kernel/acpi/sleep.c @@ -14,7 +14,7 @@ /* address in low memory of the wakeup routine. */ unsigned long acpi_wakeup_address = 0; -unsigned long acpi_video_flags; +unsigned long acpi_realmode_flags; extern char wakeup_start, wakeup_end; extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); @@ -68,9 +68,11 @@ static int __init acpi_sleep_setup(char *str) { while ((str != NULL) && (*str != '\0')) { if (strncmp(str, "s3_bios", 7) == 0) - acpi_video_flags = 1; + acpi_realmode_flags |= 1; if (strncmp(str, "s3_mode", 7) == 0) - acpi_video_flags |= 2; + acpi_realmode_flags |= 2; + if (strncmp(str, "s3_beep", 7) == 0) + acpi_realmode_flags |= 4; str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); @@ -80,9 +82,11 @@ static int __init acpi_sleep_setup(char *str) __setup("acpi_sleep=", acpi_sleep_setup); +/* Ouch, we want to delete this. We already have better version in userspace, in + s2ram from suspend.sf.net project */ static __init int reset_videomode_after_s3(struct dmi_system_id *d) { - acpi_video_flags |= 2; + acpi_realmode_flags |= 2; return 0; } diff --git a/arch/i386/kernel/acpi/wakeup.S b/arch/i386/kernel/acpi/wakeup.S index a2295a34b2c7..ed0a0f2c1597 100644 --- a/arch/i386/kernel/acpi/wakeup.S +++ b/arch/i386/kernel/acpi/wakeup.S @@ -13,6 +13,21 @@ # cs = 0x1234, eip = 0x05 # +#define BEEP \ + inb $97, %al; \ + outb %al, $0x80; \ + movb $3, %al; \ + outb %al, $97; \ + outb %al, $0x80; \ + movb $-74, %al; \ + outb %al, $67; \ + outb %al, $0x80; \ + movb $-119, %al; \ + outb %al, $66; \ + outb %al, $0x80; \ + movb $15, %al; \ + outb %al, $66; + ALIGN .align 4096 ENTRY(wakeup_start) @@ -31,6 +46,11 @@ wakeup_code: movw %cs, %ax movw %ax, %ds # Make ds:0 point to wakeup_start movw %ax, %ss + + testl $4, realmode_flags - wakeup_code + jz 1f + BEEP +1: mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board movw $0x0e00 + 'S', %fs:(0x12) @@ -41,7 +61,7 @@ wakeup_code: cmpl $0x12345678, %eax jne bogus_real_magic - testl $1, video_flags - wakeup_code + testl $1, realmode_flags - wakeup_code jz 1f lcall $0xc000,$3 movw %cs, %ax @@ -49,7 +69,7 @@ wakeup_code: movw %ax, %ss 1: - testl $2, video_flags - wakeup_code + testl $2, realmode_flags - wakeup_code jz 1f mov video_mode - wakeup_code, %ax call mode_set @@ -88,7 +108,11 @@ wakeup_code: cmpl $0x12345678, %eax jne bogus_real_magic - ljmpl $__KERNEL_CS,$wakeup_pmode_return + testl $8, realmode_flags - wakeup_code + jz 1f + BEEP +1: + ljmpl $__KERNEL_CS, $wakeup_pmode_return real_save_gdt: .word 0 .long 0 @@ -97,7 +121,8 @@ real_save_cr3: .long 0 real_save_cr4: .long 0 real_magic: .long 0 video_mode: .long 0 -video_flags: .long 0 +realmode_flags: .long 0 +beep_flags: .long 0 real_efer_save_restore: .long 0 
real_save_efer_edx: .long 0 real_save_efer_eax: .long 0 @@ -260,8 +285,8 @@ ENTRY(acpi_copy_wakeup_routine) movl saved_videomode, %edx movl %edx, video_mode - wakeup_start (%eax) - movl acpi_video_flags, %edx - movl %edx, video_flags - wakeup_start (%eax) + movl acpi_realmode_flags, %edx + movl %edx, realmode_flags - wakeup_start (%eax) movl $0x12345678, real_magic - wakeup_start (%eax) movl $0x12345678, saved_magic popl %ebx diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index d8cda14fff8b..c3750c2c4113 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -2,12 +2,17 @@ #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/list.h> +#include <linux/kprobes.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> #include <asm/alternative.h> #include <asm/sections.h> +#include <asm/pgtable.h> +#include <asm/mce.h> +#include <asm/nmi.h> -static int noreplace_smp = 0; -static int smp_alt_once = 0; -static int debug_alternative = 0; +#ifdef CONFIG_HOTPLUG_CPU +static int smp_alt_once; static int __init bootonly(char *str) { @@ -15,6 +20,11 @@ static int __init bootonly(char *str) return 1; } __setup("smp-alt-boot", bootonly); +#else +#define smp_alt_once 1 +#endif + +static int debug_alternative; static int __init debug_alt(char *str) { @@ -23,6 +33,8 @@ static int __init debug_alt(char *str) } __setup("debug-alternative", debug_alt); +static int noreplace_smp; + static int __init setup_noreplace_smp(char *str) { noreplace_smp = 1; @@ -144,7 +156,7 @@ static void nop_out(void *insns, unsigned int len) unsigned int noplen = len; if (noplen > ASM_NOP_MAX) noplen = ASM_NOP_MAX; - memcpy(insns, noptable[noplen], noplen); + text_poke(insns, noptable[noplen], noplen); insns += noplen; len -= noplen; } @@ -196,7 +208,7 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) continue; if (*ptr > text_end) continue; - **ptr = 0xf0; /* lock prefix */ + text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */ }; } @@ -354,10 +366,6 @@ void apply_paravirt(struct paravirt_patch_site *start, /* Pad the rest with nops */ nop_out(p->instr + used, p->len - used); } - - /* Sync to be conservative, in case we patched following - * instructions */ - sync_core(); } extern struct paravirt_patch_site __start_parainstructions[], __stop_parainstructions[]; @@ -367,6 +375,14 @@ void __init alternative_instructions(void) { unsigned long flags; + /* The patching is not fully atomic, so try to avoid local interruptions + that might execute the to be patched code. + Other CPUs are not running. */ + stop_nmi(); +#ifdef CONFIG_MCE + stop_mce(); +#endif + local_irq_save(flags); apply_alternatives(__alt_instructions, __alt_instructions_end); @@ -376,8 +392,6 @@ void __init alternative_instructions(void) #ifdef CONFIG_HOTPLUG_CPU if (num_possible_cpus() < 2) smp_alt_once = 1; -#else - smp_alt_once = 1; #endif #ifdef CONFIG_SMP @@ -401,4 +415,37 @@ void __init alternative_instructions(void) #endif apply_paravirt(__parainstructions, __parainstructions_end); local_irq_restore(flags); + + restart_nmi(); +#ifdef CONFIG_MCE + restart_mce(); +#endif +} + +/* + * Warning: + * When you use this code to patch more than one byte of an instruction + * you need to make sure that other CPUs cannot execute this code in parallel. + * Also no thread must be currently preempted in the middle of these instructions. 
+ * And on the local CPU you need to be protected again NMI or MCE handlers + * seeing an inconsistent instruction while you patch. + */ +void __kprobes text_poke(void *oaddr, unsigned char *opcode, int len) +{ + u8 *addr = oaddr; + if (!pte_write(*lookup_address((unsigned long)addr))) { + struct page *p[2] = { virt_to_page(addr), virt_to_page(addr+PAGE_SIZE) }; + addr = vmap(p, 2, VM_MAP, PAGE_KERNEL); + if (!addr) + return; + addr += ((unsigned long)oaddr) % PAGE_SIZE; + } + memcpy(addr, opcode, len); + sync_core(); + /* Not strictly needed, but can speed CPU recovery up. Ignore cross cacheline + case. */ + if (cpu_has_clflush) + asm("clflush (%0) " :: "r" (oaddr) : "memory"); + if (addr != oaddr) + vunmap(addr); } diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 67824f3bb974..bfc6cb7df7e7 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -263,6 +263,9 @@ static void lapic_timer_setup(enum clock_event_mode mode, v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); apic_write_around(APIC_LVTT, v); break; + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here */ + break; } local_irq_restore(flags); @@ -315,7 +318,7 @@ static void __devinit setup_APIC_timer(void) #define LAPIC_CAL_LOOPS (HZ/10) -static __initdata volatile int lapic_cal_loops = -1; +static __initdata int lapic_cal_loops = -1; static __initdata long lapic_cal_t1, lapic_cal_t2; static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; @@ -485,7 +488,7 @@ void __init setup_boot_APIC_clock(void) /* Let the interrupts run */ local_irq_enable(); - while(lapic_cal_loops <= LAPIC_CAL_LOOPS) + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) cpu_relax(); local_irq_disable(); @@ -521,6 +524,9 @@ void __init setup_boot_APIC_clock(void) */ if (nmi_watchdog != NMI_IO_APIC) lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + else + printk(KERN_WARNING "APIC timer registered as dummy," + " due to nmi_watchdog=1!\n"); } /* Setup the lapic or request the broadcast */ diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index 4112afe712b9..47001d50a083 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -222,6 +222,7 @@ #include <linux/capability.h> #include <linux/device.h> #include <linux/kernel.h> +#include <linux/freezer.h> #include <linux/smp.h> #include <linux/dmi.h> #include <linux/suspend.h> @@ -2311,7 +2312,6 @@ static int __init apm_init(void) remove_proc_entry("apm", NULL); return err; } - kapmd_task->flags |= PF_NOFREEZE; wake_up_process(kapmd_task); if (num_online_cpus() > 1 && !smp ) { diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 27a776c9044d..7288ac88d746 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -17,6 +17,13 @@ #include <asm/thread_info.h> #include <asm/elf.h> +#include <xen/interface/xen.h> + +#ifdef CONFIG_LGUEST_GUEST +#include <linux/lguest.h> +#include "../../../drivers/lguest/lg.h" +#endif + #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -59,6 +66,7 @@ void foo(void) OFFSET(TI_addr_limit, thread_info, addr_limit); OFFSET(TI_restart_block, thread_info, restart_block); OFFSET(TI_sysenter_return, thread_info, sysenter_return); + OFFSET(TI_cpu, thread_info, cpu); BLANK(); OFFSET(GDS_size, Xgt_desc_struct, size); @@ -115,4 +123,25 @@ void foo(void) OFFSET(PARAVIRT_iret, paravirt_ops, iret); OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); #endif + +#ifdef CONFIG_XEN + BLANK(); + 
OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); + OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); +#endif + +#ifdef CONFIG_LGUEST_GUEST + BLANK(); + OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); + OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); + OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); + OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3); + OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp); + OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc); + OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc); + OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt); + OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum); + OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); + OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); +#endif } diff --git a/arch/i386/kernel/cpu/Makefile b/arch/i386/kernel/cpu/Makefile index 0b6a8551e9e2..778396c78d65 100644 --- a/arch/i386/kernel/cpu/Makefile +++ b/arch/i386/kernel/cpu/Makefile @@ -9,7 +9,6 @@ obj-y += cyrix.o obj-y += centaur.o obj-y += transmeta.o obj-y += intel.o intel_cacheinfo.o addon_cpuid_features.o -obj-y += rise.o obj-y += nexgen.o obj-y += umc.o diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index 6f47eeeb93ea..c7ba455d5ac7 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -231,6 +231,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) switch (c->x86) { case 15: + /* Use K8 tuning for Fam10h and Fam11h */ + case 0x10: + case 0x11: set_bit(X86_FEATURE_K8, c->x86_capability); break; case 6: @@ -272,8 +275,12 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } #endif - if (cpuid_eax(0x80000000) >= 0x80000006) - num_cache_leaves = 3; + if (cpuid_eax(0x80000000) >= 0x80000006) { + if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000)) + num_cache_leaves = 4; + else + num_cache_leaves = 3; + } if (amd_apic_timer_broken()) set_bit(X86_FEATURE_LAPIC_TIMER_BROKEN, c->x86_capability); diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index e5419a9dec88..d506201d397c 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -606,7 +606,6 @@ extern int nsc_init_cpu(void); extern int amd_init_cpu(void); extern int centaur_init_cpu(void); extern int transmeta_init_cpu(void); -extern int rise_init_cpu(void); extern int nexgen_init_cpu(void); extern int umc_init_cpu(void); @@ -618,7 +617,6 @@ void __init early_cpu_init(void) amd_init_cpu(); centaur_init_cpu(); transmeta_init_cpu(); - rise_init_cpu(); nexgen_init_cpu(); umc_init_cpu(); early_cpu_detect(); diff --git a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c index 18c8b67ea3a7..6f846bee2103 100644 --- a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -665,8 +665,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) data->max_freq = perf->states[0].core_frequency * 1000; /* table init */ for (i=0; i<perf->state_count; i++) { - if (i>0 && perf->states[i].core_frequency == - perf->states[i-1].core_frequency) + if (i>0 && perf->states[i].core_frequency >= + data->freq_table[valid_states-1].frequency / 1000) continue; data->freq_table[valid_states].index = i; diff --git a/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c b/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c index 194144539a6f..461dabc4e495 100644 --- 
a/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c @@ -79,7 +79,7 @@ #include <linux/smp.h> #include <linux/cpufreq.h> #include <linux/pci.h> -#include <asm/processor.h> +#include <asm/processor-cyrix.h> #include <asm/errno.h> /* PCI config registers, all at F0 */ diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index e88d2fba156b..122d2d75aa9f 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -4,7 +4,7 @@ #include <linux/pci.h> #include <asm/dma.h> #include <asm/io.h> -#include <asm/processor.h> +#include <asm/processor-cyrix.h> #include <asm/timer.h> #include <asm/pci-direct.h> #include <asm/tsc.h> diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c index e5be819492ef..d5a456d27d82 100644 --- a/arch/i386/kernel/cpu/intel_cacheinfo.c +++ b/arch/i386/kernel/cpu/intel_cacheinfo.c @@ -4,7 +4,7 @@ * Changes: * Venkatesh Pallipadi : Adding cache identification through cpuid(4) * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. - * Andi Kleen : CPUID4 emulation on AMD. + * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. */ #include <linux/init.h> @@ -135,7 +135,7 @@ unsigned short num_cache_leaves; /* AMD doesn't have CPUID4. Emulate it here to report the same information to the user. This makes some assumptions about the machine: - No L3, L2 not shared, no SMT etc. that is currently true on AMD CPUs. + L2 not shared, no SMT etc. that is currently true on AMD CPUs. In theory the TLBs could be reported as fake type (they are in "dummy"). Maybe later */ @@ -159,13 +159,26 @@ union l2_cache { unsigned val; }; +union l3_cache { + struct { + unsigned line_size : 8; + unsigned lines_per_tag : 4; + unsigned assoc : 4; + unsigned res : 2; + unsigned size_encoded : 14; + }; + unsigned val; +}; + static const unsigned short assocs[] = { [1] = 1, [2] = 2, [4] = 4, [6] = 8, - [8] = 16, + [8] = 16, [0xa] = 32, [0xb] = 48, + [0xc] = 64, [0xf] = 0xffff // ?? - }; -static const unsigned char levels[] = { 1, 1, 2 }; -static const unsigned char types[] = { 1, 2, 3 }; +}; + +static const unsigned char levels[] = { 1, 1, 2, 3 }; +static const unsigned char types[] = { 1, 2, 3, 3 }; static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, union _cpuid4_leaf_ebx *ebx, @@ -175,37 +188,58 @@ static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, unsigned line_size, lines_per_tag, assoc, size_in_kb; union l1_cache l1i, l1d; union l2_cache l2; + union l3_cache l3; + union l1_cache *l1 = &l1d; eax->full = 0; ebx->full = 0; ecx->full = 0; cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val); - cpuid(0x80000006, &dummy, &dummy, &l2.val, &dummy); - - if (leaf > 2 || !l1d.val || !l1i.val || !l2.val) - return; - - eax->split.is_self_initializing = 1; - eax->split.type = types[leaf]; - eax->split.level = levels[leaf]; - eax->split.num_threads_sharing = 0; - eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; - - if (leaf <= 1) { - union l1_cache *l1 = leaf == 0 ? 
&l1d : &l1i; + cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val); + + switch (leaf) { + case 1: + l1 = &l1i; + case 0: + if (!l1->val) + return; assoc = l1->assoc; line_size = l1->line_size; lines_per_tag = l1->lines_per_tag; size_in_kb = l1->size_in_kb; - } else { + break; + case 2: + if (!l2.val) + return; assoc = l2.assoc; line_size = l2.line_size; lines_per_tag = l2.lines_per_tag; /* cpu_data has errata corrections for K7 applied */ size_in_kb = current_cpu_data.x86_cache_size; + break; + case 3: + if (!l3.val) + return; + assoc = l3.assoc; + line_size = l3.line_size; + lines_per_tag = l3.lines_per_tag; + size_in_kb = l3.size_encoded * 512; + break; + default: + return; } + eax->split.is_self_initializing = 1; + eax->split.type = types[leaf]; + eax->split.level = levels[leaf]; + if (leaf == 3) + eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1; + else + eax->split.num_threads_sharing = 0; + eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; + + if (assoc == 0xf) eax->split.is_fully_associative = 1; ebx->split.coherency_line_size = line_size - 1; @@ -239,8 +273,7 @@ static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_le return 0; } -/* will only be called once; __init is safe here */ -static int __init find_num_cache_leaves(void) +static int __cpuinit find_num_cache_leaves(void) { unsigned int eax, ebx, ecx, edx; union _cpuid4_leaf_eax cache_eax; @@ -710,7 +743,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) return retval; } -static void __cpuexit cache_remove_dev(struct sys_device * sys_dev) +static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) { unsigned int cpu = sys_dev->id; unsigned long i; diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c index 56cd485b127c..34c781eddee4 100644 --- a/arch/i386/kernel/cpu/mcheck/mce.c +++ b/arch/i386/kernel/cpu/mcheck/mce.c @@ -60,6 +60,20 @@ void mcheck_init(struct cpuinfo_x86 *c) } } +static unsigned long old_cr4 __initdata; + +void __init stop_mce(void) +{ + old_cr4 = read_cr4(); + clear_in_cr4(X86_CR4_MCE); +} + +void __init restart_mce(void) +{ + if (old_cr4 & X86_CR4_MCE) + set_in_cr4(X86_CR4_MCE); +} + static int __init mcheck_disable(char *str) { mce_disabled = 1; diff --git a/arch/i386/kernel/cpu/mcheck/non-fatal.c b/arch/i386/kernel/cpu/mcheck/non-fatal.c index 6b5d3518a1c0..bf39409b3838 100644 --- a/arch/i386/kernel/cpu/mcheck/non-fatal.c +++ b/arch/i386/kernel/cpu/mcheck/non-fatal.c @@ -57,7 +57,7 @@ static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); static void mce_work_fn(struct work_struct *work) { on_each_cpu(mce_checkregs, NULL, 1, 1); - schedule_delayed_work(&mce_work, MCE_RATE); + schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); } static int __init init_nonfatal_mce_checker(void) @@ -82,7 +82,7 @@ static int __init init_nonfatal_mce_checker(void) /* * Check for non-fatal errors every MCE_RATE s */ - schedule_delayed_work(&mce_work, MCE_RATE); + schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); printk(KERN_INFO "Machine check exception polling timer started.\n"); return 0; } diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c index 7ba7c3abd3a4..1203dc5ab87a 100644 --- a/arch/i386/kernel/cpu/mcheck/therm_throt.c +++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c @@ -134,19 +134,21 @@ static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, int err; sys_dev = get_cpu_sysdev(cpu); - 
mutex_lock(&therm_cpu_lock); switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: + mutex_lock(&therm_cpu_lock); err = thermal_throttle_add_dev(sys_dev); + mutex_unlock(&therm_cpu_lock); WARN_ON(err); break; case CPU_DEAD: case CPU_DEAD_FROZEN: + mutex_lock(&therm_cpu_lock); thermal_throttle_remove_dev(sys_dev); + mutex_unlock(&therm_cpu_lock); break; } - mutex_unlock(&therm_cpu_lock); return NOTIFY_OK; } diff --git a/arch/i386/kernel/cpu/mtrr/cyrix.c b/arch/i386/kernel/cpu/mtrr/cyrix.c index 1001f1e0fe6d..2287d4863a8a 100644 --- a/arch/i386/kernel/cpu/mtrr/cyrix.c +++ b/arch/i386/kernel/cpu/mtrr/cyrix.c @@ -3,6 +3,7 @@ #include <asm/mtrr.h> #include <asm/msr.h> #include <asm/io.h> +#include <asm/processor-cyrix.h> #include "mtrr.h" int arr3_protected; diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c index f6e46943e6ef..56f64e34829f 100644 --- a/arch/i386/kernel/cpu/mtrr/generic.c +++ b/arch/i386/kernel/cpu/mtrr/generic.c @@ -79,7 +79,7 @@ static void print_fixed(unsigned base, unsigned step, const mtrr_type*types) } /* Grab all of the MTRR state for this CPU into *state */ -void get_mtrr_state(void) +void __init get_mtrr_state(void) { unsigned int i; struct mtrr_var_range *vrs; diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index 75dc6d5214bc..c48b6fea5ab4 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -643,7 +643,7 @@ static struct sysdev_driver mtrr_sysdev_driver = { * initialized (i.e. before smp_init()). * */ -__init void mtrr_bp_init(void) +void __init mtrr_bp_init(void) { init_ifs(); diff --git a/arch/i386/kernel/cpu/mtrr/state.c b/arch/i386/kernel/cpu/mtrr/state.c index 7b39a2f954d9..c9014ca4a575 100644 --- a/arch/i386/kernel/cpu/mtrr/state.c +++ b/arch/i386/kernel/cpu/mtrr/state.c @@ -3,6 +3,7 @@ #include <asm/io.h> #include <asm/mtrr.h> #include <asm/msr.h> +#include <asm-i386/processor-cyrix.h> #include "mtrr.h" diff --git a/arch/i386/kernel/cpu/perfctr-watchdog.c b/arch/i386/kernel/cpu/perfctr-watchdog.c index 4d26d514c56f..4be488e73bee 100644 --- a/arch/i386/kernel/cpu/perfctr-watchdog.c +++ b/arch/i386/kernel/cpu/perfctr-watchdog.c @@ -325,7 +325,7 @@ static struct wd_ops k7_wd_ops = { .stop = single_msr_stop_watchdog, .perfctr = MSR_K7_PERFCTR0, .evntsel = MSR_K7_EVNTSEL0, - .checkbit = 1ULL<<63, + .checkbit = 1ULL<<47, }; /* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */ @@ -346,7 +346,9 @@ static int setup_p6_watchdog(unsigned nmi_hz) perfctr_msr = MSR_P6_PERFCTR0; evntsel_msr = MSR_P6_EVNTSEL0; - wrmsrl(perfctr_msr, 0UL); + /* KVM doesn't implement this MSR */ + if (wrmsr_safe(perfctr_msr, 0, 0) < 0) + return 0; evntsel = P6_EVNTSEL_INT | P6_EVNTSEL_OS @@ -599,8 +601,8 @@ static struct wd_ops intel_arch_wd_ops = { .setup = setup_intel_arch_watchdog, .rearm = p6_rearm, .stop = single_msr_stop_watchdog, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .evntsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR1, + .evntsel = MSR_ARCH_PERFMON_EVENTSEL1, }; static void probe_nmi_watchdog(void) diff --git a/arch/i386/kernel/cpu/rise.c b/arch/i386/kernel/cpu/rise.c deleted file mode 100644 index 50076f22e90f..000000000000 --- a/arch/i386/kernel/cpu/rise.c +++ /dev/null @@ -1,52 +0,0 @@ -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/bitops.h> -#include <asm/processor.h> - -#include "cpu.h" - -static void __cpuinit init_rise(struct cpuinfo_x86 *c) -{ - printk("CPU: Rise iDragon"); - if (c->x86_model > 2) - printk(" II"); - printk("\n"); 
- - /* Unhide possibly hidden capability flags - The mp6 iDragon family don't have MSRs. - We switch on extra features with this cpuid weirdness: */ - __asm__ ( - "movl $0x6363452a, %%eax\n\t" - "movl $0x3231206c, %%ecx\n\t" - "movl $0x2a32313a, %%edx\n\t" - "cpuid\n\t" - "movl $0x63634523, %%eax\n\t" - "movl $0x32315f6c, %%ecx\n\t" - "movl $0x2333313a, %%edx\n\t" - "cpuid\n\t" : : : "eax", "ebx", "ecx", "edx" - ); - set_bit(X86_FEATURE_CX8, c->x86_capability); -} - -static struct cpu_dev rise_cpu_dev __cpuinitdata = { - .c_vendor = "Rise", - .c_ident = { "RiseRiseRise" }, - .c_models = { - { .vendor = X86_VENDOR_RISE, .family = 5, .model_names = - { - [0] = "iDragon", - [2] = "iDragon", - [8] = "iDragon II", - [9] = "iDragon II" - } - }, - }, - .c_init = init_rise, -}; - -int __init rise_init_cpu(void) -{ - cpu_devs[X86_VENDOR_RISE] = &rise_cpu_dev; - return 0; -} - diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index fc822a46897a..e60cddbc4cfb 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -10,6 +10,7 @@ #include <linux/efi.h> #include <linux/pfn.h> #include <linux/uaccess.h> +#include <linux/suspend.h> #include <asm/pgtable.h> #include <asm/page.h> @@ -320,6 +321,37 @@ static int __init request_standard_resources(void) subsys_initcall(request_standard_resources); +#if defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND) +/** + * e820_mark_nosave_regions - Find the ranges of physical addresses that do not + * correspond to e820 RAM areas and mark the corresponding pages as nosave for + * hibernation. + * + * This function requires the e820 map to be sorted and without any + * overlapping entries and assumes the first e820 area to be RAM. + */ +void __init e820_mark_nosave_regions(void) +{ + int i; + unsigned long pfn; + + pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size); + for (i = 1; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (pfn < PFN_UP(ei->addr)) + register_nosave_region(pfn, PFN_UP(ei->addr)); + + pfn = PFN_DOWN(ei->addr + ei->size); + if (ei->type != E820_RAM) + register_nosave_region(PFN_UP(ei->addr), pfn); + + if (pfn >= max_low_pfn) + break; + } +} +#endif + void __init add_memory_region(unsigned long long start, unsigned long long size, int type) { diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c index a1808022ea19..2452c6fbe992 100644 --- a/arch/i386/kernel/efi.c +++ b/arch/i386/kernel/efi.c @@ -278,7 +278,7 @@ void efi_memmap_walk(efi_freemem_callback_t callback, void *arg) struct range { unsigned long start; unsigned long end; - } prev, curr; + } uninitialized_var(prev), curr; efi_memory_desc_t *md; unsigned long start, end; void *p; diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 3c3c220488c9..a714d6b43506 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -409,8 +409,6 @@ restore_nocheck_notrace: 1: INTERRUPT_RETURN .section .fixup,"ax" iret_exc: - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) pushl $0 # no error code pushl $do_iret_error jmp error_code @@ -1023,6 +1021,91 @@ ENTRY(kernel_thread_helper) CFI_ENDPROC ENDPROC(kernel_thread_helper) +#ifdef CONFIG_XEN +ENTRY(xen_hypervisor_callback) + CFI_STARTPROC + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + + /* Check to see if we got the event in the critical + region in xen_iret_direct, after we've reenabled + events and checked for pending events. This simulates + iret instruction's behaviour where it delivers a + pending interrupt when enabling interrupts. 
*/ + movl PT_EIP(%esp),%eax + cmpl $xen_iret_start_crit,%eax + jb 1f + cmpl $xen_iret_end_crit,%eax + jae 1f + + call xen_iret_crit_fixup + +1: mov %esp, %eax + call xen_evtchn_do_upcall + jmp ret_from_intr + CFI_ENDPROC +ENDPROC(xen_hypervisor_callback) + +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we fix up by reattempting the load, and zeroing the segment +# register if the load fails. +# Category 2 we fix up by jumping to do_iret_error. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by maintaining a status value in EAX. +ENTRY(xen_failsafe_callback) + CFI_STARTPROC + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl $1,%eax +1: mov 4(%esp),%ds +2: mov 8(%esp),%es +3: mov 12(%esp),%fs +4: mov 16(%esp),%gs + testl %eax,%eax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + lea 16(%esp),%esp + CFI_ADJUST_CFA_OFFSET -16 + jz 5f + addl $16,%esp + jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) +5: pushl $0 # EAX == 0 => Category 1 (Bad segment) + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + jmp ret_from_exception + CFI_ENDPROC + +.section .fixup,"ax" +6: xorl %eax,%eax + movl %eax,4(%esp) + jmp 1b +7: xorl %eax,%eax + movl %eax,8(%esp) + jmp 2b +8: xorl %eax,%eax + movl %eax,12(%esp) + jmp 3b +9: xorl %eax,%eax + movl %eax,16(%esp) + jmp 4b +.previous +.section __ex_table,"a" + .align 4 + .long 1b,6b + .long 2b,7b + .long 3b,8b + .long 4b,9b +.previous +ENDPROC(xen_failsafe_callback) + +#endif /* CONFIG_XEN */ + .section .rodata,"a" #include "syscall_table.S" diff --git a/arch/i386/kernel/geode.c b/arch/i386/kernel/geode.c new file mode 100644 index 000000000000..41e8aec4c61d --- /dev/null +++ b/arch/i386/kernel/geode.c @@ -0,0 +1,155 @@ +/* + * AMD Geode southbridge support code + * Copyright (C) 2006, Advanced Micro Devices, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. 
+ */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/ioport.h> +#include <linux/io.h> +#include <asm/msr.h> +#include <asm/geode.h> + +static struct { + char *name; + u32 msr; + int size; + u32 base; +} lbars[] = { + { "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 }, + { "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 }, + { "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 }, + { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 } +}; + +static void __init init_lbars(void) +{ + u32 lo, hi; + int i; + + for (i = 0; i < ARRAY_SIZE(lbars); i++) { + rdmsr(lbars[i].msr, lo, hi); + if (hi & 0x01) + lbars[i].base = lo & 0x0000ffff; + + if (lbars[i].base == 0) + printk(KERN_ERR "geode: Couldn't initialize '%s'\n", + lbars[i].name); + } +} + +int geode_get_dev_base(unsigned int dev) +{ + BUG_ON(dev >= ARRAY_SIZE(lbars)); + return lbars[dev].base; +} +EXPORT_SYMBOL_GPL(geode_get_dev_base); + +/* === GPIO API === */ + +void geode_gpio_set(unsigned int gpio, unsigned int reg) +{ + u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + + if (!base) + return; + + if (gpio < 16) + outl(1 << gpio, base + reg); + else + outl(1 << (gpio - 16), base + 0x80 + reg); +} +EXPORT_SYMBOL_GPL(geode_gpio_set); + +void geode_gpio_clear(unsigned int gpio, unsigned int reg) +{ + u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + + if (!base) + return; + + if (gpio < 16) + outl(1 << (gpio + 16), base + reg); + else + outl(1 << gpio, base + 0x80 + reg); +} +EXPORT_SYMBOL_GPL(geode_gpio_clear); + +int geode_gpio_isset(unsigned int gpio, unsigned int reg) +{ + u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + + if (!base) + return 0; + + if (gpio < 16) + return (inl(base + reg) & (1 << gpio)) ? 1 : 0; + else + return (inl(base + 0x80 + reg) & (1 << (gpio - 16))) ? 1 : 0; +} +EXPORT_SYMBOL_GPL(geode_gpio_isset); + +void geode_gpio_set_irq(unsigned int group, unsigned int irq) +{ + u32 lo, hi; + + if (group > 7 || irq > 15) + return; + + rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi); + + lo &= ~(0xF << (group * 4)); + lo |= (irq & 0xF) << (group * 4); + + wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi); +} +EXPORT_SYMBOL_GPL(geode_gpio_set_irq); + +void geode_gpio_setup_event(unsigned int gpio, int pair, int pme) +{ + u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + u32 offset, shift, val; + + if (gpio >= 24) + offset = GPIO_MAP_W; + else if (gpio >= 16) + offset = GPIO_MAP_Z; + else if (gpio >= 8) + offset = GPIO_MAP_Y; + else + offset = GPIO_MAP_X; + + shift = (gpio % 8) * 4; + + val = inl(base + offset); + + /* Clear whatever was there before */ + val &= ~(0xF << shift); + + /* And set the new value */ + + val |= ((pair & 7) << shift); + + /* Set the PME bit if this is a PME event */ + + if (pme) + val |= (1 << (shift + 3)); + + outl(val, base + offset); +} +EXPORT_SYMBOL_GPL(geode_gpio_setup_event); + +static int __init geode_southbridge_init(void) +{ + if (!is_geode()) + return -ENODEV; + + init_lbars(); + return 0; +} + +postcore_initcall(geode_southbridge_init); diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index f74dfc419b56..7c52b222207e 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -168,6 +168,12 @@ page_pde_offset = (__PAGE_OFFSET >> 20); .section .init.text,"ax",@progbits #endif + /* Do an early initialization of the fixmap area */ + movl $(swapper_pg_dir - __PAGE_OFFSET), %edx + movl $(swapper_pg_pmd - __PAGE_OFFSET), %eax + addl $0x007, %eax /* 0x007 = PRESENT+RW+USER */ + movl %eax, 4092(%edx) + #ifdef CONFIG_SMP ENTRY(startup_32_smp) cld @@ -504,9 +510,12 @@ ENTRY(_stext) /* * BSS 
section */ -.section ".bss.page_aligned","w" +.section ".bss.page_aligned","wa" + .align PAGE_SIZE_asm ENTRY(swapper_pg_dir) .fill 1024,4,0 +ENTRY(swapper_pg_pmd) + .fill 1024,4,0 ENTRY(empty_zero_page) .fill 4096,1,0 @@ -530,6 +539,8 @@ fault_msg: .ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n" .asciz "Stack: %p %p %p %p %p %p %p %p\n" +#include "../xen/xen-head.S" + /* * The IDT and GDT 'descriptors' are a strange 48-bit object * only used by the lidt and lgdt instructions. They are not diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c index 17d73459fc5f..533d4932bc79 100644 --- a/arch/i386/kernel/hpet.c +++ b/arch/i386/kernel/hpet.c @@ -5,6 +5,7 @@ #include <linux/init.h> #include <linux/sysdev.h> #include <linux/pm.h> +#include <linux/delay.h> #include <asm/hpet.h> #include <asm/io.h> @@ -187,6 +188,10 @@ static void hpet_set_mode(enum clock_event_mode mode, cfg &= ~HPET_TN_ENABLE; hpet_writel(cfg, HPET_T0_CFG); break; + + case CLOCK_EVT_MODE_RESUME: + hpet_enable_int(); + break; } } @@ -217,6 +222,7 @@ static struct clocksource clocksource_hpet = { .mask = HPET_MASK, .shift = HPET_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .resume = hpet_start_counter, }; /* @@ -226,7 +232,8 @@ int __init hpet_enable(void) { unsigned long id; uint64_t hpet_freq; - u64 tmp; + u64 tmp, start, now; + cycle_t t1; if (!is_hpet_capable()) return 0; @@ -273,6 +280,27 @@ int __init hpet_enable(void) /* Start the counter */ hpet_start_counter(); + /* Verify whether hpet counter works */ + t1 = read_hpet(); + rdtscll(start); + + /* + * We don't know the TSC frequency yet, but waiting for + * 200000 TSC cycles is safe: + * 4 GHz == 50us + * 1 GHz == 200us + */ + do { + rep_nop(); + rdtscll(now); + } while ((now - start) < 200000UL); + + if (t1 == read_hpet()) { + printk(KERN_WARNING + "HPET counter not counting. HPET disabled\n"); + goto out_nohpet; + } + /* Initialize and register HPET clocksource * * hpet period is in femto seconds per cycle @@ -291,7 +319,6 @@ int __init hpet_enable(void) clocksource_register(&clocksource_hpet); - if (id & HPET_ID_LEGSUP) { hpet_enable_int(); hpet_reserve_platform_timers(id); @@ -299,7 +326,7 @@ int __init hpet_enable(void) * Start hpet with the boot cpu mask and make it * global after the IO_APIC has been initialized. 
*/ - hpet_clockevent.cpumask =cpumask_of_cpu(0); + hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); clockevents_register_device(&hpet_clockevent); global_clock_event = &hpet_clockevent; return 1; @@ -524,68 +551,3 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } #endif - - -/* - * Suspend/resume part - */ - -#ifdef CONFIG_PM - -static int hpet_suspend(struct sys_device *sys_device, pm_message_t state) -{ - unsigned long cfg = hpet_readl(HPET_CFG); - - cfg &= ~(HPET_CFG_ENABLE|HPET_CFG_LEGACY); - hpet_writel(cfg, HPET_CFG); - - return 0; -} - -static int hpet_resume(struct sys_device *sys_device) -{ - unsigned int id; - - hpet_start_counter(); - - id = hpet_readl(HPET_ID); - - if (id & HPET_ID_LEGSUP) - hpet_enable_int(); - - return 0; -} - -static struct sysdev_class hpet_class = { - set_kset_name("hpet"), - .suspend = hpet_suspend, - .resume = hpet_resume, -}; - -static struct sys_device hpet_device = { - .id = 0, - .cls = &hpet_class, -}; - - -static __init int hpet_register_sysfs(void) -{ - int err; - - if (!is_hpet_capable()) - return 0; - - err = sysdev_class_register(&hpet_class); - - if (!err) { - err = sysdev_register(&hpet_device); - if (err) - sysdev_class_unregister(&hpet_class); - } - - return err; -} - -device_initcall(hpet_register_sysfs); - -#endif diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c index f8a3c4054c70..6d839f2f1b1a 100644 --- a/arch/i386/kernel/i8253.c +++ b/arch/i386/kernel/i8253.c @@ -3,18 +3,17 @@ * */ #include <linux/clockchips.h> -#include <linux/spinlock.h> +#include <linux/init.h> +#include <linux/interrupt.h> #include <linux/jiffies.h> -#include <linux/sysdev.h> #include <linux/module.h> -#include <linux/init.h> +#include <linux/spinlock.h> #include <asm/smp.h> #include <asm/delay.h> #include <asm/i8253.h> #include <asm/io.h> - -#include "io_ports.h" +#include <asm/timer.h> DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); @@ -41,26 +40,27 @@ static void init_pit_timer(enum clock_event_mode mode, case CLOCK_EVT_MODE_PERIODIC: /* binary, mode 2, LSB/MSB, ch 0 */ outb_p(0x34, PIT_MODE); - udelay(10); outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - udelay(10); outb(LATCH >> 8 , PIT_CH0); /* MSB */ break; - /* - * Avoid unnecessary state transitions, as it confuses - * Geode / Cyrix based boxen. - */ case CLOCK_EVT_MODE_SHUTDOWN: - if (evt->mode == CLOCK_EVT_MODE_UNUSED) - break; case CLOCK_EVT_MODE_UNUSED: - if (evt->mode == CLOCK_EVT_MODE_SHUTDOWN) - break; + if (evt->mode == CLOCK_EVT_MODE_PERIODIC || + evt->mode == CLOCK_EVT_MODE_ONESHOT) { + outb_p(0x30, PIT_MODE); + outb_p(0, PIT_CH0); + outb_p(0, PIT_CH0); + } + break; + case CLOCK_EVT_MODE_ONESHOT: /* One shot setup */ outb_p(0x38, PIT_MODE); - udelay(10); + break; + + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here */ break; } spin_unlock_irqrestore(&i8253_lock, flags); diff --git a/arch/i386/kernel/init_task.c b/arch/i386/kernel/init_task.c index cff95d10a4d8..d26fc063a760 100644 --- a/arch/i386/kernel/init_task.c +++ b/arch/i386/kernel/init_task.c @@ -42,5 +42,5 @@ EXPORT_SYMBOL(init_task); * per-CPU TSS segments. Threads are completely 'soft' on Linux, * no more per-task TSS's. 
*/ -DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS; +DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 7f8b7af2b95f..893df8280756 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -353,14 +353,6 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) # include <linux/slab.h> /* kmalloc() */ # include <linux/timer.h> /* time_after() */ -#ifdef CONFIG_BALANCED_IRQ_DEBUG -# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0) -# define Dprintk(x...) do { TDprintk(x); } while (0) -# else -# define TDprintk(x...) -# define Dprintk(x...) -# endif - #define IRQBALANCE_CHECK_ARCH -999 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ) #define MIN_BALANCED_IRQ_INTERVAL (HZ/2) @@ -443,7 +435,7 @@ static inline void balance_irq(int cpu, int irq) static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) { int i, j; - Dprintk("Rotating IRQs among CPUs.\n"); + for_each_online_cpu(i) { for (j = 0; j < NR_IRQS; j++) { if (!irq_desc[j].action) @@ -560,19 +552,11 @@ tryanothercpu: max_loaded = tmp_loaded; /* processor */ imbalance = (max_cpu_irq - min_cpu_irq) / 2; - Dprintk("max_loaded cpu = %d\n", max_loaded); - Dprintk("min_loaded cpu = %d\n", min_loaded); - Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq); - Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq); - Dprintk("load imbalance = %lu\n", imbalance); - /* if imbalance is less than approx 10% of max load, then * observe diminishing returns action. - quit */ - if (imbalance < (max_cpu_irq >> 3)) { - Dprintk("Imbalance too trivial\n"); + if (imbalance < (max_cpu_irq >> 3)) goto not_worth_the_effort; - } tryanotherirq: /* if we select an IRQ to move that can't go where we want, then @@ -629,9 +613,6 @@ tryanotherirq: cpus_and(tmp, target_cpu_mask, allowed_mask); if (!cpus_empty(tmp)) { - - Dprintk("irq = %d moved to cpu = %d\n", - selected_irq, min_loaded); /* mark for change destination */ set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); @@ -651,7 +632,6 @@ not_worth_the_effort: */ balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); - Dprintk("IRQ worth rotating not found\n"); return; } @@ -667,6 +647,7 @@ static int balanced_irq(void *unused) set_pending_irq(i, cpumask_of_cpu(0)); } + set_freezable(); for ( ; ; ) { time_remaining = schedule_timeout_interruptible(time_remaining); try_to_freeze(); @@ -1901,7 +1882,7 @@ __setup("no_timer_check", notimercheck); * - if this function detects that timer IRQs are defunct, then we fall * back to ISA timer IRQs */ -int __init timer_irq_works(void) +static int __init timer_irq_works(void) { unsigned long t1 = jiffies; diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index d2daf672f4a2..dd2b97fc00b2 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -21,7 +21,7 @@ #include <asm/apic.h> #include <asm/uaccess.h> -DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; +DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); EXPORT_PER_CPU_SYMBOL(irq_stat); DEFINE_PER_CPU(struct pt_regs *, irq_regs); @@ -149,15 +149,11 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs) #ifdef CONFIG_4KSTACKS -/* - * These should really be __section__(".bss.page_aligned") as well, but - * gcc's 3.0 and earlier don't handle that correctly. 
- */ static char softirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__aligned__(THREAD_SIZE))); + __attribute__((__section__(".bss.page_aligned"))); static char hardirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__aligned__(THREAD_SIZE))); + __attribute__((__section__(".bss.page_aligned"))); /* * allocate per-cpu stacks for hardirq and for softirq processing diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index dde828a333c3..448a50b1324c 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -35,6 +35,7 @@ #include <asm/cacheflush.h> #include <asm/desc.h> #include <asm/uaccess.h> +#include <asm/alternative.h> void jprobe_return_end(void); @@ -169,16 +170,12 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) void __kprobes arch_arm_kprobe(struct kprobe *p) { - *p->addr = BREAKPOINT_INSTRUCTION; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); } void __kprobes arch_disarm_kprobe(struct kprobe *p) { - *p->addr = p->opcode; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + text_poke(p->addr, &p->opcode, 1); } void __kprobes arch_remove_kprobe(struct kprobe *p) diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index fba121f7973f..99beac7f96ce 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -295,7 +295,7 @@ static unsigned int last_irq_sums [NR_CPUS], alert_counter [NR_CPUS]; -void touch_nmi_watchdog (void) +void touch_nmi_watchdog(void) { if (nmi_watchdog > 0) { unsigned cpu; @@ -304,8 +304,10 @@ void touch_nmi_watchdog (void) * Just reset the alert counters, (other CPUs might be * spinning on locks we hold): */ - for_each_present_cpu (cpu) - alert_counter[cpu] = 0; + for_each_present_cpu(cpu) { + if (alert_counter[cpu]) + alert_counter[cpu] = 0; + } } /* @@ -351,7 +353,7 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) * Take the local apic timer and PIT/HPET into account. 
We don't * know which one is active, when we have highres/dyntick on */ - sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0); + sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_cpu(cpu).irqs[0]; /* if the none of the timers isn't firing, this cpu isn't doing much */ if (!touched && last_irq_sums[cpu] == sum) { diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index faab09abca5e..ea962c0667d5 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -124,20 +124,28 @@ unsigned paravirt_patch_ignore(unsigned len) return len; } +struct branch { + unsigned char opcode; + u32 delta; +} __attribute__((packed)); + unsigned paravirt_patch_call(void *target, u16 tgt_clobbers, void *site, u16 site_clobbers, unsigned len) { unsigned char *call = site; unsigned long delta = (unsigned long)target - (unsigned long)(call+5); + struct branch b; if (tgt_clobbers & ~site_clobbers) return len; /* target would clobber too much for this site */ if (len < 5) return len; /* call too long for patch site */ - *call++ = 0xe8; /* call */ - *(unsigned long *)call = delta; + b.opcode = 0xe8; /* call */ + b.delta = delta; + BUILD_BUG_ON(sizeof(b) != 5); + text_poke(call, (unsigned char *)&b, 5); return 5; } @@ -146,12 +154,14 @@ unsigned paravirt_patch_jmp(void *target, void *site, unsigned len) { unsigned char *jmp = site; unsigned long delta = (unsigned long)target - (unsigned long)(jmp+5); + struct branch b; if (len < 5) return len; /* call too long for patch site */ - *jmp++ = 0xe9; /* jmp */ - *(unsigned long *)jmp = delta; + b.opcode = 0xe9; /* jmp */ + b.delta = delta; + text_poke(jmp, (unsigned char *)&b, 5); return 5; } @@ -228,6 +238,41 @@ static int __init print_banner(void) } core_initcall(print_banner); +static struct resource reserve_ioports = { + .start = 0, + .end = IO_SPACE_LIMIT, + .name = "paravirt-ioport", + .flags = IORESOURCE_IO | IORESOURCE_BUSY, +}; + +static struct resource reserve_iomem = { + .start = 0, + .end = -1, + .name = "paravirt-iomem", + .flags = IORESOURCE_MEM | IORESOURCE_BUSY, +}; + +/* + * Reserve the whole legacy IO space to prevent any legacy drivers + * from wasting time probing for their hardware. This is a fairly + * brute-force approach to disabling all non-virtual drivers. + * + * Note that this must be called very early to have any effect. 
+ */ +int paravirt_disable_iospace(void) +{ + int ret; + + ret = request_resource(&ioport_resource, &reserve_ioports); + if (ret == 0) { + ret = request_resource(&iomem_resource, &reserve_iomem); + if (ret) + release_resource(&reserve_ioports); + } + + return ret; +} + struct paravirt_ops paravirt_ops = { .name = "bare hardware", .paravirt_enabled = 0, @@ -267,7 +312,7 @@ struct paravirt_ops paravirt_ops = { .write_msr = native_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, - .get_scheduled_cycles = native_read_tsc, + .sched_clock = native_sched_clock, .get_cpu_khz = native_calculate_cpu_khz, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 06dfa65ad180..84664710b784 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -300,6 +300,7 @@ early_param("idle", idle_setup); void show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; + unsigned long d0, d1, d2, d3, d6, d7; printk("\n"); printk("Pid: %d, comm: %20s\n", current->pid, current->comm); @@ -324,6 +325,17 @@ void show_regs(struct pt_regs * regs) cr3 = read_cr3(); cr4 = read_cr4_safe(); printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); + + get_debugreg(d0, 0); + get_debugreg(d1, 1); + get_debugreg(d2, 2); + get_debugreg(d3, 3); + printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", + d0, d1, d2, d3); + get_debugreg(d6, 6); + get_debugreg(d7, 7); + printk("DR6: %08lx DR7: %08lx\n", d6, d7); + show_trace(NULL, regs, ®s->esp); } @@ -538,8 +550,31 @@ int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) return 1; } -static noinline void __switch_to_xtra(struct task_struct *next_p, - struct tss_struct *tss) +#ifdef CONFIG_SECCOMP +void hard_disable_TSC(void) +{ + write_cr4(read_cr4() | X86_CR4_TSD); +} +void disable_TSC(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. + */ + hard_disable_TSC(); + preempt_enable(); +} +void hard_enable_TSC(void) +{ + write_cr4(read_cr4() & ~X86_CR4_TSD); +} +#endif /* CONFIG_SECCOMP */ + +static noinline void +__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + struct tss_struct *tss) { struct thread_struct *next; @@ -555,6 +590,17 @@ static noinline void __switch_to_xtra(struct task_struct *next_p, set_debugreg(next->debugreg[7], 7); } +#ifdef CONFIG_SECCOMP + if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ + test_tsk_thread_flag(next_p, TIF_NOTSC)) { + /* prev and next are different */ + if (test_tsk_thread_flag(next_p, TIF_NOTSC)) + hard_disable_TSC(); + else + hard_enable_TSC(); + } +#endif + if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { /* * Disable the bitmap via an invalid offset. We still cache @@ -586,33 +632,6 @@ static noinline void __switch_to_xtra(struct task_struct *next_p, } /* - * This function selects if the context switch from prev to next - * has to tweak the TSC disable bit in the cr4. - */ -static inline void disable_tsc(struct task_struct *prev_p, - struct task_struct *next_p) -{ - struct thread_info *prev, *next; - - /* - * gcc should eliminate the ->thread_info dereference if - * has_secure_computing returns 0 at compile time (SECCOMP=n). 
- */ - prev = task_thread_info(prev_p); - next = task_thread_info(next_p); - - if (has_secure_computing(prev) || has_secure_computing(next)) { - /* slow path here */ - if (has_secure_computing(prev) && - !has_secure_computing(next)) { - write_cr4(read_cr4() & ~X86_CR4_TSD); - } else if (!has_secure_computing(prev) && - has_secure_computing(next)) - write_cr4(read_cr4() | X86_CR4_TSD); - } -} - -/* * switch_to(x,yn) should switch tasks from x to y. * * We fsave/fwait so that an exception goes off at the right time @@ -689,11 +708,9 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas /* * Now maybe handle debug registers and/or IO bitmaps */ - if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW) - || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))) - __switch_to_xtra(next_p, tss); - - disable_tsc(prev_p, next_p); + if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || + task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) + __switch_to_xtra(prev_p, next_p, tss); /* * Leave lazy mode, flushing any hypercalls made here. diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index 0c0ceec5de00..0c8f00e69c4d 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -164,14 +164,22 @@ static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_ u32 *desc; unsigned long base; - down(&child->mm->context.sem); - desc = child->mm->context.ldt + (seg & ~7); - base = (desc[0] >> 16) | ((desc[1] & 0xff) << 16) | (desc[1] & 0xff000000); + seg &= ~7UL; - /* 16-bit code segment? */ - if (!((desc[1] >> 22) & 1)) - addr &= 0xffff; - addr += base; + down(&child->mm->context.sem); + if (unlikely((seg >> 3) >= child->mm->context.size)) + addr = -1L; /* bogus selector, access would fault */ + else { + desc = child->mm->context.ldt + seg; + base = ((desc[0] >> 16) | + ((desc[1] & 0xff) << 16) | + (desc[1] & 0xff000000)); + + /* 16-bit code segment? */ + if (!((desc[1] >> 22) & 1)) + addr &= 0xffff; + addr += base; + } up(&child->mm->context.sem); } return addr; @@ -358,17 +366,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) switch (request) { /* when I and D space are separate, these will need to be fixed. */ case PTRACE_PEEKTEXT: /* read word at location addr. */ - case PTRACE_PEEKDATA: { - unsigned long tmp; - int copied; - - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); - ret = -EIO; - if (copied != sizeof(tmp)) - break; - ret = put_user(tmp, datap); + case PTRACE_PEEKDATA: + ret = generic_ptrace_peekdata(child, addr, data); break; - } /* read the word at location addr in the USER area. */ case PTRACE_PEEKUSR: { @@ -395,10 +395,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) /* when I and D space are separate, this will have to be fixed. */ case PTRACE_POKETEXT: /* write the word at location addr. 
*/ case PTRACE_POKEDATA: - ret = 0; - if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data)) - break; - ret = -EIO; + ret = generic_ptrace_pokedata(child, addr, data); break; case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c index 5513f8d5b5be..0d796248866c 100644 --- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -113,6 +113,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"), }, }, + { /* Handle problems with rebooting on Dell Optiplex 745's SFF*/ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 745", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), + DMI_MATCH(DMI_BOARD_NAME, "0WF810"), + }, + }, { /* Handle problems with rebooting on Dell 2400's */ .callback = set_bios_reboot, .ident = "Dell PowerEdge 2400", diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 2d61e65eeb50..d474cd639bcb 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -273,18 +273,18 @@ unsigned long __init find_max_low_pfn(void) printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); if (max_pfn > MAX_NONPAE_PFN) - printk(KERN_WARNING "Use a PAE enabled kernel.\n"); + printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); else printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); max_pfn = MAXMEM_PFN; #else /* !CONFIG_HIGHMEM */ -#ifndef CONFIG_X86_PAE +#ifndef CONFIG_HIGHMEM64G if (max_pfn > MAX_NONPAE_PFN) { max_pfn = MAX_NONPAE_PFN; printk(KERN_WARNING "Warning only 4GB will be used.\n"); - printk(KERN_WARNING "Use a PAE enabled kernel.\n"); + printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); } -#endif /* !CONFIG_X86_PAE */ +#endif /* !CONFIG_HIGHMEM64G */ #endif /* !CONFIG_HIGHMEM */ } else { if (highmem_pages == -1) @@ -466,7 +466,7 @@ void __init setup_bootmem_allocator(void) * * This should all compile down to nothing when NUMA is off. */ -void __init remapped_pgdat_init(void) +static void __init remapped_pgdat_init(void) { int nid; @@ -601,6 +601,8 @@ void __init setup_arch(char **cmdline_p) * NOTE: at this point the bootmem allocator is fully available. */ + paravirt_post_allocator_init(); + dmi_scan_machine(); #ifdef CONFIG_X86_GENERICARCH @@ -638,6 +640,7 @@ void __init setup_arch(char **cmdline_p) #endif e820_register_memory(); + e820_mark_nosave_regions(); #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index d574e38f0f77..f5dd85656c18 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -199,6 +199,13 @@ asmlinkage int sys_sigreturn(unsigned long __unused) return eax; badframe: + if (show_unhandled_signals && printk_ratelimit()) + printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx" + " esp:%lx oeax:%lx\n", + current->pid > 1 ? 
KERN_INFO : KERN_EMERG, + current->comm, current->pid, frame, regs->eip, + regs->esp, regs->orig_eax); + force_sig(SIGSEGV, current); return 0; } diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 6299c080f6e2..2d35d8502029 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -22,6 +22,7 @@ #include <asm/mtrr.h> #include <asm/tlbflush.h> +#include <asm/mmu_context.h> #include <mach_apic.h> /* @@ -249,13 +250,13 @@ static unsigned long flush_va; static DEFINE_SPINLOCK(tlbstate_lock); /* - * We cannot call mmdrop() because we are in interrupt context, + * We cannot call mmdrop() because we are in interrupt context, * instead update mm->cpu_vm_mask. * * We need to reload %cr3 since the page tables may be going * away from under us.. */ -static inline void leave_mm (unsigned long cpu) +void leave_mm(unsigned long cpu) { if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) BUG(); diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 0b2954534b8e..e4f61d1c6248 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -148,7 +148,7 @@ void __init smp_alloc_memory(void) * a given CPU */ -static void __cpuinit smp_store_cpu_info(int id) +void __cpuinit smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = cpu_data + id; @@ -308,8 +308,7 @@ cpumask_t cpu_coregroup_map(int cpu) /* representing cpus for which sibling maps can be computed */ static cpumask_t cpu_sibling_setup_map; -static inline void -set_cpu_sibling_map(int cpu) +void __cpuinit set_cpu_sibling_map(int cpu) { int i; struct cpuinfo_x86 *c = cpu_data; @@ -1144,8 +1143,7 @@ void __init native_smp_prepare_boot_cpu(void) } #ifdef CONFIG_HOTPLUG_CPU -static void -remove_siblinginfo(int cpu) +void remove_siblinginfo(int cpu) { int sibling; struct cpuinfo_x86 *c = cpu_data; diff --git a/arch/i386/kernel/smpcommon.c b/arch/i386/kernel/smpcommon.c index 1868ae18eb4d..bbfe85a0f699 100644 --- a/arch/i386/kernel/smpcommon.c +++ b/arch/i386/kernel/smpcommon.c @@ -47,7 +47,7 @@ int smp_call_function(void (*func) (void *info), void *info, int nonatomic, EXPORT_SYMBOL(smp_call_function); /** - * smp_call_function_single - Run a function on another CPU + * smp_call_function_single - Run a function on a specific CPU * @cpu: The target CPU. Cannot be the calling CPU. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. 
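A minimal usage sketch of smp_call_function_single() as it behaves after the next hunk, which makes a call targeting the local CPU run the function directly (with interrupts disabled) instead of warning and returning -EBUSY. This is illustration only, not part of the patch; the callback, counter, and caller names are hypothetical.

#include <linux/smp.h>
#include <asm/atomic.h>

/* Hypothetical callback: must be fast and non-blocking. */
static void bump_counter(void *info)
{
	atomic_inc((atomic_t *)info);
}

/*
 * Hypothetical caller: nonatomic = 0, wait = 1 blocks until @cpu has run
 * the callback.  With the change in the hunk below, cpu == smp_processor_id()
 * is no longer an error; the callback simply runs locally with IRQs disabled.
 */
static void poke_cpu(int cpu, atomic_t *counter)
{
	smp_call_function_single(cpu, bump_counter, counter, 0, 1);
}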
@@ -66,9 +66,11 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int ret; int me = get_cpu(); if (cpu == me) { - WARN_ON(1); + local_irq_disable(); + func(info); + local_irq_enable(); put_cpu(); - return -EBUSY; + return 0; } ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait); diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index bf6adce52267..8344c70adf61 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -323,3 +323,4 @@ ENTRY(sys_call_table) .long sys_signalfd .long sys_timerfd .long sys_eventfd + .long sys_fallocate diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index ff4ee6f3326b..6deb159d08e0 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -336,7 +336,9 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk) int in_gate_area(struct task_struct *task, unsigned long addr) { - return 0; + const struct vm_area_struct *vma = get_gate_vma(task); + + return vma && addr >= vma->vm_start && addr < vma->vm_end; } int in_gate_area_no_task(unsigned long addr) diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index a665df61f08c..19a6c678d02e 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -207,55 +207,9 @@ unsigned long read_persistent_clock(void) return retval; } -static void sync_cmos_clock(unsigned long dummy); - -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); -int no_sync_cmos_clock; - -static void sync_cmos_clock(unsigned long dummy) -{ - struct timeval now, next; - int fail = 1; - - /* - * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - * This code is run on a timer. If the clock is set, that timer - * may not expire at the correct time. Thus, we adjust... - */ - if (!ntp_synced()) - /* - * Not synced, exit, do not restart a timer (if one is - * running, let it run out). 
- */ - return; - - do_gettimeofday(&now); - if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && - now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) - fail = set_rtc_mmss(now.tv_sec); - - next.tv_usec = USEC_AFTER - now.tv_usec; - if (next.tv_usec <= 0) - next.tv_usec += USEC_PER_SEC; - - if (!fail) - next.tv_sec = 659; - else - next.tv_sec = 0; - - if (next.tv_usec >= USEC_PER_SEC) { - next.tv_sec++; - next.tv_usec -= USEC_PER_SEC; - } - mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next)); -} - -void notify_arch_cmos_timer(void) +int update_persistent_clock(struct timespec now) { - if (!no_sync_cmos_clock) - mod_timer(&sync_cmos_timer, jiffies + 1); + return set_rtc_mmss(now.tv_sec); } extern void (*late_time_init)(void); diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 90da0575fcff..cfffe3dd9e83 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -41,6 +41,10 @@ #include <linux/mca.h> #endif +#if defined(CONFIG_EDAC) +#include <linux/edac.h> +#endif + #include <asm/processor.h> #include <asm/system.h> #include <asm/io.h> @@ -148,7 +152,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (!stack) { unsigned long dummy; stack = &dummy; - if (task && task != current) + if (task != current) stack = (unsigned long *)task->thread.esp; } @@ -207,6 +211,7 @@ static void print_trace_address(void *data, unsigned long addr) { printk("%s [<%08lx>] ", (char *)data, addr); print_symbol("%s\n", addr); + touch_nmi_watchdog(); } static struct stacktrace_ops print_trace_ops = { @@ -390,7 +395,7 @@ void die(const char * str, struct pt_regs * regs, long err) unsigned long esp; unsigned short ss; - report_bug(regs->eip); + report_bug(regs->eip, regs); printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); #ifdef CONFIG_PREEMPT @@ -433,6 +438,7 @@ void die(const char * str, struct pt_regs * regs, long err) bust_spinlocks(0); die.lock_owner = -1; + add_taint(TAINT_DIE); spin_unlock_irqrestore(&die.lock, flags); if (!regs) @@ -517,10 +523,12 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ } -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ fastcall void do_##name(struct pt_regs * regs, long error_code) \ { \ siginfo_t info; \ + if (irq) \ + local_irq_enable(); \ info.si_signo = signr; \ info.si_errno = 0; \ info.si_code = sicode; \ @@ -560,13 +568,13 @@ DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) #endif DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip) +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0) DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR(12, SIGBUS, "stack segment", stack_segment) -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) -DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0) +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) +DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) fastcall void __kprobes do_general_protection(struct pt_regs * regs, long error_code) @@ -610,6 +618,13 @@ fastcall void __kprobes 
do_general_protection(struct pt_regs * regs, current->thread.error_code = error_code; current->thread.trap_no = 13; + if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && + printk_ratelimit()) + printk(KERN_INFO + "%s[%d] general protection eip:%lx esp:%lx error:%lx\n", + current->comm, current->pid, + regs->eip, regs->esp, error_code); + force_sig(SIGSEGV, current); return; @@ -635,6 +650,14 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs) printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " "CPU %d.\n", reason, smp_processor_id()); printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); + +#if defined(CONFIG_EDAC) + if(edac_handler_set()) { + edac_atomic_assert_error(); + return; + } +#endif + if (panic_on_unrecovered_nmi) panic("NMI: Not continuing"); @@ -752,6 +775,8 @@ static __kprobes void default_do_nmi(struct pt_regs * regs) reassert_nmi(); } +static int ignore_nmis; + fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) { int cpu; @@ -762,11 +787,24 @@ fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) ++nmi_count(cpu); - default_do_nmi(regs); + if (!ignore_nmis) + default_do_nmi(regs); nmi_exit(); } +void stop_nmi(void) +{ + acpi_nmi_disable(); + ignore_nmis++; +} + +void restart_nmi(void) +{ + ignore_nmis--; + acpi_nmi_enable(); +} + #ifdef CONFIG_KPROBES fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code) { @@ -1053,6 +1091,7 @@ asmlinkage void math_state_restore(void) thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ tsk->fpu_counter++; } +EXPORT_SYMBOL_GPL(math_state_restore); #ifndef CONFIG_MATH_EMULATION diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index ea63a30ca3e8..debd7dbb4158 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -27,6 +27,7 @@ static int tsc_enabled; * an extra value to store the TSC freq */ unsigned int tsc_khz; +EXPORT_SYMBOL_GPL(tsc_khz); int tsc_disable; @@ -58,10 +59,11 @@ __setup("notsc", tsc_setup); */ static int tsc_unstable; -static inline int check_tsc_unstable(void) +int check_tsc_unstable(void) { return tsc_unstable; } +EXPORT_SYMBOL_GPL(check_tsc_unstable); /* Accellerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) @@ -84,7 +86,7 @@ static inline int check_tsc_unstable(void) * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static unsigned long cyc2ns_scale __read_mostly; +unsigned long cyc2ns_scale __read_mostly; #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ @@ -93,15 +95,10 @@ static inline void set_cyc2ns_scale(unsigned long cpu_khz) cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; } -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - /* * Scheduler clock - returns current time in nanosec units. 
*/ -unsigned long long sched_clock(void) +unsigned long long native_sched_clock(void) { unsigned long long this_offset; @@ -118,12 +115,24 @@ unsigned long long sched_clock(void) return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); /* read the Time Stamp Counter: */ - get_scheduled_cycles(this_offset); + rdtscll(this_offset); /* return the value in ns */ return cycles_2_ns(this_offset); } +/* We need to define a real function for sched_clock, to override the + weak default version */ +#ifdef CONFIG_PARAVIRT +unsigned long long sched_clock(void) +{ + return paravirt_sched_clock(); +} +#else +unsigned long long sched_clock(void) + __attribute__((alias("native_sched_clock"))); +#endif + unsigned long native_calculate_cpu_khz(void) { unsigned long long start, end; diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c index c12720d7cbc5..72042bb7ec94 100644 --- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -362,7 +362,7 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) } #endif -static void vmi_allocate_pt(u32 pfn) +static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn) { vmi_set_page_type(pfn, VMI_PAGE_L1); vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); @@ -891,7 +891,7 @@ static inline int __init activate_vmi(void) paravirt_ops.setup_boot_clock = vmi_time_bsp_init; paravirt_ops.setup_secondary_clock = vmi_time_ap_init; #endif - paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles; + paravirt_ops.sched_clock = vmi_sched_clock; paravirt_ops.get_cpu_khz = vmi_cpu_khz; /* We have true wallclock functions; disable CMOS clock sync */ diff --git a/arch/i386/kernel/vmiclock.c b/arch/i386/kernel/vmiclock.c index 26a37f8a8762..b1b5ab08b26e 100644 --- a/arch/i386/kernel/vmiclock.c +++ b/arch/i386/kernel/vmiclock.c @@ -32,6 +32,7 @@ #include <asm/apicdef.h> #include <asm/apic.h> #include <asm/timer.h> +#include <asm/i8253.h> #include <irq_vectors.h> #include "io_ports.h" @@ -64,10 +65,10 @@ int vmi_set_wallclock(unsigned long now) return 0; } -/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */ -unsigned long long vmi_get_sched_cycles(void) +/* paravirt_ops.sched_clock = vmi_sched_clock */ +unsigned long long vmi_sched_clock(void) { - return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE); + return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); } /* paravirt_ops.get_cpu_khz = vmi_cpu_khz */ @@ -142,6 +143,7 @@ static void vmi_timer_set_mode(enum clock_event_mode mode, switch (mode) { case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_RESUME: break; case CLOCK_EVT_MODE_PERIODIC: cycles_per_hz = vmi_timer_ops.get_cycle_frequency(); diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index aa87b06c7c82..7d72cce00529 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -60,7 +60,9 @@ SECTIONS __stop___ex_table = .; } - BUG_TABLE + NOTES :text :note + + BUG_TABLE :text . = ALIGN(4); .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { @@ -88,6 +90,7 @@ SECTIONS . = ALIGN(4096); .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.page_aligned) *(.data.idt) } @@ -180,6 +183,7 @@ SECTIONS .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { __per_cpu_start = .; *(.data.percpu) + *(.data.percpu.shared_aligned) __per_cpu_end = .; } . 
= ALIGN(4096); @@ -206,6 +210,4 @@ SECTIONS STABS_DEBUG DWARF_DEBUG - - NOTES } diff --git a/arch/i386/kernel/vsyscall-note.S b/arch/i386/kernel/vsyscall-note.S index d4b5be4f3d5f..07c0daf78237 100644 --- a/arch/i386/kernel/vsyscall-note.S +++ b/arch/i386/kernel/vsyscall-note.S @@ -3,23 +3,43 @@ * Here we can supply some information useful to userland. */ -#include <linux/uts.h> #include <linux/version.h> +#include <linux/elfnote.h> -#define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type) \ - .section name, flags; \ - .balign 4; \ - .long 1f - 0f; /* name length */ \ - .long 3f - 2f; /* data length */ \ - .long type; /* note type */ \ -0: .asciz vendor; /* vendor name */ \ -1: .balign 4; \ -2: +/* Ideally this would use UTS_NAME, but using a quoted string here + doesn't work. Remember to change this when changing the + kernel's name. */ +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END -#define ASM_ELF_NOTE_END \ -3: .balign 4; /* pad out section */ \ - .previous +#ifdef CONFIG_XEN +/* + * Add a special note telling glibc's dynamic linker a fake hardware + * flavor that it will use to choose the search path for libraries in the + * same way it uses real hardware capabilities like "mmx". + * We supply "nosegneg" as the fake capability, to indicate that we + * do not like negative offsets in instructions using segment overrides, + * since we implement those inefficiently. This makes it possible to + * install libraries optimized to avoid those access patterns in someplace + * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file + * corresponding to the bits here is needed to make ldconfig work right. + * It should contain: + * hwcap 1 nosegneg + * to match the mapping of bit to name that we give here. + * + * At runtime, the fake hardware feature will be considered to be present + * if its bit is set in the mask word. So, we start with the mask 0, and + * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen. + */ - ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0) - .long LINUX_VERSION_CODE - ASM_ELF_NOTE_END +#include "../xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */ + + .globl VDSO_NOTE_MASK +ELFNOTE_START(GNU, 2, "a") + .long 1 /* ncaps */ +VDSO_NOTE_MASK: + .long 0 /* mask */ + .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */ +ELFNOTE_END +#endif diff --git a/arch/i386/lib/Makefile b/arch/i386/lib/Makefile index 22d8ac5815f0..4d105fdfe817 100644 --- a/arch/i386/lib/Makefile +++ b/arch/i386/lib/Makefile @@ -4,7 +4,7 @@ lib-y = checksum.o delay.o usercopy.o getuser.o putuser.o memcpy.o strstr.o \ - bitops.o semaphore.o + bitops.o semaphore.o string.o lib-$(CONFIG_X86_USE_3DNOW) += mmx.o diff --git a/arch/i386/lib/string.c b/arch/i386/lib/string.c new file mode 100644 index 000000000000..2c773fefa3dd --- /dev/null +++ b/arch/i386/lib/string.c @@ -0,0 +1,257 @@ +/* + * Most of the string-functions are rather heavily hand-optimized, + * see especially strsep,strstr,str[c]spn. They should work, but are not + * very easy to understand. Everything is done entirely within the register + * set, making the functions fast and clean. String instructions have been + * used through-out, making for "slightly" unclear code :-) + * + * AK: On P4 and K7 using non string instruction implementations might be faster + * for large memory blocks. But most of them are unlikely to be used on large + * strings. 
+ */ + +#include <linux/string.h> +#include <linux/module.h> + +#ifdef __HAVE_ARCH_STRCPY +char *strcpy(char * dest,const char *src) +{ + int d0, d1, d2; + asm volatile( "1:\tlodsb\n\t" + "stosb\n\t" + "testb %%al,%%al\n\t" + "jne 1b" + : "=&S" (d0), "=&D" (d1), "=&a" (d2) + :"0" (src),"1" (dest) : "memory"); + return dest; +} +EXPORT_SYMBOL(strcpy); +#endif + +#ifdef __HAVE_ARCH_STRNCPY +char *strncpy(char * dest,const char *src,size_t count) +{ + int d0, d1, d2, d3; + asm volatile( "1:\tdecl %2\n\t" + "js 2f\n\t" + "lodsb\n\t" + "stosb\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n\t" + "rep\n\t" + "stosb\n" + "2:" + : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3) + :"0" (src),"1" (dest),"2" (count) : "memory"); + return dest; +} +EXPORT_SYMBOL(strncpy); +#endif + +#ifdef __HAVE_ARCH_STRCAT +char *strcat(char * dest,const char * src) +{ + int d0, d1, d2, d3; + asm volatile( "repne\n\t" + "scasb\n\t" + "decl %1\n" + "1:\tlodsb\n\t" + "stosb\n\t" + "testb %%al,%%al\n\t" + "jne 1b" + : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3) + : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory"); + return dest; +} +EXPORT_SYMBOL(strcat); +#endif + +#ifdef __HAVE_ARCH_STRNCAT +char *strncat(char * dest,const char * src,size_t count) +{ + int d0, d1, d2, d3; + asm volatile( "repne\n\t" + "scasb\n\t" + "decl %1\n\t" + "movl %8,%3\n" + "1:\tdecl %3\n\t" + "js 2f\n\t" + "lodsb\n\t" + "stosb\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n" + "2:\txorl %2,%2\n\t" + "stosb" + : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3) + : "0" (src),"1" (dest),"2" (0),"3" (0xffffffffu), "g" (count) + : "memory"); + return dest; +} +EXPORT_SYMBOL(strncat); +#endif + +#ifdef __HAVE_ARCH_STRCMP +int strcmp(const char * cs,const char * ct) +{ + int d0, d1; + int res; + asm volatile( "1:\tlodsb\n\t" + "scasb\n\t" + "jne 2f\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n\t" + "xorl %%eax,%%eax\n\t" + "jmp 3f\n" + "2:\tsbbl %%eax,%%eax\n\t" + "orb $1,%%al\n" + "3:" + :"=a" (res), "=&S" (d0), "=&D" (d1) + :"1" (cs),"2" (ct) + :"memory"); + return res; +} +EXPORT_SYMBOL(strcmp); +#endif + +#ifdef __HAVE_ARCH_STRNCMP +int strncmp(const char * cs,const char * ct,size_t count) +{ + int res; + int d0, d1, d2; + asm volatile( "1:\tdecl %3\n\t" + "js 2f\n\t" + "lodsb\n\t" + "scasb\n\t" + "jne 3f\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n" + "2:\txorl %%eax,%%eax\n\t" + "jmp 4f\n" + "3:\tsbbl %%eax,%%eax\n\t" + "orb $1,%%al\n" + "4:" + :"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2) + :"1" (cs),"2" (ct),"3" (count) + :"memory"); + return res; +} +EXPORT_SYMBOL(strncmp); +#endif + +#ifdef __HAVE_ARCH_STRCHR +char *strchr(const char * s, int c) +{ + int d0; + char * res; + asm volatile( "movb %%al,%%ah\n" + "1:\tlodsb\n\t" + "cmpb %%ah,%%al\n\t" + "je 2f\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n\t" + "movl $1,%1\n" + "2:\tmovl %1,%0\n\t" + "decl %0" + :"=a" (res), "=&S" (d0) + :"1" (s),"0" (c) + :"memory"); + return res; +} +EXPORT_SYMBOL(strchr); +#endif + +#ifdef __HAVE_ARCH_STRRCHR +char *strrchr(const char * s, int c) +{ + int d0, d1; + char * res; + asm volatile( "movb %%al,%%ah\n" + "1:\tlodsb\n\t" + "cmpb %%ah,%%al\n\t" + "jne 2f\n\t" + "leal -1(%%esi),%0\n" + "2:\ttestb %%al,%%al\n\t" + "jne 1b" + :"=g" (res), "=&S" (d0), "=&a" (d1) + :"0" (0),"1" (s),"2" (c) + :"memory"); + return res; +} +EXPORT_SYMBOL(strrchr); +#endif + +#ifdef __HAVE_ARCH_STRLEN +size_t strlen(const char * s) +{ + int d0; + int res; + asm volatile( "repne\n\t" + "scasb\n\t" + "notl %0\n\t" + "decl %0" + :"=c" (res), "=&D" (d0) + :"1" (s),"a" (0), "0" (0xffffffffu) + 
:"memory"); + return res; +} +EXPORT_SYMBOL(strlen); +#endif + +#ifdef __HAVE_ARCH_MEMCHR +void *memchr(const void *cs,int c,size_t count) +{ + int d0; + void *res; + if (!count) + return NULL; + asm volatile( "repne\n\t" + "scasb\n\t" + "je 1f\n\t" + "movl $1,%0\n" + "1:\tdecl %0" + :"=D" (res), "=&c" (d0) + :"a" (c),"0" (cs),"1" (count) + :"memory"); + return res; +} +EXPORT_SYMBOL(memchr); +#endif + +#ifdef __HAVE_ARCH_MEMSCAN +void *memscan(void * addr, int c, size_t size) +{ + if (!size) + return addr; + asm volatile("repnz; scasb\n\t" + "jnz 1f\n\t" + "dec %%edi\n" + "1:" + : "=D" (addr), "=c" (size) + : "0" (addr), "1" (size), "a" (c) + : "memory"); + return addr; +} +EXPORT_SYMBOL(memscan); +#endif + +#ifdef __HAVE_ARCH_STRNLEN +size_t strnlen(const char *s, size_t count) +{ + int d0; + int res; + asm volatile( "movl %2,%0\n\t" + "jmp 2f\n" + "1:\tcmpb $0,(%0)\n\t" + "je 3f\n\t" + "incl %0\n" + "2:\tdecl %1\n\t" + "cmpl $-1,%1\n\t" + "jne 1b\n" + "3:\tsubl %2,%0" + :"=a" (res), "=&d" (d0) + :"c" (s),"1" (count) + :"memory"); + return res; +} +EXPORT_SYMBOL(strnlen); +#endif diff --git a/arch/i386/mach-generic/es7000.c b/arch/i386/mach-generic/es7000.c index b47f951c0ec2..4742626f08c4 100644 --- a/arch/i386/mach-generic/es7000.c +++ b/arch/i386/mach-generic/es7000.c @@ -66,4 +66,4 @@ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) } #endif -struct genapic apic_es7000 = APIC_INIT("es7000", probe_es7000); +struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000); diff --git a/arch/i386/mach-voyager/voyager_thread.c b/arch/i386/mach-voyager/voyager_thread.c index b4b24e0e45e1..f9d595338159 100644 --- a/arch/i386/mach-voyager/voyager_thread.c +++ b/arch/i386/mach-voyager/voyager_thread.c @@ -52,7 +52,7 @@ execute(const char *string) NULL, }; - if ((ret = call_usermodehelper(argv[0], argv, envp, 1)) != 0) { + if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) { printk(KERN_ERR "Voyager failed to run \"%s\": %i\n", string, ret); } diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index 1ecb3e43b523..01ffdd4964f0 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -283,6 +283,8 @@ static inline int vmalloc_fault(unsigned long address) return 0; } +int show_unhandled_signals = 1; + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -303,6 +305,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, struct vm_area_struct * vma; unsigned long address; int write, si_code; + int fault; /* get the address */ address = read_cr2(); @@ -422,20 +425,18 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - switch (handle_mm_fault(mm, vma, address, write)) { - case VM_FAULT_MINOR: - tsk->min_flt++; - break; - case VM_FAULT_MAJOR: - tsk->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto do_sigbus; - case VM_FAULT_OOM: + fault = handle_mm_fault(mm, vma, address, write); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) goto out_of_memory; - default: - BUG(); + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; + BUG(); } + if (fault & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; /* * Did it hit the DOS screen memory VA from vm86 mode? 
@@ -470,6 +471,14 @@ bad_area_nosemaphore: if (is_prefetch(regs, address, error_code)) return; + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk("%s%s[%d]: segfault at %08lx eip %08lx " + "esp %08lx error %lx\n", + tsk->pid > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, tsk->pid, address, regs->eip, + regs->esp, error_code); + } tsk->thread.cr2 = address; /* Kernel addresses are always protection faults */ tsk->thread.error_code = error_code | (address >= TASK_SIZE); diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 7135946d3663..c3b9905af2d5 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -87,7 +87,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); - paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT); + paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); BUG_ON(page_table != pte_offset_kernel(pmd, 0)); } @@ -471,8 +471,13 @@ void zap_low_mappings (void) flush_tlb_all(); } +int nx_enabled = 0; + +#ifdef CONFIG_X86_PAE + static int disable_nx __initdata = 0; u64 __supported_pte_mask __read_mostly = ~_PAGE_NX; +EXPORT_SYMBOL_GPL(__supported_pte_mask); /* * noexec = on|off @@ -499,9 +504,6 @@ static int __init noexec_setup(char *str) } early_param("noexec", noexec_setup); -int nx_enabled = 0; -#ifdef CONFIG_X86_PAE - static void __init set_nx(void) { unsigned int v[4], l, h; @@ -751,8 +753,7 @@ void __init pgtable_cache_init(void) PTRS_PER_PMD*sizeof(pmd_t), PTRS_PER_PMD*sizeof(pmd_t), SLAB_PANIC, - pmd_ctor, - NULL); + pmd_ctor); if (!SHARED_KERNEL_PMD) { /* If we're in PAE mode and have a non-shared kernel pmd, then the pgd size must be a @@ -799,17 +800,9 @@ void mark_rodata_ro(void) unsigned long start = PFN_ALIGN(_text); unsigned long size = PFN_ALIGN(_etext) - start; -#ifndef CONFIG_KPROBES -#ifdef CONFIG_HOTPLUG_CPU - /* It must still be possible to apply SMP alternatives. */ - if (num_possible_cpus() <= 1) -#endif - { - change_page_attr(virt_to_page(start), - size >> PAGE_SHIFT, PAGE_KERNEL_RX); - printk("Write protecting the kernel text: %luk\n", size >> 10); - } -#endif + change_page_attr(virt_to_page(start), + size >> PAGE_SHIFT, PAGE_KERNEL_RX); + printk("Write protecting the kernel text: %luk\n", size >> 10); start += size; size = (unsigned long)__end_rodata - start; change_page_attr(virt_to_page(start), diff --git a/arch/i386/mm/ioremap.c b/arch/i386/mm/ioremap.c index fff08ae7b5ed..0b278315d737 100644 --- a/arch/i386/mm/ioremap.c +++ b/arch/i386/mm/ioremap.c @@ -196,7 +196,7 @@ void iounmap(volatile void __iomem *addr) /* Reset the direct mapping. Can block */ if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) { change_page_attr(virt_to_page(__va(p->phys_addr)), - p->size >> PAGE_SHIFT, + get_vm_area_size(p) >> PAGE_SHIFT, PAGE_KERNEL); global_flush_tlb(); } diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index 2eb14a73be9c..8927222b3ab2 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -60,7 +60,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); - paravirt_alloc_pt(page_to_pfn(base)); + paravirt_alloc_pt(&init_mm, page_to_pfn(base)); for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, addr == address ? 
prot : ref_prot)); @@ -82,7 +82,7 @@ static void flush_kernel_map(void *arg) struct page *p; /* High level code is not ready for clflush yet */ - if (0 && cpu_has_clflush) { + if (cpu_has_clflush) { list_for_each_entry (p, lh, lru) cache_flush_page(p); } else if (boot_cpu_data.x86_model >= 4) @@ -136,6 +136,12 @@ static inline void revert_page(struct page *kpte_page, unsigned long address) ref_prot)); } +static inline void save_page(struct page *kpte_page) +{ + if (!test_and_set_bit(PG_arch_1, &kpte_page->flags)) + list_add(&kpte_page->lru, &df_list); +} + static int __change_page_attr(struct page *page, pgprot_t prot) { @@ -150,6 +156,9 @@ __change_page_attr(struct page *page, pgprot_t prot) if (!kpte) return -EINVAL; kpte_page = virt_to_page(kpte); + BUG_ON(PageLRU(kpte_page)); + BUG_ON(PageCompound(kpte_page)); + if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { if (!pte_huge(*kpte)) { set_pte_atomic(kpte, mk_pte(page, prot)); @@ -179,11 +188,11 @@ __change_page_attr(struct page *page, pgprot_t prot) * time (not via split_large_page) and in turn we must not * replace it with a largepage. */ + + save_page(kpte_page); if (!PageReserved(kpte_page)) { if (cpu_has_pse && (page_private(kpte_page) == 0)) { - ClearPagePrivate(kpte_page); paravirt_release_pt(page_to_pfn(kpte_page)); - list_add(&kpte_page->lru, &df_list); revert_page(kpte_page, address); } } @@ -236,6 +245,11 @@ void global_flush_tlb(void) spin_unlock_irq(&cpa_lock); flush_map(&l); list_for_each_entry_safe(pg, next, &l, lru) { + list_del(&pg->lru); + clear_bit(PG_arch_1, &pg->flags); + if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0) + continue; + ClearPagePrivate(pg); __free_page(pg); } } diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 8d7c0864cc04..01437c46baae 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -235,7 +235,7 @@ static inline void pgd_list_del(pgd_t *pgd) #if (PTRS_PER_PMD == 1) /* Non-PAE pgd constructor */ -void pgd_ctor(void *pgd) +static void pgd_ctor(void *pgd) { unsigned long flags; @@ -257,7 +257,7 @@ void pgd_ctor(void *pgd) } #else /* PTRS_PER_PMD > 1 */ /* PAE pgd constructor */ -void pgd_ctor(void *pgd) +static void pgd_ctor(void *pgd) { /* PAE, kernel PMD may be shared */ @@ -276,7 +276,7 @@ void pgd_ctor(void *pgd) } #endif /* PTRS_PER_PMD */ -void pgd_dtor(void *pgd) +static void pgd_dtor(void *pgd) { unsigned long flags; /* can be called from interrupt context */ diff --git a/arch/i386/pci/acpi.c b/arch/i386/pci/acpi.c index b33aea845f58..bc8a44bddaa7 100644 --- a/arch/i386/pci/acpi.c +++ b/arch/i386/pci/acpi.c @@ -8,20 +8,42 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int domain, int busnum) { struct pci_bus *bus; + struct pci_sysdata *sd; + int pxm; + + /* Allocate per-root-bus (not per bus) arch-specific data. + * TODO: leak; this memory is never freed. + * It's arguable whether it's worth the trouble to care. 
+ */ + sd = kzalloc(sizeof(*sd), GFP_KERNEL); + if (!sd) { + printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum); + return NULL; + } if (domain != 0) { printk(KERN_WARNING "PCI: Multiple domains not supported\n"); + kfree(sd); return NULL; } - bus = pcibios_scan_root(busnum); + sd->node = -1; + + pxm = acpi_get_pxm(device->handle); +#ifdef CONFIG_ACPI_NUMA + if (pxm >= 0) + sd->node = pxm_to_node(pxm); +#endif + + bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); + if (!bus) + kfree(sd); + #ifdef CONFIG_ACPI_NUMA if (bus != NULL) { - int pxm = acpi_get_pxm(device->handle); if (pxm >= 0) { - bus->sysdata = (void *)(unsigned long)pxm_to_node(pxm); - printk("bus %d -> pxm %d -> node %ld\n", - busnum, pxm, (long)(bus->sysdata)); + printk("bus %d -> pxm %d -> node %d\n", + busnum, pxm, sd->node); } } #endif diff --git a/arch/i386/pci/common.c b/arch/i386/pci/common.c index 3f78d4d8ecf3..85503deeda46 100644 --- a/arch/i386/pci/common.c +++ b/arch/i386/pci/common.c @@ -293,6 +293,7 @@ static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = { struct pci_bus * __devinit pcibios_scan_root(int busnum) { struct pci_bus *bus = NULL; + struct pci_sysdata *sd; dmi_check_system(pciprobe_dmi_table); @@ -303,9 +304,19 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum) } } + /* Allocate per-root-bus (not per bus) arch-specific data. + * TODO: leak; this memory is never freed. + * It's arguable whether it's worth the trouble to care. + */ + sd = kzalloc(sizeof(*sd), GFP_KERNEL); + if (!sd) { + printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum); + return NULL; + } + printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum); - return pci_scan_bus_parented(NULL, busnum, &pci_root_ops, NULL); + return pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); } extern u8 pci_cache_line_size; diff --git a/arch/i386/pci/mmconfig-shared.c b/arch/i386/pci/mmconfig-shared.c index c7cabeed4d7b..4df637e34f81 100644 --- a/arch/i386/pci/mmconfig-shared.c +++ b/arch/i386/pci/mmconfig-shared.c @@ -24,6 +24,9 @@ DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS); +/* Indicate if the mmcfg resources have been placed into the resource table. */ +static int __initdata pci_mmcfg_resources_inserted; + /* K8 systems have some devices (typically in the builtin northbridge) that are only accessible using type1 Normally this can be expressed in the MCFG by not listing them @@ -170,7 +173,7 @@ static int __init pci_mmcfg_check_hostbridge(void) return name != NULL; } -static void __init pci_mmcfg_insert_resources(void) +static void __init pci_mmcfg_insert_resources(unsigned long resource_flags) { #define PCI_MMCFG_RESOURCE_NAME_LEN 19 int i; @@ -194,10 +197,13 @@ static void __init pci_mmcfg_insert_resources(void) cfg->pci_segment); res->start = cfg->address; res->end = res->start + (num_buses << 20) - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res->flags = IORESOURCE_MEM | resource_flags; insert_resource(&iomem_resource, res); names += PCI_MMCFG_RESOURCE_NAME_LEN; } + + /* Mark that the resources have been inserted. 
*/ + pci_mmcfg_resources_inserted = 1; } static void __init pci_mmcfg_reject_broken(int type) @@ -267,7 +273,43 @@ void __init pci_mmcfg_init(int type) if (type == 1) unreachable_devices(); if (known_bridge) - pci_mmcfg_insert_resources(); + pci_mmcfg_insert_resources(IORESOURCE_BUSY); pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; + } else { + /* + * Signal not to attempt to insert mmcfg resources because + * the architecture mmcfg setup could not initialize. + */ + pci_mmcfg_resources_inserted = 1; } } + +static int __init pci_mmcfg_late_insert_resources(void) +{ + /* + * If resources are already inserted or we are not using MMCONFIG, + * don't insert the resources. + */ + if ((pci_mmcfg_resources_inserted == 1) || + (pci_probe & PCI_PROBE_MMCONF) == 0 || + (pci_mmcfg_config_num == 0) || + (pci_mmcfg_config == NULL) || + (pci_mmcfg_config[0].address == 0)) + return 1; + + /* + * Attempt to insert the mmcfg resources but not with the busy flag + * marked so it won't cause request errors when __request_region is + * called. + */ + pci_mmcfg_insert_resources(0); + + return 0; +} + +/* + * Perform MMCONFIG resource insertion after PCI initialization to allow for + * misprogrammed MCFG tables that state larger sizes but actually conflict + * with other system resources. + */ +late_initcall(pci_mmcfg_late_insert_resources); diff --git a/arch/i386/video/Makefile b/arch/i386/video/Makefile new file mode 100644 index 000000000000..2c447c94adcc --- /dev/null +++ b/arch/i386/video/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_FB) += fbdev.o diff --git a/arch/i386/video/fbdev.c b/arch/i386/video/fbdev.c new file mode 100644 index 000000000000..48fb38d7d2c0 --- /dev/null +++ b/arch/i386/video/fbdev.c @@ -0,0 +1,32 @@ +/* + * arch/i386/video/fbdev.c - i386 Framebuffer + * + * Copyright (C) 2007 Antonino Daplas <adaplas@gmail.com> + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive + * for more details. + * + */ +#include <linux/fb.h> +#include <linux/pci.h> + +int fb_is_primary_device(struct fb_info *info) +{ + struct device *device = info->device; + struct pci_dev *pci_dev = NULL; + struct resource *res = NULL; + int retval = 0; + + if (device) + pci_dev = to_pci_dev(device); + + if (pci_dev) + res = &pci_dev->resource[PCI_ROM_RESOURCE]; + + if (res && res->flags & IORESOURCE_ROM_SHADOW) + retval = 1; + + return retval; +} +EXPORT_SYMBOL(fb_is_primary_device); diff --git a/arch/i386/xen/Kconfig b/arch/i386/xen/Kconfig new file mode 100644 index 000000000000..9df99e1885a4 --- /dev/null +++ b/arch/i386/xen/Kconfig @@ -0,0 +1,11 @@ +# +# This Kconfig describes xen options +# + +config XEN + bool "Enable support for Xen hypervisor" + depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES + help + This is the Linux Xen port. Enabling this will allow the + kernel to boot in a paravirtualized environment under the + Xen hypervisor. diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile new file mode 100644 index 000000000000..343df246bd3e --- /dev/null +++ b/arch/i386/xen/Makefile @@ -0,0 +1,4 @@ +obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \ + events.o time.o manage.o xen-asm.o + +obj-$(CONFIG_SMP) += smp.o diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c new file mode 100644 index 000000000000..9a8c1181c001 --- /dev/null +++ b/arch/i386/xen/enlighten.c @@ -0,0 +1,1144 @@ +/* + * Core of Xen paravirt_ops implementation. 
+ * + * This file contains the xen_paravirt_ops structure itself, and the + * implementations for: + * - privileged instructions + * - interrupt flags + * - segment operations + * - booting and setup + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/preempt.h> +#include <linux/hardirq.h> +#include <linux/percpu.h> +#include <linux/delay.h> +#include <linux/start_kernel.h> +#include <linux/sched.h> +#include <linux/bootmem.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/page-flags.h> +#include <linux/highmem.h> +#include <linux/smp.h> + +#include <xen/interface/xen.h> +#include <xen/interface/physdev.h> +#include <xen/interface/vcpu.h> +#include <xen/interface/sched.h> +#include <xen/features.h> +#include <xen/page.h> + +#include <asm/paravirt.h> +#include <asm/page.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> +#include <asm/fixmap.h> +#include <asm/processor.h> +#include <asm/setup.h> +#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/reboot.h> + +#include "xen-ops.h" +#include "mmu.h" +#include "multicalls.h" + +EXPORT_SYMBOL_GPL(hypercall_page); + +DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); + +DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); +DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); +DEFINE_PER_CPU(unsigned long, xen_cr3); + +struct start_info *xen_start_info; +EXPORT_SYMBOL_GPL(xen_start_info); + +static /* __initdata */ struct shared_info dummy_shared_info; + +/* + * Point at some empty memory to start with. We map the real shared_info + * page as soon as fixmap is up and running. + */ +struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info; + +/* + * Flag to determine whether vcpu info placement is available on all + * VCPUs. We assume it is to start with, and then set it to zero on + * the first failure. This is because it can succeed on some VCPUs + * and not others, since it can involve hypervisor memory allocation, + * or because the guest failed to guarantee all the appropriate + * constraints on all VCPUs (ie buffer can't cross a page boundary). + * + * Note that any particular CPU may be using a placed vcpu structure, + * but we can only optimise if the all are. + * + * 0: not available, 1: available + */ +static int have_vcpu_info_placement = 1; + +static void __init xen_vcpu_setup(int cpu) +{ + struct vcpu_register_vcpu_info info; + int err; + struct vcpu_info *vcpup; + + per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + + if (!have_vcpu_info_placement) + return; /* already tested, not available */ + + vcpup = &per_cpu(xen_vcpu_info, cpu); + + info.mfn = virt_to_mfn(vcpup); + info.offset = offset_in_page(vcpup); + + printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n", + cpu, vcpup, info.mfn, info.offset); + + /* Check to see if the hypervisor will put the vcpu_info + structure where we want it, which allows direct access via + a percpu-variable. */ + err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info); + + if (err) { + printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); + have_vcpu_info_placement = 0; + } else { + /* This cpu is using the registered vcpu info, even if + later ones fail to. 
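+		   (With placement, the *_direct variants installed later by
+		   xen_setup_vcpu_info_placement() can touch the upcall flags
+		   with a single percpu access, roughly the equivalent of
+			__get_cpu_var(xen_vcpu_info).evtchn_upcall_mask = 1;
+		   instead of first loading the xen_vcpu pointer and then
+		   dereferencing it. Illustrative description only; the real
+		   fast paths live in the xen-asm stubs.)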
*/ + per_cpu(xen_vcpu, cpu) = vcpup; + + printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n", + cpu, vcpup); + } +} + +static void __init xen_banner(void) +{ + printk(KERN_INFO "Booting paravirtualized kernel on %s\n", + paravirt_ops.name); + printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); +} + +static void xen_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + unsigned maskedx = ~0; + + /* + * Mask out inconvenient features, to try and disable as many + * unsupported kernel subsystems as possible. + */ + if (*eax == 1) + maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ + (1 << X86_FEATURE_ACPI) | /* disable ACPI */ + (1 << X86_FEATURE_ACC)); /* thermal monitoring */ + + asm(XEN_EMULATE_PREFIX "cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); + *edx &= maskedx; +} + +static void xen_set_debugreg(int reg, unsigned long val) +{ + HYPERVISOR_set_debugreg(reg, val); +} + +static unsigned long xen_get_debugreg(int reg) +{ + return HYPERVISOR_get_debugreg(reg); +} + +static unsigned long xen_save_fl(void) +{ + struct vcpu_info *vcpu; + unsigned long flags; + + vcpu = x86_read_percpu(xen_vcpu); + + /* flag has opposite sense of mask */ + flags = !vcpu->evtchn_upcall_mask; + + /* convert to IF type flag + -0 -> 0x00000000 + -1 -> 0xffffffff + */ + return (-flags) & X86_EFLAGS_IF; +} + +static void xen_restore_fl(unsigned long flags) +{ + struct vcpu_info *vcpu; + + /* convert from IF type flag */ + flags = !(flags & X86_EFLAGS_IF); + + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); + vcpu = x86_read_percpu(xen_vcpu); + vcpu->evtchn_upcall_mask = flags; + preempt_enable_no_resched(); + + /* Doesn't matter if we get preempted here, because any + pending event will get dealt with anyway. */ + + if (flags == 0) { + preempt_check_resched(); + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + force_evtchn_callback(); + } +} + +static void xen_irq_disable(void) +{ + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); + x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; + preempt_enable_no_resched(); +} + +static void xen_irq_enable(void) +{ + struct vcpu_info *vcpu; + + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); + vcpu = x86_read_percpu(xen_vcpu); + vcpu->evtchn_upcall_mask = 0; + preempt_enable_no_resched(); + + /* Doesn't matter if we get preempted here, because any + pending event will get dealt with anyway. */ + + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + force_evtchn_callback(); +} + +static void xen_safe_halt(void) +{ + /* Blocking includes an implicit local_irq_enable(). 
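+	   A rough usage sketch (illustrative, not taken from this patch):
+	   an idle path can do
+		local_irq_disable();
+		if (!need_resched())
+			safe_halt();	/* paravirt -> xen_safe_halt(),
+					   returns with IRQs enabled */
+		else
+			local_irq_enable();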
*/ + if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0) + BUG(); +} + +static void xen_halt(void) +{ + if (irqs_disabled()) + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); + else + xen_safe_halt(); +} + +static void xen_set_lazy_mode(enum paravirt_lazy_mode mode) +{ + BUG_ON(preemptible()); + + switch (mode) { + case PARAVIRT_LAZY_NONE: + BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE); + break; + + case PARAVIRT_LAZY_MMU: + case PARAVIRT_LAZY_CPU: + BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE); + break; + + case PARAVIRT_LAZY_FLUSH: + /* flush if necessary, but don't change state */ + if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE) + xen_mc_flush(); + return; + } + + xen_mc_flush(); + x86_write_percpu(xen_lazy_mode, mode); +} + +static unsigned long xen_store_tr(void) +{ + return 0; +} + +static void xen_set_ldt(const void *addr, unsigned entries) +{ + unsigned long linear_addr = (unsigned long)addr; + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_SET_LDT; + if (linear_addr) { + /* ldt my be vmalloced, use arbitrary_virt_to_machine */ + xmaddr_t maddr; + maddr = arbitrary_virt_to_machine((unsigned long)addr); + linear_addr = (unsigned long)maddr.maddr; + } + op->arg1.linear_addr = linear_addr; + op->arg2.nr_ents = entries; + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + +static void xen_load_gdt(const struct Xgt_desc_struct *dtr) +{ + unsigned long *frames; + unsigned long va = dtr->address; + unsigned int size = dtr->size + 1; + unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + int f; + struct multicall_space mcs; + + /* A GDT can be up to 64k in size, which corresponds to 8192 + 8-byte entries, or 16 4k pages.. */ + + BUG_ON(size > 65536); + BUG_ON(va & ~PAGE_MASK); + + mcs = xen_mc_entry(sizeof(*frames) * pages); + frames = mcs.args; + + for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { + frames[f] = virt_to_mfn(va); + make_lowmem_page_readonly((void *)va); + } + + MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); + + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + +static void load_TLS_descriptor(struct thread_struct *t, + unsigned int cpu, unsigned int i) +{ + struct desc_struct *gdt = get_cpu_gdt_table(cpu); + xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); + struct multicall_space mc = __xen_mc_entry(0); + + MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); +} + +static void xen_load_tls(struct thread_struct *t, unsigned int cpu) +{ + xen_mc_batch(); + + load_TLS_descriptor(t, cpu, 0); + load_TLS_descriptor(t, cpu, 1); + load_TLS_descriptor(t, cpu, 2); + + xen_mc_issue(PARAVIRT_LAZY_CPU); + + /* + * XXX sleazy hack: If we're being called in a lazy-cpu zone, + * it means we're in a context switch, and %gs has just been + * saved. This means we can zero it out to prevent faults on + * exit from the hypervisor if the next process has no %gs. + * Either way, it has been saved, and the new value will get + * loaded properly. This will go away as soon as Xen has been + * modified to not save/restore %gs for normal hypercalls. 
+ */ + if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU) + loadsegment(gs, 0); +} + +static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, + u32 low, u32 high) +{ + unsigned long lp = (unsigned long)&dt[entrynum]; + xmaddr_t mach_lp = virt_to_machine(lp); + u64 entry = (u64)high << 32 | low; + + preempt_disable(); + + xen_mc_flush(); + if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry)) + BUG(); + + preempt_enable(); +} + +static int cvt_gate_to_trap(int vector, u32 low, u32 high, + struct trap_info *info) +{ + u8 type, dpl; + + type = (high >> 8) & 0x1f; + dpl = (high >> 13) & 3; + + if (type != 0xf && type != 0xe) + return 0; + + info->vector = vector; + info->address = (high & 0xffff0000) | (low & 0x0000ffff); + info->cs = low >> 16; + info->flags = dpl; + /* interrupt gates clear IF */ + if (type == 0xe) + info->flags |= 4; + + return 1; +} + +/* Locations of each CPU's IDT */ +static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc); + +/* Set an IDT entry. If the entry is part of the current IDT, then + also update Xen. */ +static void xen_write_idt_entry(struct desc_struct *dt, int entrynum, + u32 low, u32 high) +{ + unsigned long p = (unsigned long)&dt[entrynum]; + unsigned long start, end; + + preempt_disable(); + + start = __get_cpu_var(idt_desc).address; + end = start + __get_cpu_var(idt_desc).size + 1; + + xen_mc_flush(); + + write_dt_entry(dt, entrynum, low, high); + + if (p >= start && (p + 8) <= end) { + struct trap_info info[2]; + + info[1].address = 0; + + if (cvt_gate_to_trap(entrynum, low, high, &info[0])) + if (HYPERVISOR_set_trap_table(info)) + BUG(); + } + + preempt_enable(); +} + +static void xen_convert_trap_info(const struct Xgt_desc_struct *desc, + struct trap_info *traps) +{ + unsigned in, out, count; + + count = (desc->size+1) / 8; + BUG_ON(count > 256); + + for (in = out = 0; in < count; in++) { + const u32 *entry = (u32 *)(desc->address + in * 8); + + if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) + out++; + } + traps[out].address = 0; +} + +void xen_copy_trap_info(struct trap_info *traps) +{ + const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc); + + xen_convert_trap_info(desc, traps); +} + +/* Load a new IDT into Xen. In principle this can be per-CPU, so we + hold a spinlock to protect the static traps[] array (static because + it avoids allocation, and saves stack space). */ +static void xen_load_idt(const struct Xgt_desc_struct *desc) +{ + static DEFINE_SPINLOCK(lock); + static struct trap_info traps[257]; + + spin_lock(&lock); + + __get_cpu_var(idt_desc) = *desc; + + xen_convert_trap_info(desc, traps); + + xen_mc_flush(); + if (HYPERVISOR_set_trap_table(traps)) + BUG(); + + spin_unlock(&lock); +} + +/* Write a GDT descriptor entry. Ignore LDT descriptors, since + they're handled differently. 
*/ +static void xen_write_gdt_entry(struct desc_struct *dt, int entry, + u32 low, u32 high) +{ + preempt_disable(); + + switch ((high >> 8) & 0xff) { + case DESCTYPE_LDT: + case DESCTYPE_TSS: + /* ignore */ + break; + + default: { + xmaddr_t maddr = virt_to_machine(&dt[entry]); + u64 desc = (u64)high << 32 | low; + + xen_mc_flush(); + if (HYPERVISOR_update_descriptor(maddr.maddr, desc)) + BUG(); + } + + } + + preempt_enable(); +} + +static void xen_load_esp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + struct multicall_space mcs = xen_mc_entry(0); + MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0); + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + +static void xen_set_iopl_mask(unsigned mask) +{ + struct physdev_set_iopl set_iopl; + + /* Force the change at ring 0. */ + set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; + HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); +} + +static void xen_io_delay(void) +{ +} + +#ifdef CONFIG_X86_LOCAL_APIC +static unsigned long xen_apic_read(unsigned long reg) +{ + return 0; +} + +static void xen_apic_write(unsigned long reg, unsigned long val) +{ + /* Warn to see if there's any stray references */ + WARN_ON(1); +} +#endif + +static void xen_flush_tlb(void) +{ + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_TLB_FLUSH_LOCAL; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} + +static void xen_flush_tlb_single(unsigned long addr) +{ + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_INVLPG_LOCAL; + op->arg1.linear_addr = addr & PAGE_MASK; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} + +static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm, + unsigned long va) +{ + struct { + struct mmuext_op op; + cpumask_t mask; + } *args; + cpumask_t cpumask = *cpus; + struct multicall_space mcs; + + /* + * A couple of (to be removed) sanity checks: + * + * - current CPU must not be in mask + * - mask must exist :) + */ + BUG_ON(cpus_empty(cpumask)); + BUG_ON(cpu_isset(smp_processor_id(), cpumask)); + BUG_ON(!mm); + + /* If a CPU which we ran on has gone down, OK. 
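+	   Note that the vcpu mask is copied into the multicall argument
+	   space (args->mask below) rather than pointed at on the stack,
+	   presumably so it stays valid until the batched MMUEXT hypercall
+	   is actually issued by the multicall machinery.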
*/ + cpus_and(cpumask, cpumask, cpu_online_map); + if (cpus_empty(cpumask)) + return; + + mcs = xen_mc_entry(sizeof(*args)); + args = mcs.args; + args->mask = cpumask; + args->op.arg2.vcpumask = &args->mask; + + if (va == TLB_FLUSH_ALL) { + args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; + } else { + args->op.cmd = MMUEXT_INVLPG_MULTI; + args->op.arg1.linear_addr = va; + } + + MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} + +static void xen_write_cr2(unsigned long cr2) +{ + x86_read_percpu(xen_vcpu)->arch.cr2 = cr2; +} + +static unsigned long xen_read_cr2(void) +{ + return x86_read_percpu(xen_vcpu)->arch.cr2; +} + +static unsigned long xen_read_cr2_direct(void) +{ + return x86_read_percpu(xen_vcpu_info.arch.cr2); +} + +static void xen_write_cr4(unsigned long cr4) +{ + /* never allow TSC to be disabled */ + native_write_cr4(cr4 & ~X86_CR4_TSD); +} + +static unsigned long xen_read_cr3(void) +{ + return x86_read_percpu(xen_cr3); +} + +static void xen_write_cr3(unsigned long cr3) +{ + BUG_ON(preemptible()); + + if (cr3 == x86_read_percpu(xen_cr3)) { + /* just a simple tlb flush */ + xen_flush_tlb(); + return; + } + + x86_write_percpu(xen_cr3, cr3); + + + { + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); + + op = mcs.args; + op->cmd = MMUEXT_NEW_BASEPTR; + op->arg1.mfn = mfn; + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_CPU); + } +} + +/* Early in boot, while setting up the initial pagetable, assume + everything is pinned. */ +static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) +{ + BUG_ON(mem_map); /* should only be used early */ + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); +} + +/* This needs to make sure the new pte page is pinned iff its being + attached to a pinned pagetable. */ +static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) +{ + struct page *page = pfn_to_page(pfn); + + if (PagePinned(virt_to_page(mm->pgd))) { + SetPagePinned(page); + + if (!PageHighMem(page)) + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); + else + /* make sure there are no stray mappings of + this page */ + kmap_flush_unused(); + } +} + +/* This should never happen until we're OK to use struct page */ +static void xen_release_pt(u32 pfn) +{ + struct page *page = pfn_to_page(pfn); + + if (PagePinned(page)) { + if (!PageHighMem(page)) + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + } +} + +#ifdef CONFIG_HIGHPTE +static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) +{ + pgprot_t prot = PAGE_KERNEL; + + if (PagePinned(page)) + prot = PAGE_KERNEL_RO; + + if (0 && PageHighMem(page)) + printk("mapping highpte %lx type %d prot %s\n", + page_to_pfn(page), type, + (unsigned long)pgprot_val(prot) & _PAGE_RW ? 
"WRITE" : "READ"); + + return kmap_atomic_prot(page, type, prot); +} +#endif + +static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) +{ + /* If there's an existing pte, then don't allow _PAGE_RW to be set */ + if (pte_val_ma(*ptep) & _PAGE_PRESENT) + pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & + pte_val_ma(pte)); + + return pte; +} + +/* Init-time set_pte while constructing initial pagetables, which + doesn't allow RO pagetable pages to be remapped RW */ +static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) +{ + pte = mask_rw_pte(ptep, pte); + + xen_set_pte(ptep, pte); +} + +static __init void xen_pagetable_setup_start(pgd_t *base) +{ + pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; + + /* special set_pte for pagetable initialization */ + paravirt_ops.set_pte = xen_set_pte_init; + + init_mm.pgd = base; + /* + * copy top-level of Xen-supplied pagetable into place. For + * !PAE we can use this as-is, but for PAE it is a stand-in + * while we copy the pmd pages. + */ + memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); + + if (PTRS_PER_PMD > 1) { + int i; + /* + * For PAE, need to allocate new pmds, rather than + * share Xen's, since Xen doesn't like pmd's being + * shared between address spaces. + */ + for (i = 0; i < PTRS_PER_PGD; i++) { + if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { + pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); + + memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), + PAGE_SIZE); + + make_lowmem_page_readonly(pmd); + + set_pgd(&base[i], __pgd(1 + __pa(pmd))); + } else + pgd_clear(&base[i]); + } + } + + /* make sure zero_page is mapped RO so we can use it in pagetables */ + make_lowmem_page_readonly(empty_zero_page); + make_lowmem_page_readonly(base); + /* + * Switch to new pagetable. This is done before + * pagetable_init has done anything so that the new pages + * added to the table can be prepared properly for Xen. + */ + xen_write_cr3(__pa(base)); +} + +static __init void xen_pagetable_setup_done(pgd_t *base) +{ + /* This will work as long as patching hasn't happened yet + (which it hasn't) */ + paravirt_ops.alloc_pt = xen_alloc_pt; + paravirt_ops.set_pte = xen_set_pte; + + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* + * Create a mapping for the shared info page. + * Should be set_fixmap(), but shared_info is a machine + * address with no corresponding pseudo-phys address. + */ + set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP), + PFN_DOWN(xen_start_info->shared_info), + PAGE_KERNEL); + + HYPERVISOR_shared_info = + (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); + + } else + HYPERVISOR_shared_info = + (struct shared_info *)__va(xen_start_info->shared_info); + + /* Actually pin the pagetable down, but we can't set PG_pinned + yet because the page structures don't exist yet. 
*/ + { + struct mmuext_op op; +#ifdef CONFIG_X86_PAE + op.cmd = MMUEXT_PIN_L3_TABLE; +#else + op.cmd = MMUEXT_PIN_L3_TABLE; +#endif + op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base))); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); + } +} + +/* This is called once we have the cpu_possible_map */ +void __init xen_setup_vcpu_info_placement(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + xen_vcpu_setup(cpu); + + /* xen_vcpu_setup managed to place the vcpu_info within the + percpu area for all cpus, so make use of it */ + if (have_vcpu_info_placement) { + printk(KERN_INFO "Xen: using vcpu_info placement\n"); + + paravirt_ops.save_fl = xen_save_fl_direct; + paravirt_ops.restore_fl = xen_restore_fl_direct; + paravirt_ops.irq_disable = xen_irq_disable_direct; + paravirt_ops.irq_enable = xen_irq_enable_direct; + paravirt_ops.read_cr2 = xen_read_cr2_direct; + paravirt_ops.iret = xen_iret_direct; + } +} + +static unsigned xen_patch(u8 type, u16 clobbers, void *insns, unsigned len) +{ + char *start, *end, *reloc; + unsigned ret; + + start = end = reloc = NULL; + +#define SITE(x) \ + case PARAVIRT_PATCH(x): \ + if (have_vcpu_info_placement) { \ + start = (char *)xen_##x##_direct; \ + end = xen_##x##_direct_end; \ + reloc = xen_##x##_direct_reloc; \ + } \ + goto patch_site + + switch (type) { + SITE(irq_enable); + SITE(irq_disable); + SITE(save_fl); + SITE(restore_fl); +#undef SITE + + patch_site: + if (start == NULL || (end-start) > len) + goto default_patch; + + ret = paravirt_patch_insns(insns, len, start, end); + + /* Note: because reloc is assigned from something that + appears to be an array, gcc assumes it's non-null, + but doesn't know its relationship with start and + end. */ + if (reloc > start && reloc < end) { + int reloc_off = reloc - start; + long *relocp = (long *)(insns + reloc_off); + long delta = start - (char *)insns; + + *relocp += delta; + } + break; + + default_patch: + default: + ret = paravirt_patch_default(type, clobbers, insns, len); + break; + } + + return ret; +} + +static const struct paravirt_ops xen_paravirt_ops __initdata = { + .paravirt_enabled = 1, + .shared_kernel_pmd = 0, + + .name = "Xen", + .banner = xen_banner, + + .patch = xen_patch, + + .memory_setup = xen_memory_setup, + .arch_setup = xen_arch_setup, + .init_IRQ = xen_init_IRQ, + .post_allocator_init = xen_mark_init_mm_pinned, + + .time_init = xen_time_init, + .set_wallclock = xen_set_wallclock, + .get_wallclock = xen_get_wallclock, + .get_cpu_khz = xen_cpu_khz, + .sched_clock = xen_sched_clock, + + .cpuid = xen_cpuid, + + .set_debugreg = xen_set_debugreg, + .get_debugreg = xen_get_debugreg, + + .clts = native_clts, + + .read_cr0 = native_read_cr0, + .write_cr0 = native_write_cr0, + + .read_cr2 = xen_read_cr2, + .write_cr2 = xen_write_cr2, + + .read_cr3 = xen_read_cr3, + .write_cr3 = xen_write_cr3, + + .read_cr4 = native_read_cr4, + .read_cr4_safe = native_read_cr4_safe, + .write_cr4 = xen_write_cr4, + + .save_fl = xen_save_fl, + .restore_fl = xen_restore_fl, + .irq_disable = xen_irq_disable, + .irq_enable = xen_irq_enable, + .safe_halt = xen_safe_halt, + .halt = xen_halt, + .wbinvd = native_wbinvd, + + .read_msr = native_read_msr_safe, + .write_msr = native_write_msr_safe, + .read_tsc = native_read_tsc, + .read_pmc = native_read_pmc, + + .iret = (void *)&hypercall_page[__HYPERVISOR_iret], + .irq_enable_sysexit = NULL, /* never called */ + + .load_tr_desc = paravirt_nop, + .set_ldt = xen_set_ldt, + .load_gdt = xen_load_gdt, + .load_idt = xen_load_idt, + .load_tls = xen_load_tls, + + .store_gdt 
= native_store_gdt, + .store_idt = native_store_idt, + .store_tr = xen_store_tr, + + .write_ldt_entry = xen_write_ldt_entry, + .write_gdt_entry = xen_write_gdt_entry, + .write_idt_entry = xen_write_idt_entry, + .load_esp0 = xen_load_esp0, + + .set_iopl_mask = xen_set_iopl_mask, + .io_delay = xen_io_delay, + +#ifdef CONFIG_X86_LOCAL_APIC + .apic_write = xen_apic_write, + .apic_write_atomic = xen_apic_write, + .apic_read = xen_apic_read, + .setup_boot_clock = paravirt_nop, + .setup_secondary_clock = paravirt_nop, + .startup_ipi_hook = paravirt_nop, +#endif + + .flush_tlb_user = xen_flush_tlb, + .flush_tlb_kernel = xen_flush_tlb, + .flush_tlb_single = xen_flush_tlb_single, + .flush_tlb_others = xen_flush_tlb_others, + + .pte_update = paravirt_nop, + .pte_update_defer = paravirt_nop, + + .pagetable_setup_start = xen_pagetable_setup_start, + .pagetable_setup_done = xen_pagetable_setup_done, + + .alloc_pt = xen_alloc_pt_init, + .release_pt = xen_release_pt, + .alloc_pd = paravirt_nop, + .alloc_pd_clone = paravirt_nop, + .release_pd = paravirt_nop, + +#ifdef CONFIG_HIGHPTE + .kmap_atomic_pte = xen_kmap_atomic_pte, +#endif + + .set_pte = NULL, /* see xen_pagetable_setup_* */ + .set_pte_at = xen_set_pte_at, + .set_pmd = xen_set_pmd, + + .pte_val = xen_pte_val, + .pgd_val = xen_pgd_val, + + .make_pte = xen_make_pte, + .make_pgd = xen_make_pgd, + +#ifdef CONFIG_X86_PAE + .set_pte_atomic = xen_set_pte_atomic, + .set_pte_present = xen_set_pte_at, + .set_pud = xen_set_pud, + .pte_clear = xen_pte_clear, + .pmd_clear = xen_pmd_clear, + + .make_pmd = xen_make_pmd, + .pmd_val = xen_pmd_val, +#endif /* PAE */ + + .activate_mm = xen_activate_mm, + .dup_mmap = xen_dup_mmap, + .exit_mmap = xen_exit_mmap, + + .set_lazy_mode = xen_set_lazy_mode, +}; + +#ifdef CONFIG_SMP +static const struct smp_ops xen_smp_ops __initdata = { + .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, + .smp_prepare_cpus = xen_smp_prepare_cpus, + .cpu_up = xen_cpu_up, + .smp_cpus_done = xen_smp_cpus_done, + + .smp_send_stop = xen_smp_send_stop, + .smp_send_reschedule = xen_smp_send_reschedule, + .smp_call_function_mask = xen_smp_call_function_mask, +}; +#endif /* CONFIG_SMP */ + +static void xen_reboot(int reason) +{ +#ifdef CONFIG_SMP + smp_send_stop(); +#endif + + if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason)) + BUG(); +} + +static void xen_restart(char *msg) +{ + xen_reboot(SHUTDOWN_reboot); +} + +static void xen_emergency_restart(void) +{ + xen_reboot(SHUTDOWN_reboot); +} + +static void xen_machine_halt(void) +{ + xen_reboot(SHUTDOWN_poweroff); +} + +static void xen_crash_shutdown(struct pt_regs *regs) +{ + xen_reboot(SHUTDOWN_crash); +} + +static const struct machine_ops __initdata xen_machine_ops = { + .restart = xen_restart, + .halt = xen_machine_halt, + .power_off = xen_machine_halt, + .shutdown = xen_machine_halt, + .crash_shutdown = xen_crash_shutdown, + .emergency_restart = xen_emergency_restart, +}; + + +/* First C function to be called on Xen boot */ +asmlinkage void __init xen_start_kernel(void) +{ + pgd_t *pgd; + + if (!xen_start_info) + return; + + BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0); + + /* Install Xen paravirt ops */ + paravirt_ops = xen_paravirt_ops; + machine_ops = xen_machine_ops; + +#ifdef CONFIG_SMP + smp_ops = xen_smp_ops; +#endif + + xen_setup_features(); + + /* Get mfn list */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) + phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list; + + pgd = (pgd_t *)xen_start_info->pt_base; + + init_pg_tables_end = __pa(pgd) + 
xen_start_info->nr_pt_frames*PAGE_SIZE; + + init_mm.pgd = pgd; /* use the Xen pagetables to start */ + + /* keep using Xen gdt for now; no urgent need to change it */ + + x86_write_percpu(xen_cr3, __pa(pgd)); + +#ifdef CONFIG_SMP + /* Don't do the full vcpu_info placement stuff until we have a + possible map. */ + per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; +#else + /* May as well do it now, since there's no good time to call + it later on UP. */ + xen_setup_vcpu_info_placement(); +#endif + + paravirt_ops.kernel_rpl = 1; + if (xen_feature(XENFEAT_supervisor_mode_kernel)) + paravirt_ops.kernel_rpl = 0; + + /* set the limit of our address space */ + reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE); + + /* set up basic CPUID stuff */ + cpu_detect(&new_cpu_data); + new_cpu_data.hard_math = 1; + new_cpu_data.x86_capability[0] = cpuid_edx(1); + + /* Poke various useful things into boot_params */ + LOADER_TYPE = (9 << 4) | 0; + INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0; + INITRD_SIZE = xen_start_info->mod_len; + + /* Start the world */ + start_kernel(); +} diff --git a/arch/i386/xen/events.c b/arch/i386/xen/events.c new file mode 100644 index 000000000000..da1b173547a1 --- /dev/null +++ b/arch/i386/xen/events.c @@ -0,0 +1,591 @@ +/* + * Xen event channels + * + * Xen models interrupts with abstract event channels. Because each + * domain gets 1024 event channels, but NR_IRQ is not that large, we + * must dynamically map irqs<->event channels. The event channels + * interface with the rest of the kernel by defining a xen interrupt + * chip. When an event is recieved, it is mapped to an irq and sent + * through the normal interrupt processing path. + * + * There are four kinds of events which can be mapped to an event + * channel: + * + * 1. Inter-domain notifications. This includes all the virtual + * device events, since they're driven by front-ends in another domain + * (typically dom0). + * 2. VIRQs, typically used for timers. These are per-cpu events. + * 3. IPIs. + * 4. Hardware interrupts. Not supported at present. + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/string.h> + +#include <asm/ptrace.h> +#include <asm/irq.h> +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> + +#include "xen-ops.h" + +/* + * This lock protects updates to the following mapping and reference-count + * arrays. The lock does not need to be acquired to read the mapping tables. + */ +static DEFINE_SPINLOCK(irq_mapping_update_lock); + +/* IRQ <-> VIRQ mapping. */ +static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; + +/* IRQ <-> IPI mapping */ +static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; + +/* Packed IRQ information: binding type, sub-type index, and event channel. */ +struct packed_irq +{ + unsigned short evtchn; + unsigned char index; + unsigned char type; +}; + +static struct packed_irq irq_info[NR_IRQS]; + +/* Binding types. */ +enum { + IRQT_UNBOUND, + IRQT_PIRQ, + IRQT_VIRQ, + IRQT_IPI, + IRQT_EVTCHN +}; + +/* Convenient shorthand for packed representation of an unbound IRQ. 
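+ * For example (values chosen purely for illustration), a timer VIRQ bound
+ * on this vcpu to event channel 3 ends up recorded as
+ *	irq_info[irq] = mk_irq_info(IRQT_VIRQ, VIRQ_TIMER, 3);
+ * while an unused slot keeps the IRQ_UNBOUND value defined below.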
*/ +#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) + +static int evtchn_to_irq[NR_EVENT_CHANNELS] = { + [0 ... NR_EVENT_CHANNELS-1] = -1 +}; +static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; +static u8 cpu_evtchn[NR_EVENT_CHANNELS]; + +/* Reference counts for bindings to IRQs. */ +static int irq_bindcount[NR_IRQS]; + +/* Xen will never allocate port zero for any purpose. */ +#define VALID_EVTCHN(chn) ((chn) != 0) + +/* + * Force a proper event-channel callback from Xen after clearing the + * callback mask. We do this in a very simple manner, by making a call + * down into Xen. The pending flag will be checked by Xen on return. + */ +void force_evtchn_callback(void) +{ + (void)HYPERVISOR_xen_version(0, NULL); +} +EXPORT_SYMBOL_GPL(force_evtchn_callback); + +static struct irq_chip xen_dynamic_chip; + +/* Constructor for packed IRQ information. */ +static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn) +{ + return (struct packed_irq) { evtchn, index, type }; +} + +/* + * Accessors for packed IRQ information. + */ +static inline unsigned int evtchn_from_irq(int irq) +{ + return irq_info[irq].evtchn; +} + +static inline unsigned int index_from_irq(int irq) +{ + return irq_info[irq].index; +} + +static inline unsigned int type_from_irq(int irq) +{ + return irq_info[irq].type; +} + +static inline unsigned long active_evtchns(unsigned int cpu, + struct shared_info *sh, + unsigned int idx) +{ + return (sh->evtchn_pending[idx] & + cpu_evtchn_mask[cpu][idx] & + ~sh->evtchn_mask[idx]); +} + +static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) +{ + int irq = evtchn_to_irq[chn]; + + BUG_ON(irq == -1); +#ifdef CONFIG_SMP + irq_desc[irq].affinity = cpumask_of_cpu(cpu); +#endif + + __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]); + __set_bit(chn, cpu_evtchn_mask[cpu]); + + cpu_evtchn[chn] = cpu; +} + +static void init_evtchn_cpu_bindings(void) +{ +#ifdef CONFIG_SMP + int i; + /* By default all event channels notify CPU#0. */ + for (i = 0; i < NR_IRQS; i++) + irq_desc[i].affinity = cpumask_of_cpu(0); +#endif + + memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); + memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); +} + +static inline unsigned int cpu_from_evtchn(unsigned int evtchn) +{ + return cpu_evtchn[evtchn]; +} + +static inline void clear_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_clear_bit(port, &s->evtchn_pending[0]); +} + +static inline void set_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, &s->evtchn_pending[0]); +} + + +/** + * notify_remote_via_irq - send event to remote end of event channel via irq + * @irq: irq of event channel to send event to + * + * Unlike notify_remote_via_evtchn(), this is safe to use across + * save/restore. Notifications on a broken connection are silently + * dropped. + */ +void notify_remote_via_irq(int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + notify_remote_via_evtchn(evtchn); +} +EXPORT_SYMBOL_GPL(notify_remote_via_irq); + +static void mask_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, &s->evtchn_mask[0]); +} + +static void unmask_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + unsigned int cpu = get_cpu(); + + BUG_ON(!irqs_disabled()); + + /* Slow path (hypercall) if this is a non-local port. 
*/ + if (unlikely(cpu != cpu_from_evtchn(port))) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } else { + struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); + + sync_clear_bit(port, &s->evtchn_mask[0]); + + /* + * The following is basically the equivalent of + * 'hw_resend_irq'. Just like a real IO-APIC we 'lose + * the interrupt edge' if the channel is masked. + */ + if (sync_test_bit(port, &s->evtchn_pending[0]) && + !sync_test_and_set_bit(port / BITS_PER_LONG, + &vcpu_info->evtchn_pending_sel)) + vcpu_info->evtchn_upcall_pending = 1; + } + + put_cpu(); +} + +static int find_unbound_irq(void) +{ + int irq; + + /* Only allocate from dynirq range */ + for (irq = 0; irq < NR_IRQS; irq++) + if (irq_bindcount[irq] == 0) + break; + + if (irq == NR_IRQS) + panic("No available IRQ to bind to: increase NR_IRQS!\n"); + + return irq; +} + +int bind_evtchn_to_irq(unsigned int evtchn) +{ + int irq; + + spin_lock(&irq_mapping_update_lock); + + irq = evtchn_to_irq[evtchn]; + + if (irq == -1) { + irq = find_unbound_irq(); + + dynamic_irq_init(irq); + set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, + handle_level_irq, "event"); + + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn); + } + + irq_bindcount[irq]++; + + spin_unlock(&irq_mapping_update_lock); + + return irq; +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); + +static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) +{ + struct evtchn_bind_ipi bind_ipi; + int evtchn, irq; + + spin_lock(&irq_mapping_update_lock); + + irq = per_cpu(ipi_to_irq, cpu)[ipi]; + if (irq == -1) { + irq = find_unbound_irq(); + if (irq < 0) + goto out; + + dynamic_irq_init(irq); + set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, + handle_level_irq, "ipi"); + + bind_ipi.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, + &bind_ipi) != 0) + BUG(); + evtchn = bind_ipi.port; + + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); + + per_cpu(ipi_to_irq, cpu)[ipi] = irq; + + bind_evtchn_to_cpu(evtchn, cpu); + } + + irq_bindcount[irq]++; + + out: + spin_unlock(&irq_mapping_update_lock); + return irq; +} + + +static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) +{ + struct evtchn_bind_virq bind_virq; + int evtchn, irq; + + spin_lock(&irq_mapping_update_lock); + + irq = per_cpu(virq_to_irq, cpu)[virq]; + + if (irq == -1) { + bind_virq.virq = virq; + bind_virq.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq) != 0) + BUG(); + evtchn = bind_virq.port; + + irq = find_unbound_irq(); + + dynamic_irq_init(irq); + set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, + handle_level_irq, "virq"); + + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); + + per_cpu(virq_to_irq, cpu)[virq] = irq; + + bind_evtchn_to_cpu(evtchn, cpu); + } + + irq_bindcount[irq]++; + + spin_unlock(&irq_mapping_update_lock); + + return irq; +} + +static void unbind_from_irq(unsigned int irq) +{ + struct evtchn_close close; + int evtchn = evtchn_from_irq(irq); + + spin_lock(&irq_mapping_update_lock); + + if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) { + close.port = evtchn; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); + + switch (type_from_irq(irq)) { + case IRQT_VIRQ: + per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) + [index_from_irq(irq)] = -1; + break; + default: + break; + } + + /* Closed ports are implicitly re-bound to VCPU0. 
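+	   (Typical usage of the public helpers defined below; the handler
+	   name and cookie are placeholders for illustration:
+		irq = bind_evtchn_to_irqhandler(evtchn, my_handler, 0,
+						"mydev", dev);
+		...
+		unbind_from_irqhandler(irq, dev);
+	   which pairs request_irq()/free_irq() with this bind/unbind path.)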
*/ + bind_evtchn_to_cpu(evtchn, 0); + + evtchn_to_irq[evtchn] = -1; + irq_info[irq] = IRQ_UNBOUND; + + dynamic_irq_init(irq); + } + + spin_unlock(&irq_mapping_update_lock); +} + +int bind_evtchn_to_irqhandler(unsigned int evtchn, + irqreturn_t (*handler)(int, void *), + unsigned long irqflags, + const char *devname, void *dev_id) +{ + unsigned int irq; + int retval; + + irq = bind_evtchn_to_irq(evtchn); + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); + +int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, + irqreturn_t (*handler)(int, void *), + unsigned long irqflags, const char *devname, void *dev_id) +{ + unsigned int irq; + int retval; + + irq = bind_virq_to_irq(virq, cpu); + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); + +int bind_ipi_to_irqhandler(enum ipi_vector ipi, + unsigned int cpu, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + int irq, retval; + + irq = bind_ipi_to_irq(ipi, cpu); + if (irq < 0) + return irq; + + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} + +void unbind_from_irqhandler(unsigned int irq, void *dev_id) +{ + free_irq(irq, dev_id); + unbind_from_irq(irq); +} +EXPORT_SYMBOL_GPL(unbind_from_irqhandler); + +void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) +{ + int irq = per_cpu(ipi_to_irq, cpu)[vector]; + BUG_ON(irq < 0); + notify_remote_via_irq(irq); +} + + +/* + * Search the CPUs pending events bitmasks. For each one found, map + * the event number to an irq, and feed it into do_IRQ() for + * handling. + * + * Xen uses a two-level bitmap to speed searching. The first level is + * a bitset of words which contain pending event bits. The second + * level is a bitset of pending events themselves. + */ +fastcall void xen_evtchn_do_upcall(struct pt_regs *regs) +{ + int cpu = get_cpu(); + struct shared_info *s = HYPERVISOR_shared_info; + struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); + unsigned long pending_words; + + vcpu_info->evtchn_upcall_pending = 0; + + /* NB. No need for a barrier here -- XCHG is a barrier on x86. */ + pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); + while (pending_words != 0) { + unsigned long pending_bits; + int word_idx = __ffs(pending_words); + pending_words &= ~(1UL << word_idx); + + while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) { + int bit_idx = __ffs(pending_bits); + int port = (word_idx * BITS_PER_LONG) + bit_idx; + int irq = evtchn_to_irq[port]; + + if (irq != -1) { + regs->orig_eax = ~irq; + do_IRQ(regs); + } + } + } + + put_cpu(); +} + +/* Rebind an evtchn so that it gets delivered to a specific cpu */ +static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) +{ + struct evtchn_bind_vcpu bind_vcpu; + int evtchn = evtchn_from_irq(irq); + + if (!VALID_EVTCHN(evtchn)) + return; + + /* Send future instances of this interrupt to other vcpu. */ + bind_vcpu.port = evtchn; + bind_vcpu.vcpu = tcpu; + + /* + * If this fails, it usually just indicates that we're dealing with a + * virq or IPI channel, which don't actually need to be rebound. Ignore + * it, but don't do the xenlinux-level rebind in that case. 
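+ *
+ * (Worked example for the two-level pending-event scan in
+ * xen_evtchn_do_upcall() above, with made-up numbers: if BITS_PER_LONG
+ * is 32, evtchn_pending_sel has bit 2 set and evtchn_pending[2] == 0x9,
+ * then ports 2*32+0 = 64 and 2*32+3 = 67 are handled, in that order,
+ * provided they are unmasked and routed to this cpu.)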
+ */ + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) + bind_evtchn_to_cpu(evtchn, tcpu); +} + + +static void set_affinity_irq(unsigned irq, cpumask_t dest) +{ + unsigned tcpu = first_cpu(dest); + rebind_irq_to_cpu(irq, tcpu); +} + +static void enable_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + unmask_evtchn(evtchn); +} + +static void disable_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + mask_evtchn(evtchn); +} + +static void ack_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + move_native_irq(irq); + + if (VALID_EVTCHN(evtchn)) + clear_evtchn(evtchn); +} + +static int retrigger_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + int ret = 0; + + if (VALID_EVTCHN(evtchn)) { + set_evtchn(evtchn); + ret = 1; + } + + return ret; +} + +static struct irq_chip xen_dynamic_chip __read_mostly = { + .name = "xen-dyn", + .mask = disable_dynirq, + .unmask = enable_dynirq, + .ack = ack_dynirq, + .set_affinity = set_affinity_irq, + .retrigger = retrigger_dynirq, +}; + +void __init xen_init_IRQ(void) +{ + int i; + + init_evtchn_cpu_bindings(); + + /* No event channels are 'live' right now. */ + for (i = 0; i < NR_EVENT_CHANNELS; i++) + mask_evtchn(i); + + /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ + for (i = 0; i < NR_IRQS; i++) + irq_bindcount[i] = 0; + + irq_ctx_init(smp_processor_id()); +} diff --git a/arch/i386/xen/features.c b/arch/i386/xen/features.c new file mode 100644 index 000000000000..0707714e40d6 --- /dev/null +++ b/arch/i386/xen/features.c @@ -0,0 +1,29 @@ +/****************************************************************************** + * features.c + * + * Xen feature flags. + * + * Copyright (c) 2006, Ian Campbell, XenSource Inc. + */ +#include <linux/types.h> +#include <linux/cache.h> +#include <linux/module.h> +#include <asm/xen/hypervisor.h> +#include <xen/features.h> + +u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; +EXPORT_SYMBOL_GPL(xen_features); + +void xen_setup_features(void) +{ + struct xen_feature_info fi; + int i, j; + + for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) { + fi.submap_idx = i; + if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0) + break; + for (j = 0; j < 32; j++) + xen_features[i * 32 + j] = !!(fi.submap & 1<<j); + } +} diff --git a/arch/i386/xen/manage.c b/arch/i386/xen/manage.c new file mode 100644 index 000000000000..aa7af9e6abc0 --- /dev/null +++ b/arch/i386/xen/manage.c @@ -0,0 +1,143 @@ +/* + * Handle extern requests for shutdown, reboot and sysrq + */ +#include <linux/kernel.h> +#include <linux/err.h> +#include <linux/reboot.h> +#include <linux/sysrq.h> + +#include <xen/xenbus.h> + +#define SHUTDOWN_INVALID -1 +#define SHUTDOWN_POWEROFF 0 +#define SHUTDOWN_SUSPEND 2 +/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only + * report a crash, not be instructed to crash! + * HALT is the same as POWEROFF, as far as we're concerned. The tools use + * the distinction when we return the reason code to them. + */ +#define SHUTDOWN_HALT 4 + +/* Ignore multiple shutdown requests. 
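+ * The "control/shutdown" xenstore node watched below is written by the
+ * dom0 toolstack with values such as "poweroff", "halt" or "reboot";
+ * shutdown_handler() reads the value, clears the node and then acts on it.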
*/ +static int shutting_down = SHUTDOWN_INVALID; + +static void shutdown_handler(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + char *str; + struct xenbus_transaction xbt; + int err; + + if (shutting_down != SHUTDOWN_INVALID) + return; + + again: + err = xenbus_transaction_start(&xbt); + if (err) + return; + + str = (char *)xenbus_read(xbt, "control", "shutdown", NULL); + /* Ignore read errors and empty reads. */ + if (XENBUS_IS_ERR_READ(str)) { + xenbus_transaction_end(xbt, 1); + return; + } + + xenbus_write(xbt, "control", "shutdown", ""); + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) { + kfree(str); + goto again; + } + + if (strcmp(str, "poweroff") == 0 || + strcmp(str, "halt") == 0) + orderly_poweroff(false); + else if (strcmp(str, "reboot") == 0) + ctrl_alt_del(); + else { + printk(KERN_INFO "Ignoring shutdown request: %s\n", str); + shutting_down = SHUTDOWN_INVALID; + } + + kfree(str); +} + +static void sysrq_handler(struct xenbus_watch *watch, const char **vec, + unsigned int len) +{ + char sysrq_key = '\0'; + struct xenbus_transaction xbt; + int err; + + again: + err = xenbus_transaction_start(&xbt); + if (err) + return; + if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) { + printk(KERN_ERR "Unable to read sysrq code in " + "control/sysrq\n"); + xenbus_transaction_end(xbt, 1); + return; + } + + if (sysrq_key != '\0') + xenbus_printf(xbt, "control", "sysrq", "%c", '\0'); + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) + goto again; + + if (sysrq_key != '\0') + handle_sysrq(sysrq_key, NULL); +} + +static struct xenbus_watch shutdown_watch = { + .node = "control/shutdown", + .callback = shutdown_handler +}; + +static struct xenbus_watch sysrq_watch = { + .node = "control/sysrq", + .callback = sysrq_handler +}; + +static int setup_shutdown_watcher(void) +{ + int err; + + err = register_xenbus_watch(&shutdown_watch); + if (err) { + printk(KERN_ERR "Failed to set shutdown watcher\n"); + return err; + } + + err = register_xenbus_watch(&sysrq_watch); + if (err) { + printk(KERN_ERR "Failed to set sysrq watcher\n"); + return err; + } + + return 0; +} + +static int shutdown_event(struct notifier_block *notifier, + unsigned long event, + void *data) +{ + setup_shutdown_watcher(); + return NOTIFY_DONE; +} + +static int __init setup_shutdown_event(void) +{ + static struct notifier_block xenstore_notifier = { + .notifier_call = shutdown_event + }; + register_xenstore_notifier(&xenstore_notifier); + + return 0; +} + +subsys_initcall(setup_shutdown_event); diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c new file mode 100644 index 000000000000..4ae038aa6c24 --- /dev/null +++ b/arch/i386/xen/mmu.c @@ -0,0 +1,564 @@ +/* + * Xen mmu operations + * + * This file contains the various mmu fetch and update operations. + * The most important job they must perform is the mapping between the + * domain's pfn and the overall machine mfns. + * + * Xen allows guests to directly update the pagetable, in a controlled + * fashion. In other words, the guest modifies the same pagetable + * that the CPU actually uses, which eliminates the overhead of having + * a separate shadow pagetable. + * + * In order to allow this, it falls on the guest domain to map its + * notion of a "physical" pfn - which is just a domain-local linear + * address - into a real "machine address" which the CPU's MMU can + * use. + * + * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be + * inserted directly into the pagetable. 
When creating a new + * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely, + * when reading the content back with __(pgd|pmd|pte)_val, it converts + * the mfn back into a pfn. + * + * The other constraint is that all pages which make up a pagetable + * must be mapped read-only in the guest. This prevents uncontrolled + * guest updates to the pagetable. Xen strictly enforces this, and + * will disallow any pagetable update which will end up mapping a + * pagetable page RW, and will disallow using any writable page as a + * pagetable. + * + * Naively, when loading %cr3 with the base of a new pagetable, Xen + * would need to validate the whole pagetable before going on. + * Naturally, this is quite slow. The solution is to "pin" a + * pagetable, which enforces all the constraints on the pagetable even + * when it is not actively in use. This menas that Xen can be assured + * that it is still valid when you do load it into %cr3, and doesn't + * need to revalidate it. + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ +#include <linux/sched.h> +#include <linux/highmem.h> +#include <linux/bug.h> +#include <linux/sched.h> + +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> +#include <asm/paravirt.h> + +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/page.h> +#include <xen/interface/xen.h> + +#include "multicalls.h" +#include "mmu.h" + +xmaddr_t arbitrary_virt_to_machine(unsigned long address) +{ + pte_t *pte = lookup_address(address); + unsigned offset = address & PAGE_MASK; + + BUG_ON(pte == NULL); + + return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); +} + +void make_lowmem_page_readonly(void *vaddr) +{ + pte_t *pte, ptev; + unsigned long address = (unsigned long)vaddr; + + pte = lookup_address(address); + BUG_ON(pte == NULL); + + ptev = pte_wrprotect(*pte); + + if (HYPERVISOR_update_va_mapping(address, ptev, 0)) + BUG(); +} + +void make_lowmem_page_readwrite(void *vaddr) +{ + pte_t *pte, ptev; + unsigned long address = (unsigned long)vaddr; + + pte = lookup_address(address); + BUG_ON(pte == NULL); + + ptev = pte_mkwrite(*pte); + + if (HYPERVISOR_update_va_mapping(address, ptev, 0)) + BUG(); +} + + +void xen_set_pmd(pmd_t *ptr, pmd_t val) +{ + struct multicall_space mcs; + struct mmu_update *u; + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*u)); + u = mcs.args; + u->ptr = virt_to_machine(ptr).maddr; + u->val = pmd_val_ma(val); + MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +/* + * Associate a virtual page frame with a given physical page frame + * and protection flags for that frame. + */ +void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = swapper_pg_dir + pgd_index(vaddr); + if (pgd_none(*pgd)) { + BUG(); + return; + } + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + BUG(); + return; + } + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + BUG(); + return; + } + pte = pte_offset_kernel(pmd, vaddr); + /* <mfn,flags> stored as-is, to permit clearing entries */ + xen_set_pte(pte, mfn_pte(mfn, flags)); + + /* + * It's enough to flush this one mapping. 
+ * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval) +{ + if (mm == current->mm || mm == &init_mm) { + if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) { + struct multicall_space mcs; + mcs = xen_mc_entry(0); + + MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); + xen_mc_issue(PARAVIRT_LAZY_MMU); + return; + } else + if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0) + return; + } + xen_set_pte(ptep, pteval); +} + +#ifdef CONFIG_X86_PAE +void xen_set_pud(pud_t *ptr, pud_t val) +{ + struct multicall_space mcs; + struct mmu_update *u; + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*u)); + u = mcs.args; + u->ptr = virt_to_machine(ptr).maddr; + u->val = pud_val_ma(val); + MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +void xen_set_pte(pte_t *ptep, pte_t pte) +{ + ptep->pte_high = pte.pte_high; + smp_wmb(); + ptep->pte_low = pte.pte_low; +} + +void xen_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + set_64bit((u64 *)ptep, pte_val_ma(pte)); +} + +void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + ptep->pte_low = 0; + smp_wmb(); /* make sure low gets written first */ + ptep->pte_high = 0; +} + +void xen_pmd_clear(pmd_t *pmdp) +{ + xen_set_pmd(pmdp, __pmd(0)); +} + +unsigned long long xen_pte_val(pte_t pte) +{ + unsigned long long ret = 0; + + if (pte.pte_low) { + ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low; + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + } + + return ret; +} + +unsigned long long xen_pmd_val(pmd_t pmd) +{ + unsigned long long ret = pmd.pmd; + if (ret) + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + return ret; +} + +unsigned long long xen_pgd_val(pgd_t pgd) +{ + unsigned long long ret = pgd.pgd; + if (ret) + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + return ret; +} + +pte_t xen_make_pte(unsigned long long pte) +{ + if (pte & 1) + pte = phys_to_machine(XPADDR(pte)).maddr; + + return (pte_t){ pte, pte >> 32 }; +} + +pmd_t xen_make_pmd(unsigned long long pmd) +{ + if (pmd & 1) + pmd = phys_to_machine(XPADDR(pmd)).maddr; + + return (pmd_t){ pmd }; +} + +pgd_t xen_make_pgd(unsigned long long pgd) +{ + if (pgd & _PAGE_PRESENT) + pgd = phys_to_machine(XPADDR(pgd)).maddr; + + return (pgd_t){ pgd }; +} +#else /* !PAE */ +void xen_set_pte(pte_t *ptep, pte_t pte) +{ + *ptep = pte; +} + +unsigned long xen_pte_val(pte_t pte) +{ + unsigned long ret = pte.pte_low; + + if (ret & _PAGE_PRESENT) + ret = machine_to_phys(XMADDR(ret)).paddr; + + return ret; +} + +unsigned long xen_pgd_val(pgd_t pgd) +{ + unsigned long ret = pgd.pgd; + if (ret) + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + return ret; +} + +pte_t xen_make_pte(unsigned long pte) +{ + if (pte & _PAGE_PRESENT) + pte = phys_to_machine(XPADDR(pte)).maddr; + + return (pte_t){ pte }; +} + +pgd_t xen_make_pgd(unsigned long pgd) +{ + if (pgd & _PAGE_PRESENT) + pgd = phys_to_machine(XPADDR(pgd)).maddr; + + return (pgd_t){ pgd }; +} +#endif /* CONFIG_X86_PAE */ + + + +/* + (Yet another) pagetable walker. This one is intended for pinning a + pagetable. This means that it walks a pagetable and calls the + callback function on each page it finds making up the page table, + at every level. It walks the entire pagetable, but it only bothers + pinning pte pages which are below pte_limit. In the normal case + this will be TASK_SIZE, but at boot we need to pin up to + FIXADDR_TOP. 
But the important bit is that we don't pin beyond + there, because then we start getting into Xen's ptes. +*/ +static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), + unsigned long limit) +{ + pgd_t *pgd = pgd_base; + int flush = 0; + unsigned long addr = 0; + unsigned long pgd_next; + + BUG_ON(limit > FIXADDR_TOP); + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { + pud_t *pud; + unsigned long pud_limit, pud_next; + + pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); + + if (!pgd_val(*pgd)) + continue; + + pud = pud_offset(pgd, 0); + + if (PTRS_PER_PUD > 1) /* not folded */ + flush |= (*func)(virt_to_page(pud), 0); + + for (; addr != pud_limit; pud++, addr = pud_next) { + pmd_t *pmd; + unsigned long pmd_limit; + + pud_next = pud_addr_end(addr, pud_limit); + + if (pud_next < limit) + pmd_limit = pud_next; + else + pmd_limit = limit; + + if (pud_none(*pud)) + continue; + + pmd = pmd_offset(pud, 0); + + if (PTRS_PER_PMD > 1) /* not folded */ + flush |= (*func)(virt_to_page(pmd), 0); + + for (; addr != pmd_limit; pmd++) { + addr += (PAGE_SIZE * PTRS_PER_PTE); + if ((pmd_limit-1) < (addr-1)) { + addr = pmd_limit; + break; + } + + if (pmd_none(*pmd)) + continue; + + flush |= (*func)(pmd_page(*pmd), 0); + } + } + } + + flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH); + + return flush; +} + +static int pin_page(struct page *page, unsigned flags) +{ + unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags); + int flush; + + if (pgfl) + flush = 0; /* already pinned */ + else if (PageHighMem(page)) + /* kmaps need flushing if we found an unpinned + highpage */ + flush = 1; + else { + void *pt = lowmem_page_address(page); + unsigned long pfn = page_to_pfn(page); + struct multicall_space mcs = __xen_mc_entry(0); + + flush = 0; + + MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, + pfn_pte(pfn, PAGE_KERNEL_RO), + flags); + } + + return flush; +} + +/* This is called just after a mm has been created, but it has not + been used yet. We need to make sure that its pagetable is all + read-only, and can be pinned. */ +void xen_pgd_pin(pgd_t *pgd) +{ + struct multicall_space mcs; + struct mmuext_op *op; + + xen_mc_batch(); + + if (pgd_walk(pgd, pin_page, TASK_SIZE)) { + /* re-enable interrupts for kmap_flush_unused */ + xen_mc_issue(0); + kmap_flush_unused(); + xen_mc_batch(); + } + + mcs = __xen_mc_entry(sizeof(*op)); + op = mcs.args; + +#ifdef CONFIG_X86_PAE + op->cmd = MMUEXT_PIN_L3_TABLE; +#else + op->cmd = MMUEXT_PIN_L2_TABLE; +#endif + op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(0); +} + +/* The init_mm pagetable is really pinned as soon as its created, but + that's before we have page structures to store the bits. So do all + the book-keeping now. 
*/ +static __init int mark_pinned(struct page *page, unsigned flags) +{ + SetPagePinned(page); + return 0; +} + +void __init xen_mark_init_mm_pinned(void) +{ + pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); +} + +static int unpin_page(struct page *page, unsigned flags) +{ + unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags); + + if (pgfl && !PageHighMem(page)) { + void *pt = lowmem_page_address(page); + unsigned long pfn = page_to_pfn(page); + struct multicall_space mcs = __xen_mc_entry(0); + + MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, + pfn_pte(pfn, PAGE_KERNEL), + flags); + } + + return 0; /* never need to flush on unpin */ +} + +/* Release a pagetables pages back as normal RW */ +static void xen_pgd_unpin(pgd_t *pgd) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + xen_mc_batch(); + + mcs = __xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_UNPIN_TABLE; + op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + pgd_walk(pgd, unpin_page, TASK_SIZE); + + xen_mc_issue(0); +} + +void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) +{ + spin_lock(&next->page_table_lock); + xen_pgd_pin(next->pgd); + spin_unlock(&next->page_table_lock); +} + +void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +{ + spin_lock(&mm->page_table_lock); + xen_pgd_pin(mm->pgd); + spin_unlock(&mm->page_table_lock); +} + + +#ifdef CONFIG_SMP +/* Another cpu may still have their %cr3 pointing at the pagetable, so + we need to repoint it somewhere else before we can unpin it. */ +static void drop_other_mm_ref(void *info) +{ + struct mm_struct *mm = info; + + if (__get_cpu_var(cpu_tlbstate).active_mm == mm) + leave_mm(smp_processor_id()); +} + +static void drop_mm_ref(struct mm_struct *mm) +{ + if (current->active_mm == mm) { + if (current->mm == mm) + load_cr3(swapper_pg_dir); + else + leave_mm(smp_processor_id()); + } + + if (!cpus_empty(mm->cpu_vm_mask)) + xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref, + mm, 1); +} +#else +static void drop_mm_ref(struct mm_struct *mm) +{ + if (current->active_mm == mm) + load_cr3(swapper_pg_dir); +} +#endif + +/* + * While a process runs, Xen pins its pagetables, which means that the + * hypervisor forces it to be read-only, and it controls all updates + * to it. This means that all pagetable updates have to go via the + * hypervisor, which is moderately expensive. + * + * Since we're pulling the pagetable down, we switch to use init_mm, + * unpin old process pagetable and mark it all read-write, which + * allows further operations on it to be simple memory accesses. + * + * The only subtle point is that another CPU may be still using the + * pagetable because of lazy tlb flushing. This means we need need to + * switch all CPUs off this pagetable before we can unpin it. + */ +void xen_exit_mmap(struct mm_struct *mm) +{ + get_cpu(); /* make sure we don't move around */ + drop_mm_ref(mm); + put_cpu(); + + spin_lock(&mm->page_table_lock); + xen_pgd_unpin(mm->pgd); + spin_unlock(&mm->page_table_lock); +} diff --git a/arch/i386/xen/mmu.h b/arch/i386/xen/mmu.h new file mode 100644 index 000000000000..c9ff27f3ac3a --- /dev/null +++ b/arch/i386/xen/mmu.h @@ -0,0 +1,60 @@ +#ifndef _XEN_MMU_H + +#include <linux/linkage.h> +#include <asm/page.h> + +/* + * Page-directory addresses above 4GB do not fit into architectural %cr3. 
+ * When accessing %cr3, or equivalent field in vcpu_guest_context, guests + * must use the following accessor macros to pack/unpack valid MFNs. + * + * Note that Xen is using the fact that the pagetable base is always + * page-aligned, and putting the 12 MSB of the address into the 12 LSB + * of cr3. + */ +#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20)) +#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20)) + + +void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); + +void xen_set_pte(pte_t *ptep, pte_t pteval); +void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval); +void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); + +void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); +void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); +void xen_exit_mmap(struct mm_struct *mm); + +void xen_pgd_pin(pgd_t *pgd); +//void xen_pgd_unpin(pgd_t *pgd); + +#ifdef CONFIG_X86_PAE +unsigned long long xen_pte_val(pte_t); +unsigned long long xen_pmd_val(pmd_t); +unsigned long long xen_pgd_val(pgd_t); + +pte_t xen_make_pte(unsigned long long); +pmd_t xen_make_pmd(unsigned long long); +pgd_t xen_make_pgd(unsigned long long); + +void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval); +void xen_set_pte_atomic(pte_t *ptep, pte_t pte); +void xen_set_pud(pud_t *ptr, pud_t val); +void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +void xen_pmd_clear(pmd_t *pmdp); + + +#else +unsigned long xen_pte_val(pte_t); +unsigned long xen_pmd_val(pmd_t); +unsigned long xen_pgd_val(pgd_t); + +pte_t xen_make_pte(unsigned long); +pmd_t xen_make_pmd(unsigned long); +pgd_t xen_make_pgd(unsigned long); +#endif + +#endif /* _XEN_MMU_H */ diff --git a/arch/i386/xen/multicalls.c b/arch/i386/xen/multicalls.c new file mode 100644 index 000000000000..c837e8e463db --- /dev/null +++ b/arch/i386/xen/multicalls.c @@ -0,0 +1,90 @@ +/* + * Xen hypercall batching. + * + * Xen allows multiple hypercalls to be issued at once, using the + * multicall interface. This allows the cost of trapping into the + * hypervisor to be amortized over several calls. + * + * This file implements a simple interface for multicalls. There's a + * per-cpu buffer of outstanding multicalls. When you want to queue a + * multicall for issuing, you can allocate a multicall slot for the + * call and its arguments, along with storage for space which is + * pointed to by the arguments (for passing pointers to structures, + * etc). When the multicall is actually issued, all the space for the + * commands and allocated memory is freed for reuse. + * + * Multicalls are flushed whenever any of the buffers get full, or + * when explicitly requested. There's no way to get per-multicall + * return results back. It will BUG if any of the multicalls fail. 
+ * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ +#include <linux/percpu.h> +#include <linux/hardirq.h> + +#include <asm/xen/hypercall.h> + +#include "multicalls.h" + +#define MC_BATCH 32 +#define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) + +struct mc_buffer { + struct multicall_entry entries[MC_BATCH]; + u64 args[MC_ARGS]; + unsigned mcidx, argidx; +}; + +static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); +DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); + +void xen_mc_flush(void) +{ + struct mc_buffer *b = &__get_cpu_var(mc_buffer); + int ret = 0; + unsigned long flags; + + BUG_ON(preemptible()); + + /* Disable interrupts in case someone comes in and queues + something in the middle */ + local_irq_save(flags); + + if (b->mcidx) { + int i; + + if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0) + BUG(); + for (i = 0; i < b->mcidx; i++) + if (b->entries[i].result < 0) + ret++; + b->mcidx = 0; + b->argidx = 0; + } else + BUG_ON(b->argidx != 0); + + local_irq_restore(flags); + + BUG_ON(ret); +} + +struct multicall_space __xen_mc_entry(size_t args) +{ + struct mc_buffer *b = &__get_cpu_var(mc_buffer); + struct multicall_space ret; + unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64); + + BUG_ON(preemptible()); + BUG_ON(argspace > MC_ARGS); + + if (b->mcidx == MC_BATCH || + (b->argidx + argspace) > MC_ARGS) + xen_mc_flush(); + + ret.mc = &b->entries[b->mcidx]; + b->mcidx++; + ret.args = &b->args[b->argidx]; + b->argidx += argspace; + + return ret; +} diff --git a/arch/i386/xen/multicalls.h b/arch/i386/xen/multicalls.h new file mode 100644 index 000000000000..e6f7530b156c --- /dev/null +++ b/arch/i386/xen/multicalls.h @@ -0,0 +1,45 @@ +#ifndef _XEN_MULTICALLS_H +#define _XEN_MULTICALLS_H + +#include "xen-ops.h" + +/* Multicalls */ +struct multicall_space +{ + struct multicall_entry *mc; + void *args; +}; + +/* Allocate room for a multicall and its args */ +struct multicall_space __xen_mc_entry(size_t args); + +DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags); + +/* Call to start a batch of multiple __xen_mc_entry()s. Must be + paired with xen_mc_issue() */ +static inline void xen_mc_batch(void) +{ + /* need to disable interrupts until this entry is complete */ + local_irq_save(__get_cpu_var(xen_mc_irq_flags)); +} + +static inline struct multicall_space xen_mc_entry(size_t args) +{ + xen_mc_batch(); + return __xen_mc_entry(args); +} + +/* Flush all pending multicalls */ +void xen_mc_flush(void); + +/* Issue a multicall if we're not in a lazy mode */ +static inline void xen_mc_issue(unsigned mode) +{ + if ((xen_get_lazy_mode() & mode) == 0) + xen_mc_flush(); + + /* restore flags saved in xen_mc_batch */ + local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); +} + +#endif /* _XEN_MULTICALLS_H */ diff --git a/arch/i386/xen/setup.c b/arch/i386/xen/setup.c new file mode 100644 index 000000000000..f84e77226646 --- /dev/null +++ b/arch/i386/xen/setup.c @@ -0,0 +1,111 @@ +/* + * Machine specific setup for xen + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/pm.h> + +#include <asm/elf.h> +#include <asm/e820.h> +#include <asm/setup.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> + +#include <xen/interface/physdev.h> +#include <xen/features.h> + +#include "xen-ops.h" +#include "vdso.h" + +/* These are code, but not functions. 
Defined in entry.S */ +extern const char xen_hypervisor_callback[]; +extern const char xen_failsafe_callback[]; + +unsigned long *phys_to_machine_mapping; +EXPORT_SYMBOL(phys_to_machine_mapping); + +/** + * machine_specific_memory_setup - Hook for machine specific memory setup. + **/ + +char * __init xen_memory_setup(void) +{ + unsigned long max_pfn = xen_start_info->nr_pages; + + e820.nr_map = 0; + add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM); + + return "Xen"; +} + +static void xen_idle(void) +{ + local_irq_disable(); + + if (need_resched()) + local_irq_enable(); + else { + current_thread_info()->status &= ~TS_POLLING; + smp_mb__after_clear_bit(); + safe_halt(); + current_thread_info()->status |= TS_POLLING; + } +} + +/* + * Set the bit indicating "nosegneg" library variants should be used. + */ +static void fiddle_vdso(void) +{ + extern u32 VDSO_NOTE_MASK; /* See ../kernel/vsyscall-note.S. */ + extern char vsyscall_int80_start; + u32 *mask = (u32 *) ((unsigned long) &VDSO_NOTE_MASK - VDSO_PRELINK + + &vsyscall_int80_start); + *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; +} + +void __init xen_arch_setup(void) +{ + struct physdev_set_iopl set_iopl; + int rc; + + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); + + HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, + __KERNEL_CS, (unsigned long)xen_failsafe_callback); + + set_iopl.iopl = 1; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); + if (rc != 0) + printk(KERN_INFO "physdev_op failed %d\n", rc); + +#ifdef CONFIG_ACPI + if (!(xen_start_info->flags & SIF_INITDOMAIN)) { + printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); + disable_acpi(); + } +#endif + + memcpy(boot_command_line, xen_start_info->cmd_line, + MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? + COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); + + pm_idle = xen_idle; + +#ifdef CONFIG_SMP + /* fill cpus_possible with all available cpus */ + xen_fill_possible_map(); +#endif + + paravirt_disable_iospace(); + + fiddle_vdso(); +} diff --git a/arch/i386/xen/smp.c b/arch/i386/xen/smp.c new file mode 100644 index 000000000000..557b8e24706a --- /dev/null +++ b/arch/i386/xen/smp.c @@ -0,0 +1,404 @@ +/* + * Xen SMP support + * + * This file implements the Xen versions of smp_ops. SMP under Xen is + * very straightforward. Bringing a CPU up is simply a matter of + * loading its initial context and setting it running. + * + * IPIs are handled through the Xen event mechanism. + * + * Because virtual CPUs can be scheduled onto any real CPU, there's no + * useful topology information for the kernel to make use of. As a + * result, all CPUs are treated as if they're single-core and + * single-threaded. + * + * This does not handle HOTPLUG_CPU yet. + */ +#include <linux/sched.h> +#include <linux/err.h> +#include <linux/smp.h> + +#include <asm/paravirt.h> +#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/cpu.h> + +#include <xen/interface/xen.h> +#include <xen/interface/vcpu.h> + +#include <asm/xen/interface.h> +#include <asm/xen/hypercall.h> + +#include <xen/page.h> +#include <xen/events.h> + +#include "xen-ops.h" +#include "mmu.h" + +static cpumask_t cpu_initialized_map; +static DEFINE_PER_CPU(int, resched_irq); +static DEFINE_PER_CPU(int, callfunc_irq); + +/* + * Structure and data for smp_call_function(). 
This is designed to minimise + * static memory requirements. It also looks cleaner. + */ +static DEFINE_SPINLOCK(call_lock); + +struct call_data_struct { + void (*func) (void *info); + void *info; + atomic_t started; + atomic_t finished; + int wait; +}; + +static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); + +static struct call_data_struct *call_data; + +/* + * Reschedule call back. Nothing to do, + * all the work is done automatically when + * we return from the interrupt. + */ +static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) +{ + return IRQ_HANDLED; +} + +static __cpuinit void cpu_bringup_and_idle(void) +{ + int cpu = smp_processor_id(); + + cpu_init(); + + preempt_disable(); + per_cpu(cpu_state, cpu) = CPU_ONLINE; + + xen_setup_cpu_clockevents(); + + /* We can take interrupts now: we're officially "up". */ + local_irq_enable(); + + wmb(); /* make sure everything is out */ + cpu_idle(); +} + +static int xen_smp_intr_init(unsigned int cpu) +{ + int rc; + const char *resched_name, *callfunc_name; + + per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1; + + resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); + rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, + cpu, + xen_reschedule_interrupt, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + resched_name, + NULL); + if (rc < 0) + goto fail; + per_cpu(resched_irq, cpu) = rc; + + callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu); + rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, + cpu, + xen_call_function_interrupt, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + callfunc_name, + NULL); + if (rc < 0) + goto fail; + per_cpu(callfunc_irq, cpu) = rc; + + return 0; + + fail: + if (per_cpu(resched_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); + if (per_cpu(callfunc_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); + return rc; +} + +void __init xen_fill_possible_map(void) +{ + int i, rc; + + for (i = 0; i < NR_CPUS; i++) { + rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); + if (rc >= 0) + cpu_set(i, cpu_possible_map); + } +} + +void __init xen_smp_prepare_boot_cpu(void) +{ + int cpu; + + BUG_ON(smp_processor_id() != 0); + native_smp_prepare_boot_cpu(); + + /* We've switched to the "real" per-cpu gdt, so make sure the + old memory can be recycled */ + make_lowmem_page_readwrite(&per_cpu__gdt_page); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + cpus_clear(cpu_sibling_map[cpu]); + cpus_clear(cpu_core_map[cpu]); + } + + xen_setup_vcpu_info_placement(); +} + +void __init xen_smp_prepare_cpus(unsigned int max_cpus) +{ + unsigned cpu; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + cpus_clear(cpu_sibling_map[cpu]); + cpus_clear(cpu_core_map[cpu]); + } + + smp_store_cpu_info(0); + set_cpu_sibling_map(0); + + if (xen_smp_intr_init(0)) + BUG(); + + cpu_initialized_map = cpumask_of_cpu(0); + + /* Restrict the possible_map according to max_cpus. 
*/ + while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { + for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--) + continue; + cpu_clear(cpu, cpu_possible_map); + } + + for_each_possible_cpu (cpu) { + struct task_struct *idle; + + if (cpu == 0) + continue; + + idle = fork_idle(cpu); + if (IS_ERR(idle)) + panic("failed fork for CPU %d", cpu); + + cpu_set(cpu, cpu_present_map); + } + + //init_xenbus_allowed_cpumask(); +} + +static __cpuinit int +cpu_initialize_context(unsigned int cpu, struct task_struct *idle) +{ + struct vcpu_guest_context *ctxt; + struct gdt_page *gdt = &per_cpu(gdt_page, cpu); + + if (cpu_test_and_set(cpu, cpu_initialized_map)) + return 0; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (ctxt == NULL) + return -ENOMEM; + + ctxt->flags = VGCF_IN_KERNEL; + ctxt->user_regs.ds = __USER_DS; + ctxt->user_regs.es = __USER_DS; + ctxt->user_regs.fs = __KERNEL_PERCPU; + ctxt->user_regs.gs = 0; + ctxt->user_regs.ss = __KERNEL_DS; + ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; + ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ + + memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); + + xen_copy_trap_info(ctxt->trap_ctxt); + + ctxt->ldt_ents = 0; + + BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); + make_lowmem_page_readonly(gdt->gdt); + + ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); + ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); + + ctxt->user_regs.cs = __KERNEL_CS; + ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs); + + ctxt->kernel_ss = __KERNEL_DS; + ctxt->kernel_sp = idle->thread.esp0; + + ctxt->event_callback_cs = __KERNEL_CS; + ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; + ctxt->failsafe_callback_cs = __KERNEL_CS; + ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; + + per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); + ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); + + if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt)) + BUG(); + + kfree(ctxt); + return 0; +} + +int __cpuinit xen_cpu_up(unsigned int cpu) +{ + struct task_struct *idle = idle_task(cpu); + int rc; + +#if 0 + rc = cpu_up_check(cpu); + if (rc) + return rc; +#endif + + init_gdt(cpu); + per_cpu(current_task, cpu) = idle; + irq_ctx_init(cpu); + xen_setup_timer(cpu); + + /* make sure interrupts start blocked */ + per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; + + rc = cpu_initialize_context(cpu, idle); + if (rc) + return rc; + + if (num_online_cpus() == 1) + alternatives_smp_switch(1); + + rc = xen_smp_intr_init(cpu); + if (rc) + return rc; + + smp_store_cpu_info(cpu); + set_cpu_sibling_map(cpu); + /* This must be done before setting cpu_online_map */ + wmb(); + + cpu_set(cpu, cpu_online_map); + + rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); + BUG_ON(rc); + + return 0; +} + +void xen_smp_cpus_done(unsigned int max_cpus) +{ +} + +static void stop_self(void *v) +{ + int cpu = smp_processor_id(); + + /* make sure we're not pinning something down */ + load_cr3(swapper_pg_dir); + /* should set up a minimal gdt */ + + HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL); + BUG(); +} + +void xen_smp_send_stop(void) +{ + smp_call_function(stop_self, NULL, 0, 0); +} + +void xen_smp_send_reschedule(int cpu) +{ + xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); +} + + +static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) +{ + unsigned cpu; + + cpus_and(mask, mask, cpu_online_map); + + for_each_cpu_mask(cpu, mask) + xen_send_IPI_one(cpu, vector); +} + +static irqreturn_t xen_call_function_interrupt(int irq, 
void *dev_id) +{ + void (*func) (void *info) = call_data->func; + void *info = call_data->info; + int wait = call_data->wait; + + /* + * Notify initiating CPU that I've grabbed the data and am + * about to execute the function + */ + mb(); + atomic_inc(&call_data->started); + /* + * At this point the info structure may be out of scope unless wait==1 + */ + irq_enter(); + (*func)(info); + irq_exit(); + + if (wait) { + mb(); /* commit everything before setting finished */ + atomic_inc(&call_data->finished); + } + + return IRQ_HANDLED; +} + +int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), + void *info, int wait) +{ + struct call_data_struct data; + int cpus; + + /* Holding any lock stops cpus from going down. */ + spin_lock(&call_lock); + + cpu_clear(smp_processor_id(), mask); + + cpus = cpus_weight(mask); + if (!cpus) { + spin_unlock(&call_lock); + return 0; + } + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + mb(); /* write everything before IPI */ + + /* Send a message to other CPUs and wait for them to respond */ + xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); + + /* Make sure other vcpus get a chance to run. + XXX too severe? Maybe we should check the other CPU's states? */ + HYPERVISOR_sched_op(SCHEDOP_yield, 0); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus || + (wait && atomic_read(&data.finished) != cpus)) + cpu_relax(); + + spin_unlock(&call_lock); + + return 0; +} diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c new file mode 100644 index 000000000000..dfd6db69ead5 --- /dev/null +++ b/arch/i386/xen/time.c @@ -0,0 +1,593 @@ +/* + * Xen time implementation. + * + * This is implemented in terms of a clocksource driver which uses + * the hypervisor clock as a nanosecond timebase, and a clockevent + * driver which uses the hypervisor's timer mechanism. + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/clocksource.h> +#include <linux/clockchips.h> +#include <linux/kernel_stat.h> + +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> + +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/vcpu.h> + +#include "xen-ops.h" + +#define XEN_SHIFT 22 + +/* Xen may fire a timer up to this many ns early */ +#define TIMER_SLOP 100000 +#define NS_PER_TICK (1000000000LL / HZ) + +static cycle_t xen_clocksource_read(void); + +/* These are perodically updated in shared_info, and then copied here. */ +struct shadow_time_info { + u64 tsc_timestamp; /* TSC at last update of time vals. */ + u64 system_timestamp; /* Time, in nanosecs, since boot. 
*/ + u32 tsc_to_nsec_mul; + int tsc_shift; + u32 version; +}; + +static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); + +/* runstate info updated by Xen */ +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); + +/* snapshots of runstate info */ +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot); + +/* unused ns of stolen and blocked time */ +static DEFINE_PER_CPU(u64, residual_stolen); +static DEFINE_PER_CPU(u64, residual_blocked); + +/* return an consistent snapshot of 64-bit time/counter value */ +static u64 get64(const u64 *p) +{ + u64 ret; + + if (BITS_PER_LONG < 64) { + u32 *p32 = (u32 *)p; + u32 h, l; + + /* + * Read high then low, and then make sure high is + * still the same; this will only loop if low wraps + * and carries into high. + * XXX some clean way to make this endian-proof? + */ + do { + h = p32[1]; + barrier(); + l = p32[0]; + barrier(); + } while (p32[1] != h); + + ret = (((u64)h) << 32) | l; + } else + ret = *p; + + return ret; +} + +/* + * Runstate accounting + */ +static void get_runstate_snapshot(struct vcpu_runstate_info *res) +{ + u64 state_time; + struct vcpu_runstate_info *state; + + BUG_ON(preemptible()); + + state = &__get_cpu_var(runstate); + + /* + * The runstate info is always updated by the hypervisor on + * the current CPU, so there's no need to use anything + * stronger than a compiler barrier when fetching it. + */ + do { + state_time = get64(&state->state_entry_time); + barrier(); + *res = *state; + barrier(); + } while (get64(&state->state_entry_time) != state_time); +} + +static void setup_runstate_info(int cpu) +{ + struct vcpu_register_runstate_memory_area area; + + area.addr.v = &per_cpu(runstate, cpu); + + if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, + cpu, &area)) + BUG(); +} + +static void do_stolen_accounting(void) +{ + struct vcpu_runstate_info state; + struct vcpu_runstate_info *snap; + s64 blocked, runnable, offline, stolen; + cputime_t ticks; + + get_runstate_snapshot(&state); + + WARN_ON(state.state != RUNSTATE_running); + + snap = &__get_cpu_var(runstate_snapshot); + + /* work out how much time the VCPU has not been runn*ing* */ + blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]; + runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable]; + offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline]; + + *snap = state; + + /* Add the appropriate number of ticks of stolen time, + including any left-overs from last time. Passing NULL to + account_steal_time accounts the time as stolen. */ + stolen = runnable + offline + __get_cpu_var(residual_stolen); + + if (stolen < 0) + stolen = 0; + + ticks = 0; + while (stolen >= NS_PER_TICK) { + ticks++; + stolen -= NS_PER_TICK; + } + __get_cpu_var(residual_stolen) = stolen; + account_steal_time(NULL, ticks); + + /* Add the appropriate number of ticks of blocked time, + including any left-overs from last time. Passing idle to + account_steal_time accounts the time as idle/wait. */ + blocked += __get_cpu_var(residual_blocked); + + if (blocked < 0) + blocked = 0; + + ticks = 0; + while (blocked >= NS_PER_TICK) { + ticks++; + blocked -= NS_PER_TICK; + } + __get_cpu_var(residual_blocked) = blocked; + account_steal_time(idle_task(smp_processor_id()), ticks); +} + +/* + * Xen sched_clock implementation. Returns the number of unstolen + * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED + * states. 
+ */ +unsigned long long xen_sched_clock(void) +{ + struct vcpu_runstate_info state; + cycle_t now; + u64 ret; + s64 offset; + + /* + * Ideally sched_clock should be called on a per-cpu basis + * anyway, so preempt should already be disabled, but that's + * not current practice at the moment. + */ + preempt_disable(); + + now = xen_clocksource_read(); + + get_runstate_snapshot(&state); + + WARN_ON(state.state != RUNSTATE_running); + + offset = now - state.state_entry_time; + if (offset < 0) + offset = 0; + + ret = state.time[RUNSTATE_blocked] + + state.time[RUNSTATE_running] + + offset; + + preempt_enable(); + + return ret; +} + + +/* Get the CPU speed from Xen */ +unsigned long xen_cpu_khz(void) +{ + u64 cpu_khz = 1000000ULL << 32; + const struct vcpu_time_info *info = + &HYPERVISOR_shared_info->vcpu_info[0].time; + + do_div(cpu_khz, info->tsc_to_system_mul); + if (info->tsc_shift < 0) + cpu_khz <<= -info->tsc_shift; + else + cpu_khz >>= info->tsc_shift; + + return cpu_khz; +} + +/* + * Reads a consistent set of time-base values from Xen, into a shadow data + * area. + */ +static unsigned get_time_values_from_xen(void) +{ + struct vcpu_time_info *src; + struct shadow_time_info *dst; + + /* src is shared memory with the hypervisor, so we need to + make sure we get a consistent snapshot, even in the face of + being preempted. */ + src = &__get_cpu_var(xen_vcpu)->time; + dst = &__get_cpu_var(shadow_time); + + do { + dst->version = src->version; + rmb(); /* fetch version before data */ + dst->tsc_timestamp = src->tsc_timestamp; + dst->system_timestamp = src->system_time; + dst->tsc_to_nsec_mul = src->tsc_to_system_mul; + dst->tsc_shift = src->tsc_shift; + rmb(); /* test version after fetching data */ + } while ((src->version & 1) | (dst->version ^ src->version)); + + return dst->version; +} + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif __x86_64__ + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#else +#error implement me! 
+#endif + + return product; +} + +static u64 get_nsec_offset(struct shadow_time_info *shadow) +{ + u64 now, delta; + now = native_read_tsc(); + delta = now - shadow->tsc_timestamp; + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); +} + +static cycle_t xen_clocksource_read(void) +{ + struct shadow_time_info *shadow = &get_cpu_var(shadow_time); + cycle_t ret; + unsigned version; + + do { + version = get_time_values_from_xen(); + barrier(); + ret = shadow->system_timestamp + get_nsec_offset(shadow); + barrier(); + } while (version != __get_cpu_var(xen_vcpu)->time.version); + + put_cpu_var(shadow_time); + + return ret; +} + +static void xen_read_wallclock(struct timespec *ts) +{ + const struct shared_info *s = HYPERVISOR_shared_info; + u32 version; + u64 delta; + struct timespec now; + + /* get wallclock at system boot */ + do { + version = s->wc_version; + rmb(); /* fetch version before time */ + now.tv_sec = s->wc_sec; + now.tv_nsec = s->wc_nsec; + rmb(); /* fetch time before checking version */ + } while ((s->wc_version & 1) | (version ^ s->wc_version)); + + delta = xen_clocksource_read(); /* time since system boot */ + delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; + + now.tv_nsec = do_div(delta, NSEC_PER_SEC); + now.tv_sec = delta; + + set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); +} + +unsigned long xen_get_wallclock(void) +{ + struct timespec ts; + + xen_read_wallclock(&ts); + + return ts.tv_sec; +} + +int xen_set_wallclock(unsigned long now) +{ + /* do nothing for domU */ + return -1; +} + +static struct clocksource xen_clocksource __read_mostly = { + .name = "xen", + .rating = 400, + .read = xen_clocksource_read, + .mask = ~0, + .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */ + .shift = XEN_SHIFT, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +/* + Xen clockevent implementation + + Xen has two clockevent implementations: + + The old timer_op one works with all released versions of Xen prior + to version 3.0.4. This version of the hypervisor provides a + single-shot timer with nanosecond resolution. However, sharing the + same event channel is a 100Hz tick which is delivered while the + vcpu is running. We don't care about or use this tick, but it will + cause the core time code to think the timer fired too soon, and + will end up resetting it each time. It could be filtered, but + doing so has complications when the ktime clocksource is not yet + the xen clocksource (ie, at boot time). + + The new vcpu_op-based timer interface allows the tick timer period + to be changed or turned off. The tick timer is not useful as a + periodic timer because events are only delivered to running vcpus. + The one-shot timer can report when a timeout is in the past, so + set_next_event is capable of returning -ETIME when appropriate. + This interface is used when available. +*/ + + +/* + Get a hypervisor absolute time. In theory we could maintain an + offset between the kernel's time and the hypervisor's time, and + apply that to a kernel's absolute timeout. Unfortunately the + hypervisor and kernel times can drift even if the kernel is using + the Xen clocksource, because ntp can warp the kernel's clocksource. 
+*/ +static s64 get_abs_timeout(unsigned long delta) +{ + return xen_clocksource_read() + delta; +} + +static void xen_timerop_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + /* unsupported */ + WARN_ON(1); + break; + + case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_RESUME: + break; + + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + HYPERVISOR_set_timer_op(0); /* cancel timeout */ + break; + } +} + +static int xen_timerop_set_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); + + if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0) + BUG(); + + /* We may have missed the deadline, but there's no real way of + knowing for sure. If the event was in the past, then we'll + get an immediate interrupt. */ + + return 0; +} + +static const struct clock_event_device xen_timerop_clockevent = { + .name = "xen", + .features = CLOCK_EVT_FEAT_ONESHOT, + + .max_delta_ns = 0xffffffff, + .min_delta_ns = TIMER_SLOP, + + .mult = 1, + .shift = 0, + .rating = 500, + + .set_mode = xen_timerop_set_mode, + .set_next_event = xen_timerop_set_next_event, +}; + + + +static void xen_vcpuop_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + int cpu = smp_processor_id(); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + WARN_ON(1); /* unsupported */ + break; + + case CLOCK_EVT_MODE_ONESHOT: + if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) + BUG(); + break; + + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) || + HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) + BUG(); + break; + case CLOCK_EVT_MODE_RESUME: + break; + } +} + +static int xen_vcpuop_set_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + int cpu = smp_processor_id(); + struct vcpu_set_singleshot_timer single; + int ret; + + WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); + + single.timeout_abs_ns = get_abs_timeout(delta); + single.flags = VCPU_SSHOTTMR_future; + + ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single); + + BUG_ON(ret != 0 && ret != -ETIME); + + return ret; +} + +static const struct clock_event_device xen_vcpuop_clockevent = { + .name = "xen", + .features = CLOCK_EVT_FEAT_ONESHOT, + + .max_delta_ns = 0xffffffff, + .min_delta_ns = TIMER_SLOP, + + .mult = 1, + .shift = 0, + .rating = 500, + + .set_mode = xen_vcpuop_set_mode, + .set_next_event = xen_vcpuop_set_next_event, +}; + +static const struct clock_event_device *xen_clockevent = + &xen_timerop_clockevent; +static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events); + +static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) +{ + struct clock_event_device *evt = &__get_cpu_var(xen_clock_events); + irqreturn_t ret; + + ret = IRQ_NONE; + if (evt->event_handler) { + evt->event_handler(evt); + ret = IRQ_HANDLED; + } + + do_stolen_accounting(); + + return ret; +} + +void xen_setup_timer(int cpu) +{ + const char *name; + struct clock_event_device *evt; + int irq; + + printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu); + + name = kasprintf(GFP_KERNEL, "timer%d", cpu); + if (!name) + name = "<timer kasprintf failed>"; + + irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + name, NULL); + + evt = &per_cpu(xen_clock_events, cpu); + memcpy(evt, xen_clockevent, sizeof(*evt)); + + evt->cpumask 
= cpumask_of_cpu(cpu); + evt->irq = irq; + + setup_runstate_info(cpu); +} + +void xen_setup_cpu_clockevents(void) +{ + BUG_ON(preemptible()); + + clockevents_register_device(&__get_cpu_var(xen_clock_events)); +} + +__init void xen_time_init(void) +{ + int cpu = smp_processor_id(); + + get_time_values_from_xen(); + + clocksource_register(&xen_clocksource); + + if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { + /* Successfully turned off 100Hz tick, so we have the + vcpuop-based timer interface */ + printk(KERN_DEBUG "Xen: using vcpuop timer interface\n"); + xen_clockevent = &xen_vcpuop_clockevent; + } + + /* Set initial system time with full resolution */ + xen_read_wallclock(&xtime); + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + + tsc_disable = 0; + + xen_setup_timer(cpu); + xen_setup_cpu_clockevents(); +} diff --git a/arch/i386/xen/vdso.h b/arch/i386/xen/vdso.h new file mode 100644 index 000000000000..861fedfe5230 --- /dev/null +++ b/arch/i386/xen/vdso.h @@ -0,0 +1,4 @@ +/* Bit used for the pseudo-hwcap for non-negative segments. We use + bit 1 to avoid bugs in some versions of glibc when bit 0 is + used; the choice is otherwise arbitrary. */ +#define VDSO_NOTE_NONEGSEG_BIT 1 diff --git a/arch/i386/xen/xen-asm.S b/arch/i386/xen/xen-asm.S new file mode 100644 index 000000000000..1a43b60c0c62 --- /dev/null +++ b/arch/i386/xen/xen-asm.S @@ -0,0 +1,291 @@ +/* + Asm versions of Xen pv-ops, suitable for either direct use or inlining. + The inline versions are the same as the direct-use versions, with the + pre- and post-amble chopped off. + + This code is encoded for size rather than absolute efficiency, + with a view to being able to inline as much as possible. + + We only bother with direct forms (ie, vcpu in pda) of the operations + here; the indirect forms are better handled in C, since they're + generally too large to inline anyway. + */ + +#include <linux/linkage.h> + +#include <asm/asm-offsets.h> +#include <asm/thread_info.h> +#include <asm/percpu.h> +#include <asm/processor-flags.h> +#include <asm/segment.h> + +#include <xen/interface/xen.h> + +#define RELOC(x, v) .globl x##_reloc; x##_reloc=v +#define ENDPATCH(x) .globl x##_end; x##_end=. + +/* Pseudo-flag used for virtual NMI, which we don't implement yet */ +#define XEN_EFLAGS_NMI 0x80000000 + +/* + Enable events. This clears the event mask and tests the pending + event status with one and operation. If there are pending + events, then enter the hypervisor to get them handled. + */ +ENTRY(xen_irq_enable_direct) + /* Clear mask and test pending */ + andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending + /* Preempt here doesn't matter because that will deal with + any pending interrupts. The pending check may end up being + run on the wrong CPU, but that doesn't hurt. */ + jz 1f +2: call check_events +1: +ENDPATCH(xen_irq_enable_direct) + ret + ENDPROC(xen_irq_enable_direct) + RELOC(xen_irq_enable_direct, 2b+1) + + +/* + Disabling events is simply a matter of making the event mask + non-zero. + */ +ENTRY(xen_irq_disable_direct) + movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask +ENDPATCH(xen_irq_disable_direct) + ret + ENDPROC(xen_irq_disable_direct) + RELOC(xen_irq_disable_direct, 0) + +/* + (xen_)save_fl is used to get the current interrupt enable status. + Callers expect the status to be in X86_EFLAGS_IF, and other bits + may be set in the return value. 
We take advantage of this by + making sure that X86_EFLAGS_IF has the right value (and other bits + in that byte are 0), but other bits in the return value are + undefined. We need to toggle the state of the bit, because + Xen and x86 use opposite senses (mask vs enable). + */ +ENTRY(xen_save_fl_direct) + testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask + setz %ah + addb %ah,%ah +ENDPATCH(xen_save_fl_direct) + ret + ENDPROC(xen_save_fl_direct) + RELOC(xen_save_fl_direct, 0) + + +/* + In principle the caller should be passing us a value return + from xen_save_fl_direct, but for robustness sake we test only + the X86_EFLAGS_IF flag rather than the whole byte. After + setting the interrupt mask state, it checks for unmasked + pending events and enters the hypervisor to get them delivered + if so. + */ +ENTRY(xen_restore_fl_direct) + testb $X86_EFLAGS_IF>>8, %ah + setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask + /* Preempt here doesn't matter because that will deal with + any pending interrupts. The pending check may end up being + run on the wrong CPU, but that doesn't hurt. */ + + /* check for unmasked and pending */ + cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending + jz 1f +2: call check_events +1: +ENDPATCH(xen_restore_fl_direct) + ret + ENDPROC(xen_restore_fl_direct) + RELOC(xen_restore_fl_direct, 2b+1) + +/* + This is run where a normal iret would be run, with the same stack setup: + 8: eflags + 4: cs + esp-> 0: eip + + This attempts to make sure that any pending events are dealt + with on return to usermode, but there is a small window in + which an event can happen just before entering usermode. If + the nested interrupt ends up setting one of the TIF_WORK_MASK + pending work flags, they will not be tested again before + returning to usermode. This means that a process can end up + with pending work, which will be unprocessed until the process + enters and leaves the kernel again, which could be an + unbounded amount of time. This means that a pending signal or + reschedule event could be indefinitely delayed. + + The fix is to notice a nested interrupt in the critical + window, and if one occurs, then fold the nested interrupt into + the current interrupt stack frame, and re-process it + iteratively rather than recursively. This means that it will + exit via the normal path, and all pending work will be dealt + with appropriately. + + Because the nested interrupt handler needs to deal with the + current stack state in whatever form its in, we keep things + simple by only using a single register which is pushed/popped + on the stack. + + Non-direct iret could be done in the same way, but it would + require an annoying amount of code duplication. We'll assume + that direct mode will be the common case once the hypervisor + support becomes commonplace. + */ +ENTRY(xen_iret_direct) + /* test eflags for special cases */ + testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) + jnz hyper_iret + + push %eax + ESP_OFFSET=4 # bytes pushed onto stack + + /* Store vcpu_info pointer for easy access. Do it this + way to avoid having to reload %fs */ +#ifdef CONFIG_SMP + GET_THREAD_INFO(%eax) + movl TI_cpu(%eax),%eax + movl __per_cpu_offset(,%eax,4),%eax + lea per_cpu__xen_vcpu_info(%eax),%eax +#else + movl $per_cpu__xen_vcpu_info, %eax +#endif + + /* check IF state we're restoring */ + testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) + + /* Maybe enable events. Once this happens we could get a + recursive event, so the critical region starts immediately + afterwards. 
However, if that happens we don't end up + resuming the code, so we don't have to be worried about + being preempted to another CPU. */ + setz XEN_vcpu_info_mask(%eax) +xen_iret_start_crit: + + /* check for unmasked and pending */ + cmpw $0x0001, XEN_vcpu_info_pending(%eax) + + /* If there's something pending, mask events again so we + can jump back into xen_hypervisor_callback */ + sete XEN_vcpu_info_mask(%eax) + + popl %eax + + /* From this point on the registers are restored and the stack + updated, so we don't need to worry about it if we're preempted */ +iret_restore_end: + + /* Jump to hypervisor_callback after fixing up the stack. + Events are masked, so jumping out of the critical + region is OK. */ + je xen_hypervisor_callback + + iret +xen_iret_end_crit: + +hyper_iret: + /* put this out of line since its very rarely used */ + jmp hypercall_page + __HYPERVISOR_iret * 32 + + .globl xen_iret_start_crit, xen_iret_end_crit + +/* + This is called by xen_hypervisor_callback in entry.S when it sees + that the EIP at the time of interrupt was between xen_iret_start_crit + and xen_iret_end_crit. We're passed the EIP in %eax so we can do + a more refined determination of what to do. + + The stack format at this point is: + ---------------- + ss : (ss/esp may be present if we came from usermode) + esp : + eflags } outer exception info + cs } + eip } + ---------------- <- edi (copy dest) + eax : outer eax if it hasn't been restored + ---------------- + eflags } nested exception info + cs } (no ss/esp because we're nested + eip } from the same ring) + orig_eax }<- esi (copy src) + - - - - - - - - + fs } + es } + ds } SAVE_ALL state + eax } + : : + ebx } + ---------------- + return addr <- esp + ---------------- + + In order to deliver the nested exception properly, we need to shift + everything from the return addr up to the error code so it + sits just under the outer exception info. This means that when we + handle the exception, we do it in the context of the outer exception + rather than starting a new one. + + The only caveat is that if the outer eax hasn't been + restored yet (ie, it's still on stack), we need to insert + its value into the SAVE_ALL state before going on, since + it's usermode state which we eventually need to restore. + */ +ENTRY(xen_iret_crit_fixup) + /* offsets +4 for return address */ + + /* + Paranoia: Make sure we're really coming from userspace. + One could imagine a case where userspace jumps into the + critical range address, but just before the CPU delivers a GP, + it decides to deliver an interrupt instead. Unlikely? + Definitely. Easy to avoid? Yes. The Intel documents + explicitly say that the reported EIP for a bad jump is the + jump instruction itself, not the destination, but some virtual + environments get this wrong. + */ + movl PT_CS+4(%esp), %ecx + andl $SEGMENT_RPL_MASK, %ecx + cmpl $USER_RPL, %ecx + je 2f + + lea PT_ORIG_EAX+4(%esp), %esi + lea PT_EFLAGS+4(%esp), %edi + + /* If eip is before iret_restore_end then stack + hasn't been restored yet. */ + cmp $iret_restore_end, %eax + jae 1f + + movl 0+4(%edi),%eax /* copy EAX */ + movl %eax, PT_EAX+4(%esp) + + lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ + + /* set up the copy */ +1: std + mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */ + rep movsl + cld + + lea 4(%edi),%esp /* point esp to new frame */ +2: ret + + +/* + Force an event check by making a hypercall, + but preserve regs before making the call. 
+ */ +check_events: + push %eax + push %ecx + push %edx + call force_evtchn_callback + pop %edx + pop %ecx + pop %eax + ret diff --git a/arch/i386/xen/xen-head.S b/arch/i386/xen/xen-head.S new file mode 100644 index 000000000000..bc71f3bc4014 --- /dev/null +++ b/arch/i386/xen/xen-head.S @@ -0,0 +1,38 @@ +/* Xen-specific pieces of head.S, intended to be included in the right + place in head.S */ + +#ifdef CONFIG_XEN + +#include <linux/elfnote.h> +#include <asm/boot.h> +#include <xen/interface/elfnote.h> + + .section .init.text +ENTRY(startup_xen) + movl %esi,xen_start_info + cld + movl $(init_thread_union+THREAD_SIZE),%esp + jmp xen_start_kernel + +.pushsection ".bss.page_aligned" + .align PAGE_SIZE_asm +ENTRY(hypercall_page) + .skip 0x1000 +.popsection + + .section .text + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") +#ifdef CONFIG_X86_PAE + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") +#else + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no") +#endif + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") + +#endif /*CONFIG_XEN */ diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h new file mode 100644 index 000000000000..b9aaea45f07f --- /dev/null +++ b/arch/i386/xen/xen-ops.h @@ -0,0 +1,71 @@ +#ifndef XEN_OPS_H +#define XEN_OPS_H + +#include <linux/init.h> + +/* These are code, but not functions. Defined in entry.S */ +extern const char xen_hypervisor_callback[]; +extern const char xen_failsafe_callback[]; + +void xen_copy_trap_info(struct trap_info *traps); + +DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); +DECLARE_PER_CPU(unsigned long, xen_cr3); + +extern struct start_info *xen_start_info; +extern struct shared_info *HYPERVISOR_shared_info; + +char * __init xen_memory_setup(void); +void __init xen_arch_setup(void); +void __init xen_init_IRQ(void); + +void xen_setup_timer(int cpu); +void xen_setup_cpu_clockevents(void); +unsigned long xen_cpu_khz(void); +void __init xen_time_init(void); +unsigned long xen_get_wallclock(void); +int xen_set_wallclock(unsigned long time); +unsigned long long xen_sched_clock(void); + +void xen_mark_init_mm_pinned(void); + +DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); + +static inline unsigned xen_get_lazy_mode(void) +{ + return x86_read_percpu(xen_lazy_mode); +} + +void __init xen_fill_possible_map(void); + +void __init xen_setup_vcpu_info_placement(void); +void xen_smp_prepare_boot_cpu(void); +void xen_smp_prepare_cpus(unsigned int max_cpus); +int xen_cpu_up(unsigned int cpu); +void xen_smp_cpus_done(unsigned int max_cpus); + +void xen_smp_send_stop(void); +void xen_smp_send_reschedule(int cpu); +int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic, + int wait); +int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait); + +int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), + void *info, int wait); + + +/* Declare an asm function, along with symbols needed to make it + inlineable */ +#define DECL_ASM(ret, name, ...) 
\ + ret name(__VA_ARGS__); \ + extern char name##_end[]; \ + extern char name##_reloc[] \ + +DECL_ASM(void, xen_irq_enable_direct, void); +DECL_ASM(void, xen_irq_disable_direct, void); +DECL_ASM(unsigned long, xen_save_fl_direct, void); +DECL_ASM(void, xen_restore_fl_direct, unsigned long); + +void xen_iret_direct(void); +#endif /* XEN_OPS_H */
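
The multicall batching that mmu.c leans on throughout (xen_set_pmd(), xen_set_pud(), pin_page() and the pagetable pinning paths all queue work via xen_mc_entry() and xen_mc_issue()) is easiest to see stripped of the kernel plumbing. The stand-alone C sketch below mirrors that flow under illustrative names (mc_entry(), mc_issue(), mc_flush() and do_hypercall() are not part of the patch) and omits details the real code handles, such as the per-CPU buffer, the argument area and the interrupt masking in xen_mc_batch(): operations are queued into a fixed-size buffer, flushed early only when the buffer fills, and otherwise deferred until the caller is no longer in a lazy (PARAVIRT_LAZY_MMU style) section.

    #include <stdio.h>
    #include <stdint.h>

    #define MC_BATCH 32

    struct mc_op { int cmd; uint64_t arg; };

    static struct mc_op buffer[MC_BATCH];
    static unsigned mcidx;
    static int lazy_mode;   /* stand-in for xen_get_lazy_mode() & PARAVIRT_LAZY_MMU */

    /* stands in for HYPERVISOR_multicall(): one trap covering n queued ops */
    static void do_hypercall(struct mc_op *ops, unsigned n)
    {
            (void)ops;
            printf("multicall: %u ops in one trap\n", n);
    }

    /* analogue of xen_mc_flush(): issue whatever has been queued so far */
    static void mc_flush(void)
    {
            if (mcidx)
                    do_hypercall(buffer, mcidx);
            mcidx = 0;
    }

    /* analogue of __xen_mc_entry(): grab a slot, flushing first if the buffer is full */
    static struct mc_op *mc_entry(int cmd, uint64_t arg)
    {
            if (mcidx == MC_BATCH)
                    mc_flush();
            buffer[mcidx].cmd = cmd;
            buffer[mcidx].arg = arg;
            return &buffer[mcidx++];
    }

    /* analogue of xen_mc_issue(): only flush when not inside a lazy section */
    static void mc_issue(void)
    {
            if (!lazy_mode)
                    mc_flush();
    }

    int main(void)
    {
            lazy_mode = 1;                          /* e.g. a batched pagetable update */
            for (int i = 0; i < 100; i++) {
                    mc_entry(1, (uint64_t)i);       /* queue a pagetable-style update */
                    mc_issue();                     /* deferred while lazy */
            }
            lazy_mode = 0;
            mc_issue();                             /* leaving lazy mode: flush the rest */
            return 0;
    }

Run as-is, the loop queues 100 updates but performs only four simulated hypercalls (three full batches of 32 and a final batch of 4), which is the amortization described in the comment at the top of multicalls.c.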