Diffstat (limited to 'arch/i386')
101 files changed, 5693 insertions, 640 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index b1b2b30b1b8e..abb582bc218f 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -18,6 +18,10 @@ config GENERIC_TIME bool default y +config GENERIC_CMOS_UPDATE + bool + default y + config CLOCKSOURCE_WATCHDOG bool default y @@ -222,6 +226,8 @@ config PARAVIRT However, when run without a hypervisor the kernel is theoretically slower. If in doubt, say N. +source "arch/i386/xen/Kconfig" + config VMI bool "VMI Paravirt-ops support" depends on PARAVIRT @@ -542,6 +548,7 @@ config HIGHMEM4G config HIGHMEM64G bool "64GB" depends on !M386 && !M486 + select X86_PAE help Select this if you have a 32-bit processor and more than 4 gigabytes of physical RAM. @@ -571,12 +578,12 @@ choice config VMSPLIT_3G bool "3G/1G user/kernel split" config VMSPLIT_3G_OPT - depends on !HIGHMEM + depends on !X86_PAE bool "3G/1G user/kernel split (for full 1G low memory)" config VMSPLIT_2G bool "2G/2G user/kernel split" config VMSPLIT_2G_OPT - depends on !HIGHMEM + depends on !X86_PAE bool "2G/2G user/kernel split (for full 2G low memory)" config VMSPLIT_1G bool "1G/3G user/kernel split" @@ -596,10 +603,15 @@ config HIGHMEM default y config X86_PAE - bool - depends on HIGHMEM64G - default y + bool "PAE (Physical Address Extension) Support" + default n + depends on !HIGHMEM4G select RESOURCES_64BIT + help + PAE is required for NX support, and furthermore enables + larger swapspace support for non-overcommit purposes. It + has the cost of more pagetable lookup overhead, and also + consumes more pagetable space per process. # Common NUMA Features config NUMA @@ -815,6 +827,7 @@ config CRASH_DUMP config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) + default "0x1000000" if X86_NUMAQ default "0x100000" help This gives the physical address where the kernel is loaded. @@ -1212,21 +1225,26 @@ source "drivers/Kconfig" source "fs/Kconfig" -menu "Instrumentation Support" +menuconfig INSTRUMENTATION + bool "Instrumentation Support" depends on EXPERIMENTAL + default y + +if INSTRUMENTATION source "arch/i386/oprofile/Kconfig" config KPROBES - bool "Kprobes (EXPERIMENTAL)" - depends on KALLSYMS && EXPERIMENTAL && MODULES + bool "Kprobes" + depends on KALLSYMS && MODULES help Kprobes allows you to trap at almost any kernel address and execute a callback function. register_kprobe() establishes a probepoint and specifies the callback. Kprobes is useful for kernel debugging, non-intrusive instrumentation and testing. If in doubt, say "N". 
-endmenu + +endif # INSTRUMENTATION source "arch/i386/Kconfig.debug" diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu index 9cbe76c3aa35..11a24d54f27b 100644 --- a/arch/i386/Kconfig.cpu +++ b/arch/i386/Kconfig.cpu @@ -297,11 +297,6 @@ config X86_POPAD_OK depends on !M386 default y -config X86_CMPXCHG64 - bool - depends on X86_PAE - default y - config X86_ALIGNMENT_16 bool depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 diff --git a/arch/i386/Makefile b/arch/i386/Makefile index bd28f9f9b4b7..01f0ff0daaf4 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -93,6 +93,9 @@ mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-i386/mach-es7000 mcore-$(CONFIG_X86_ES7000) := mach-default core-$(CONFIG_X86_ES7000) := arch/i386/mach-es7000/ +# Xen paravirtualization support +core-$(CONFIG_XEN) += arch/i386/xen/ + # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default @@ -108,6 +111,7 @@ drivers-$(CONFIG_PCI) += arch/i386/pci/ # must be linked after kernel/ drivers-$(CONFIG_OPROFILE) += arch/i386/oprofile/ drivers-$(CONFIG_PM) += arch/i386/power/ +drivers-$(CONFIG_FB) += arch/i386/video/ CFLAGS += $(mflags-y) AFLAGS += $(mflags-y) diff --git a/arch/i386/boot/.gitignore b/arch/i386/boot/.gitignore index 495f20c085de..18465143cfa2 100644 --- a/arch/i386/boot/.gitignore +++ b/arch/i386/boot/.gitignore @@ -1,3 +1,5 @@ bootsect bzImage setup +setup.bin +setup.elf diff --git a/arch/i386/boot/Makefile b/arch/i386/boot/Makefile index 08678a0a3d19..93386a4e40b4 100644 --- a/arch/i386/boot/Makefile +++ b/arch/i386/boot/Makefile @@ -39,7 +39,7 @@ setup-y += printf.o string.o tty.o video.o version.o voyager.o setup-y += video-vga.o setup-y += video-vesa.o setup-y += video-bios.o - +targets += $(setup-y) hostprogs-y := tools/build HOSTCFLAGS_build.o := $(LINUXINCLUDE) diff --git a/arch/i386/boot/boot.h b/arch/i386/boot/boot.h index 0329c4fe4f88..dec70c9b6050 100644 --- a/arch/i386/boot/boot.h +++ b/arch/i386/boot/boot.h @@ -56,7 +56,7 @@ static inline u16 inw(u16 port) static inline void outl(u32 v, u16 port) { - asm volatile("outl %0,%1" : : "a" (v), "dn" (port)); + asm volatile("outl %0,%1" : : "a" (v), "dN" (port)); } static inline u32 inl(u32 port) { diff --git a/arch/i386/boot/compressed/relocs.c b/arch/i386/boot/compressed/relocs.c index ce4fda261aaf..2d77ee728f92 100644 --- a/arch/i386/boot/compressed/relocs.c +++ b/arch/i386/boot/compressed/relocs.c @@ -31,6 +31,9 @@ static const char* safe_abs_relocs[] = { "__kernel_rt_sigreturn", "__kernel_sigreturn", "SYSENTER_RETURN", + "VDSO_NOTE_MASK", + "xen_irq_disable_direct_reloc", + "xen_save_fl_direct_reloc", }; static int is_safe_abs_reloc(const char* sym_name) diff --git a/arch/i386/boot/cpucheck.c b/arch/i386/boot/cpucheck.c index 8b0f4473b083..991e8ceae1de 100644 --- a/arch/i386/boot/cpucheck.c +++ b/arch/i386/boot/cpucheck.c @@ -115,8 +115,8 @@ static int has_eflag(u32 mask) "pushfl ; " "popl %1 ; " "popfl" - : "=r" (f0), "=r" (f1) - : "g" (mask)); + : "=&r" (f0), "=&r" (f1) + : "ri" (mask)); return !!((f0^f1) & mask); } diff --git a/arch/i386/boot/mca.c b/arch/i386/boot/mca.c index 9b68bd1aef19..68222f2d4b67 100644 --- a/arch/i386/boot/mca.c +++ b/arch/i386/boot/mca.c @@ -26,7 +26,7 @@ int query_mca(void) "setc %0 ; " "movw %%es, %1 ; " "popw %%es" - : "=acdSDm" (err), "=acdSDm" (es), "=b" (bx) + : "=acd" (err), "=acdSD" (es), "=b" (bx) : "a" (0xc000)); if (err) diff --git a/arch/i386/boot/pm.c b/arch/i386/boot/pm.c index 
3fa53e15ed77..1df025c73261 100644 --- a/arch/i386/boot/pm.c +++ b/arch/i386/boot/pm.c @@ -65,7 +65,7 @@ static void move_kernel_around(void) "popw %%ds ; " "popw %%es" : "+c" (dwords) - : "rm" (dst_seg), "rm" (src_seg) + : "r" (dst_seg), "r" (src_seg) : "esi", "edi"); syssize -= paras; diff --git a/arch/i386/boot/tools/build.c b/arch/i386/boot/tools/build.c index 886f47d8a488..b4248740ff0d 100644 --- a/arch/i386/boot/tools/build.c +++ b/arch/i386/boot/tools/build.c @@ -5,7 +5,7 @@ */ /* - * This file builds a disk-image from three different files: + * This file builds a disk-image from two different files: * * - setup: 8086 machine code, sets up system parm * - system: 80386 code for actual system diff --git a/arch/i386/boot/tty.c b/arch/i386/boot/tty.c index a8db78736b02..9c668aad3515 100644 --- a/arch/i386/boot/tty.c +++ b/arch/i386/boot/tty.c @@ -31,7 +31,7 @@ void __attribute__((section(".inittext"))) putchar(int ch) /* int $0x10 is known to have bugs involving touching registers it shouldn't. Be extra conservative... */ - asm volatile("pushal; int $0x10; popal" + asm volatile("pushal; pushw %%ds; int $0x10; popw %%ds; popal" : : "b" (0x0007), "c" (0x0001), "a" (0x0e00|ch)); } diff --git a/arch/i386/boot/video.c b/arch/i386/boot/video.c index 3bb3573cd6a1..958130ef0042 100644 --- a/arch/i386/boot/video.c +++ b/arch/i386/boot/video.c @@ -195,7 +195,7 @@ static void vga_recalc_vertical(void) { unsigned int font_size, rows; u16 crtc; - u8 ov; + u8 pt, ov; set_fs(0); font_size = rdfs8(0x485); /* BIOS: font size (pixels) */ @@ -206,7 +206,12 @@ static void vga_recalc_vertical(void) crtc = vga_crtc(); + pt = in_idx(crtc, 0x11); + pt &= ~0x80; /* Unlock CR0-7 */ + out_idx(pt, crtc, 0x11); + out_idx((u8)rows, crtc, 0x12); /* Lower height register */ + ov = in_idx(crtc, 0x07); /* Overflow register */ ov &= 0xbd; ov |= (rows >> (8-1)) & 0x02; @@ -411,7 +416,7 @@ static void restore_screen(void) "1: rep;stosl ; " "popw %%es" : "+D" (dst), "+c" (npad) - : "bdSm" (video_segment), + : "bdS" (video_segment), "a" (0x07200720)); } diff --git a/arch/i386/boot/video.h b/arch/i386/boot/video.h index 29eca1710b2c..b92447d51213 100644 --- a/arch/i386/boot/video.h +++ b/arch/i386/boot/video.h @@ -117,8 +117,15 @@ extern int graphic_mode; /* Graphics mode with linear frame buffer */ * int $0x10 is notorious for touching registers it shouldn't. * gcc doesn't like %ebp being clobbered, so define it as a push/pop * sequence here. + * + * A number of systems, including the original PC can clobber %bp in + * certain circumstances, like when scrolling. There exists at least + * one Trident video card which could clobber DS under a set of + * circumstances that we are unlikely to encounter (scrolling when + * using an extended graphics mode of more than 800x600 pixels), but + * it's cheap insurance to deal with that here. 
*/ -#define INT10 "pushl %%ebp; int $0x10; popl %%ebp" +#define INT10 "pushl %%ebp; pushw %%ds; int $0x10; popw %%ds; popl %%ebp" /* Accessing VGA indexed registers */ static inline u8 in_idx(u16 port, u8 index) diff --git a/arch/i386/boot/voyager.c b/arch/i386/boot/voyager.c index 9221614d0db8..61c8fe0453be 100644 --- a/arch/i386/boot/voyager.c +++ b/arch/i386/boot/voyager.c @@ -32,7 +32,7 @@ int query_voyager(void) "setc %0 ; " "movw %%es, %1 ; " "popw %%es" - : "=qm" (err), "=rm" (es), "=D" (di) + : "=q" (err), "=r" (es), "=D" (di) : "a" (0xffc0)); if (err) diff --git a/arch/i386/defconfig b/arch/i386/defconfig index 1a3a2217b7c2..54ee1764fdae 100644 --- a/arch/i386/defconfig +++ b/arch/i386/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.22-rc2 -# Mon May 21 13:23:44 2007 +# Linux kernel version: 2.6.22-git14 +# Fri Jul 20 09:53:15 2007 # CONFIG_X86_32=y CONFIG_GENERIC_TIME=y @@ -37,19 +37,18 @@ CONFIG_LOCALVERSION="" CONFIG_LOCALVERSION_AUTO=y CONFIG_SWAP=y CONFIG_SYSVIPC=y -# CONFIG_IPC_NS is not set CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y # CONFIG_BSD_PROCESS_ACCT is not set # CONFIG_TASKSTATS is not set -# CONFIG_UTS_NS is not set +# CONFIG_USER_NS is not set # CONFIG_AUDIT is not set CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=18 # CONFIG_CPUSETS is not set CONFIG_SYSFS_DEPRECATED=y -# CONFIG_RELAY is not set +CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" CONFIG_CC_OPTIMIZE_FOR_SIZE=y @@ -73,16 +72,13 @@ CONFIG_TIMERFD=y CONFIG_EVENTFD=y CONFIG_SHMEM=y CONFIG_VM_EVENT_COUNTERS=y -CONFIG_SLAB=y -# CONFIG_SLUB is not set +CONFIG_SLUB_DEBUG=y +# CONFIG_SLAB is not set +CONFIG_SLUB=y # CONFIG_SLOB is not set CONFIG_RT_MUTEXES=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 - -# -# Loadable module support -# CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y @@ -90,14 +86,11 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set # CONFIG_KMOD is not set CONFIG_STOP_MACHINE=y - -# -# Block layer -# CONFIG_BLOCK=y CONFIG_LBD=y # CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_LSF is not set +# CONFIG_BLK_DEV_BSG is not set # # IO Schedulers @@ -166,7 +159,6 @@ CONFIG_X86_WP_WORKS_OK=y CONFIG_X86_INVLPG=y CONFIG_X86_BSWAP=y CONFIG_X86_POPAD_OK=y -CONFIG_X86_CMPXCHG64=y CONFIG_X86_GOOD_APIC=y CONFIG_X86_INTEL_USERCOPY=y CONFIG_X86_USE_PPRO_CHECKSUM=y @@ -202,6 +194,7 @@ CONFIG_X86_CPUID=y # CONFIG_EDD is not set # CONFIG_DELL_RBU is not set # CONFIG_DCDBAS is not set +CONFIG_DMIID=y # CONFIG_NOHIGHMEM is not set CONFIG_HIGHMEM4G=y # CONFIG_HIGHMEM64G is not set @@ -218,7 +211,9 @@ CONFIG_FLAT_NODE_MEM_MAP=y CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_RESOURCES_64BIT=y CONFIG_ZONE_DMA_FLAG=1 +CONFIG_BOUNCE=y CONFIG_NR_QUICK=1 +CONFIG_VIRT_TO_BUS=y # CONFIG_HIGHPTE is not set # CONFIG_MATH_EMULATION is not set CONFIG_MTRR=y @@ -245,7 +240,6 @@ CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y CONFIG_PM=y CONFIG_PM_LEGACY=y # CONFIG_PM_DEBUG is not set -# CONFIG_PM_SYSFS_DEPRECATED is not set # # ACPI (Advanced Configuration and Power Interface) Support @@ -285,7 +279,7 @@ CONFIG_CPU_FREQ_GOV_PERFORMANCE=y # CONFIG_CPU_FREQ_GOV_POWERSAVE is not set CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y -# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y # # CPUFreq processor drivers @@ -326,7 +320,7 @@ CONFIG_PCI_MMCONFIG=y CONFIG_ARCH_SUPPORTS_MSI=y CONFIG_PCI_MSI=y # CONFIG_PCI_DEBUG is not set -CONFIG_HT_IRQ=y +# CONFIG_HT_IRQ is not 
set CONFIG_ISA_DMA_API=y # CONFIG_ISA is not set # CONFIG_MCA is not set @@ -382,7 +376,7 @@ CONFIG_IP_PNP_DHCP=y CONFIG_INET_TUNNEL=y CONFIG_INET_XFRM_MODE_TRANSPORT=y CONFIG_INET_XFRM_MODE_TUNNEL=y -CONFIG_INET_XFRM_MODE_BEET=y +# CONFIG_INET_XFRM_MODE_BEET is not set CONFIG_INET_DIAG=y CONFIG_INET_TCP_DIAG=y # CONFIG_TCP_CONG_ADVANCED is not set @@ -401,27 +395,15 @@ CONFIG_IPV6=y # CONFIG_INET6_TUNNEL is not set CONFIG_INET6_XFRM_MODE_TRANSPORT=y CONFIG_INET6_XFRM_MODE_TUNNEL=y -CONFIG_INET6_XFRM_MODE_BEET=y +# CONFIG_INET6_XFRM_MODE_BEET is not set # CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set CONFIG_IPV6_SIT=y # CONFIG_IPV6_TUNNEL is not set # CONFIG_IPV6_MULTIPLE_TABLES is not set # CONFIG_NETWORK_SECMARK is not set # CONFIG_NETFILTER is not set - -# -# DCCP Configuration (EXPERIMENTAL) -# # CONFIG_IP_DCCP is not set - -# -# SCTP Configuration (EXPERIMENTAL) -# # CONFIG_IP_SCTP is not set - -# -# TIPC Configuration (EXPERIMENTAL) -# # CONFIG_TIPC is not set # CONFIG_ATM is not set # CONFIG_BRIDGE is not set @@ -458,6 +440,7 @@ CONFIG_IPV6_SIT=y # CONFIG_MAC80211 is not set # CONFIG_IEEE80211 is not set # CONFIG_RFKILL is not set +# CONFIG_NET_9P is not set # # Device Drivers @@ -472,21 +455,9 @@ CONFIG_FW_LOADER=y # CONFIG_DEBUG_DRIVER is not set # CONFIG_DEBUG_DEVRES is not set # CONFIG_SYS_HYPERVISOR is not set - -# -# Connector - unified userspace <-> kernelspace linker -# # CONFIG_CONNECTOR is not set # CONFIG_MTD is not set - -# -# Parallel port support -# # CONFIG_PARPORT is not set - -# -# Plug and Play support -# CONFIG_PNP=y # CONFIG_PNP_DEBUG is not set @@ -494,10 +465,7 @@ CONFIG_PNP=y # Protocols # CONFIG_PNPACPI=y - -# -# Block devices -# +CONFIG_BLK_DEV=y CONFIG_BLK_DEV_FD=y # CONFIG_BLK_CPQ_DA is not set # CONFIG_BLK_CPQ_CISS_DA is not set @@ -515,17 +483,14 @@ CONFIG_BLK_DEV_RAM_SIZE=4096 CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set - -# -# Misc devices -# +CONFIG_MISC_DEVICES=y # CONFIG_IBM_ASM is not set # CONFIG_PHANTOM is not set +# CONFIG_EEPROM_93CX6 is not set # CONFIG_SGI_IOC4 is not set # CONFIG_TIFM_CORE is not set # CONFIG_SONY_LAPTOP is not set # CONFIG_THINKPAD_ACPI is not set -# CONFIG_BLINK is not set CONFIG_IDE=y CONFIG_BLK_DEV_IDE=y @@ -597,6 +562,7 @@ CONFIG_BLK_DEV_IDEDMA=y # # CONFIG_RAID_ATTRS is not set CONFIG_SCSI=y +CONFIG_SCSI_DMA=y # CONFIG_SCSI_TGT is not set CONFIG_SCSI_NETLINK=y # CONFIG_SCSI_PROC_FS is not set @@ -607,8 +573,9 @@ CONFIG_SCSI_NETLINK=y CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_ST is not set # CONFIG_CHR_DEV_OSST is not set -# CONFIG_BLK_DEV_SR is not set -# CONFIG_CHR_DEV_SG is not set +CONFIG_BLK_DEV_SR=y +# CONFIG_BLK_DEV_SR_VENDOR is not set +CONFIG_CHR_DEV_SG=y # CONFIG_CHR_DEV_SCH is not set # @@ -668,6 +635,7 @@ CONFIG_AIC79XX_DEBUG_MASK=0 # CONFIG_SCSI_INIA100 is not set # CONFIG_SCSI_STEX is not set # CONFIG_SCSI_SYM53C8XX_2 is not set +# CONFIG_SCSI_IPR is not set # CONFIG_SCSI_QLOGIC_1280 is not set # CONFIG_SCSI_QLA_FC is not set # CONFIG_SCSI_QLA_ISCSI is not set @@ -676,14 +644,73 @@ CONFIG_AIC79XX_DEBUG_MASK=0 # CONFIG_SCSI_DC390T is not set # CONFIG_SCSI_NSP32 is not set # CONFIG_SCSI_DEBUG is not set -# CONFIG_SCSI_ESP_CORE is not set # CONFIG_SCSI_SRP is not set -# CONFIG_ATA is not set - -# -# Multi-device support (RAID and LVM) -# -# CONFIG_MD is not set +CONFIG_ATA=y +# CONFIG_ATA_NONSTANDARD is not set +CONFIG_ATA_ACPI=y +CONFIG_SATA_AHCI=y +CONFIG_SATA_SVW=y +CONFIG_ATA_PIIX=y +# CONFIG_SATA_MV is not set +CONFIG_SATA_NV=y +# CONFIG_PDC_ADMA 
is not set +# CONFIG_SATA_QSTOR is not set +# CONFIG_SATA_PROMISE is not set +# CONFIG_SATA_SX4 is not set +CONFIG_SATA_SIL=y +# CONFIG_SATA_SIL24 is not set +# CONFIG_SATA_SIS is not set +# CONFIG_SATA_ULI is not set +CONFIG_SATA_VIA=y +# CONFIG_SATA_VITESSE is not set +# CONFIG_SATA_INIC162X is not set +# CONFIG_PATA_ALI is not set +# CONFIG_PATA_AMD is not set +# CONFIG_PATA_ARTOP is not set +# CONFIG_PATA_ATIIXP is not set +# CONFIG_PATA_CMD640_PCI is not set +# CONFIG_PATA_CMD64X is not set +# CONFIG_PATA_CS5520 is not set +# CONFIG_PATA_CS5530 is not set +# CONFIG_PATA_CS5535 is not set +# CONFIG_PATA_CYPRESS is not set +# CONFIG_PATA_EFAR is not set +# CONFIG_ATA_GENERIC is not set +# CONFIG_PATA_HPT366 is not set +# CONFIG_PATA_HPT37X is not set +# CONFIG_PATA_HPT3X2N is not set +# CONFIG_PATA_HPT3X3 is not set +# CONFIG_PATA_IT821X is not set +# CONFIG_PATA_IT8213 is not set +# CONFIG_PATA_JMICRON is not set +# CONFIG_PATA_TRIFLEX is not set +# CONFIG_PATA_MARVELL is not set +# CONFIG_PATA_MPIIX is not set +# CONFIG_PATA_OLDPIIX is not set +# CONFIG_PATA_NETCELL is not set +# CONFIG_PATA_NS87410 is not set +# CONFIG_PATA_OPTI is not set +# CONFIG_PATA_OPTIDMA is not set +# CONFIG_PATA_PDC_OLD is not set +# CONFIG_PATA_RADISYS is not set +# CONFIG_PATA_RZ1000 is not set +# CONFIG_PATA_SC1200 is not set +# CONFIG_PATA_SERVERWORKS is not set +# CONFIG_PATA_PDC2027X is not set +# CONFIG_PATA_SIL680 is not set +# CONFIG_PATA_SIS is not set +# CONFIG_PATA_VIA is not set +# CONFIG_PATA_WINBOND is not set +CONFIG_MD=y +# CONFIG_BLK_DEV_MD is not set +CONFIG_BLK_DEV_DM=y +# CONFIG_DM_DEBUG is not set +# CONFIG_DM_CRYPT is not set +# CONFIG_DM_SNAPSHOT is not set +# CONFIG_DM_MIRROR is not set +# CONFIG_DM_ZERO is not set +# CONFIG_DM_MULTIPATH is not set +# CONFIG_DM_DELAY is not set # # Fusion MPT device support @@ -724,42 +751,27 @@ CONFIG_IEEE1394_OHCI1394=y # CONFIG_IEEE1394_ETH1394 is not set # CONFIG_IEEE1394_DV1394 is not set CONFIG_IEEE1394_RAWIO=y - -# -# I2O device support -# # CONFIG_I2O is not set -# CONFIG_MACINTOSH_DRIVERS is not set - -# -# Network device support -# +CONFIG_MACINTOSH_DRIVERS=y +# CONFIG_MAC_EMUMOUSEBTN is not set CONFIG_NETDEVICES=y +CONFIG_NETDEVICES_MULTIQUEUE=y # CONFIG_DUMMY is not set # CONFIG_BONDING is not set +# CONFIG_MACVLAN is not set # CONFIG_EQUALIZER is not set # CONFIG_TUN is not set # CONFIG_NET_SB1000 is not set - -# -# ARCnet devices -# # CONFIG_ARCNET is not set # CONFIG_PHYLIB is not set - -# -# Ethernet (10 or 100Mbit) -# CONFIG_NET_ETHERNET=y CONFIG_MII=y # CONFIG_HAPPYMEAL is not set # CONFIG_SUNGEM is not set # CONFIG_CASSINI is not set -# CONFIG_NET_VENDOR_3COM is not set - -# -# Tulip family network device support -# +CONFIG_NET_VENDOR_3COM=y +CONFIG_VORTEX=y +# CONFIG_TYPHOON is not set CONFIG_NET_TULIP=y # CONFIG_DE2104X is not set CONFIG_TULIP=y @@ -810,7 +822,6 @@ CONFIG_R8169=y # CONFIG_SIS190 is not set # CONFIG_SKGE is not set CONFIG_SKY2=y -# CONFIG_SK98LIN is not set # CONFIG_VIA_VELOCITY is not set CONFIG_TIGON3=y CONFIG_BNX2=y @@ -824,10 +835,6 @@ CONFIG_NETDEV_10000=y # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_MLX4_CORE is not set - -# -# Token Ring devices -# # CONFIG_TR is not set # @@ -856,15 +863,7 @@ CONFIG_NETCONSOLE=y CONFIG_NETPOLL=y # CONFIG_NETPOLL_TRAP is not set CONFIG_NET_POLL_CONTROLLER=y - -# -# ISDN subsystem -# # CONFIG_ISDN is not set - -# -# Telephony Support -# # CONFIG_PHONE is not set # @@ -872,6 +871,7 @@ CONFIG_NET_POLL_CONTROLLER=y # CONFIG_INPUT=y # 
CONFIG_INPUT_FF_MEMLESS is not set +# CONFIG_INPUT_POLLDEV is not set # # Userland interfaces @@ -937,6 +937,7 @@ CONFIG_HW_CONSOLE=y # CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_FIX_EARLYCON_MEM=y CONFIG_SERIAL_8250_PCI=y CONFIG_SERIAL_8250_PNP=y CONFIG_SERIAL_8250_NR_UARTS=4 @@ -952,10 +953,6 @@ CONFIG_SERIAL_CORE_CONSOLE=y CONFIG_UNIX98_PTYS=y CONFIG_LEGACY_PTYS=y CONFIG_LEGACY_PTY_COUNT=256 - -# -# IPMI -# # CONFIG_IPMI_HANDLER is not set # CONFIG_WATCHDOG is not set CONFIG_HW_RANDOM=y @@ -989,11 +986,7 @@ CONFIG_MAX_RAW_DEVS=256 CONFIG_HPET=y # CONFIG_HPET_RTC_IRQ is not set CONFIG_HPET_MMAP=y -CONFIG_HANGCHECK_TIMER=y - -# -# TPM devices -# +# CONFIG_HANGCHECK_TIMER is not set # CONFIG_TCG_TPM is not set # CONFIG_TELCLOCK is not set CONFIG_DEVPORT=y @@ -1004,11 +997,8 @@ CONFIG_DEVPORT=y # # CONFIG_SPI is not set # CONFIG_SPI_MASTER is not set - -# -# Dallas's 1-wire bus -# # CONFIG_W1 is not set +# CONFIG_POWER_SUPPLY is not set # CONFIG_HWMON is not set # @@ -1042,7 +1032,7 @@ CONFIG_DAB=y CONFIG_VGA_CONSOLE=y CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=128 -# CONFIG_VIDEO_SELECT is not set +CONFIG_VIDEO_SELECT=y CONFIG_DUMMY_CONSOLE=y # @@ -1059,15 +1049,11 @@ CONFIG_SOUND=y # Open Sound System # CONFIG_SOUND_PRIME=y -# CONFIG_OSS_OBSOLETE is not set # CONFIG_SOUND_TRIDENT is not set # CONFIG_SOUND_MSNDCLAS is not set # CONFIG_SOUND_MSNDPIN is not set # CONFIG_SOUND_OSS is not set - -# -# HID Devices -# +CONFIG_HID_SUPPORT=y CONFIG_HID=y # CONFIG_HID_DEBUG is not set @@ -1078,10 +1064,7 @@ CONFIG_USB_HID=y # CONFIG_USB_HIDINPUT_POWERBOOK is not set # CONFIG_HID_FF is not set # CONFIG_USB_HIDDEV is not set - -# -# USB support -# +CONFIG_USB_SUPPORT=y CONFIG_USB_ARCH_HAS_HCD=y CONFIG_USB_ARCH_HAS_OHCI=y CONFIG_USB_ARCH_HAS_EHCI=y @@ -1095,6 +1078,7 @@ CONFIG_USB_DEVICEFS=y # CONFIG_USB_DEVICE_CLASS is not set # CONFIG_USB_DYNAMIC_MINORS is not set # CONFIG_USB_SUSPEND is not set +# CONFIG_USB_PERSIST is not set # CONFIG_USB_OTG is not set # @@ -1104,7 +1088,6 @@ CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_SPLIT_ISO is not set # CONFIG_USB_EHCI_ROOT_HUB_TT is not set # CONFIG_USB_EHCI_TT_NEWSCHED is not set -# CONFIG_USB_EHCI_BIG_ENDIAN_MMIO is not set # CONFIG_USB_ISP116X_HCD is not set CONFIG_USB_OHCI_HCD=y # CONFIG_USB_OHCI_BIG_ENDIAN_DESC is not set @@ -1112,6 +1095,7 @@ CONFIG_USB_OHCI_HCD=y CONFIG_USB_OHCI_LITTLE_ENDIAN=y CONFIG_USB_UHCI_HCD=y # CONFIG_USB_SL811_HCD is not set +# CONFIG_USB_R8A66597_HCD is not set # # USB Device Class drivers @@ -1202,15 +1186,7 @@ CONFIG_USB_MON=y # # LED Triggers # - -# -# InfiniBand support -# # CONFIG_INFINIBAND is not set - -# -# EDAC - error detection and reporting (RAS) (EXPERIMENTAL) -# # CONFIG_EDAC is not set # @@ -1230,11 +1206,13 @@ CONFIG_USB_MON=y # # DMA Devices # +CONFIG_VIRTUALIZATION=y +# CONFIG_KVM is not set # -# Virtualization +# Userspace I/O # -# CONFIG_KVM is not set +# CONFIG_UIO is not set # # File systems @@ -1272,6 +1250,7 @@ CONFIG_DNOTIFY=y # CONFIG_AUTOFS_FS is not set CONFIG_AUTOFS4_FS=y # CONFIG_FUSE_FS is not set +CONFIG_GENERIC_ACL=y # # CD-ROM/DVD Filesystems @@ -1299,7 +1278,7 @@ CONFIG_PROC_KCORE=y CONFIG_PROC_SYSCTL=y CONFIG_SYSFS=y CONFIG_TMPFS=y -# CONFIG_TMPFS_POSIX_ACL is not set +CONFIG_TMPFS_POSIX_ACL=y CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y CONFIG_RAMFS=y @@ -1349,7 +1328,6 @@ CONFIG_SUNRPC=y # CONFIG_NCP_FS is not set # CONFIG_CODA_FS is not set # CONFIG_AFS_FS is not set -# CONFIG_9P_FS is not set # # Partition Types @@ -1405,10 +1383,7 @@ CONFIG_NLS_UTF8=y # 
Distributed Lock Manager # # CONFIG_DLM is not set - -# -# Instrumentation Support -# +CONFIG_INSTRUMENTATION=y CONFIG_PROFILING=y CONFIG_OPROFILE=y CONFIG_KPROBES=y @@ -1418,7 +1393,7 @@ CONFIG_KPROBES=y # CONFIG_TRACE_IRQFLAGS_SUPPORT=y # CONFIG_PRINTK_TIME is not set -CONFIG_ENABLE_MUST_CHECK=y +# CONFIG_ENABLE_MUST_CHECK is not set CONFIG_MAGIC_SYSRQ=y CONFIG_UNUSED_SYMBOLS=y # CONFIG_DEBUG_FS is not set @@ -1426,15 +1401,17 @@ CONFIG_UNUSED_SYMBOLS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set CONFIG_DETECT_SOFTLOCKUP=y +# CONFIG_SCHED_DEBUG is not set # CONFIG_SCHEDSTATS is not set -# CONFIG_TIMER_STATS is not set -# CONFIG_DEBUG_SLAB is not set +CONFIG_TIMER_STATS=y +# CONFIG_SLUB_DEBUG_ON is not set # CONFIG_DEBUG_RT_MUTEXES is not set # CONFIG_RT_MUTEX_TESTER is not set # CONFIG_DEBUG_SPINLOCK is not set # CONFIG_DEBUG_MUTEXES is not set # CONFIG_DEBUG_LOCK_ALLOC is not set # CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set # CONFIG_DEBUG_KOBJECT is not set @@ -1444,7 +1421,6 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_LIST is not set # CONFIG_FRAME_POINTER is not set -# CONFIG_UNWIND_INFO is not set # CONFIG_FORCED_INLINING is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set @@ -1463,10 +1439,6 @@ CONFIG_DOUBLEFAULT=y # # CONFIG_KEYS is not set # CONFIG_SECURITY is not set - -# -# Cryptographic options -# # CONFIG_CRYPTO is not set # @@ -1477,6 +1449,7 @@ CONFIG_BITREVERSE=y # CONFIG_CRC16 is not set # CONFIG_CRC_ITU_T is not set CONFIG_CRC32=y +# CONFIG_CRC7 is not set # CONFIG_LIBCRC32C is not set CONFIG_ZLIB_INFLATE=y CONFIG_PLIST=y diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 06da59f6f837..dbe5e87e0d66 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_VM86) += vm86.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o +obj-$(CONFIG_MGEODE_LX) += geode.o obj-$(CONFIG_VMI) += vmi.o vmiclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index a574cd2c8b61..cacdd883bf2b 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -618,6 +618,8 @@ static int __init acpi_parse_sbf(struct acpi_table_header *table) #ifdef CONFIG_HPET_TIMER #include <asm/hpet.h> +static struct __initdata resource *hpet_res; + static int __init acpi_parse_hpet(struct acpi_table_header *table) { struct acpi_table_hpet *hpet_tbl; @@ -638,8 +640,42 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", hpet_tbl->id, hpet_address); + /* + * Allocate and initialize the HPET firmware resource for adding into + * the resource tree during the lateinit timeframe. + */ +#define HPET_RESOURCE_NAME_SIZE 9 + hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE); + + if (!hpet_res) + return 0; + + memset(hpet_res, 0, sizeof(*hpet_res)); + hpet_res->name = (void *)&hpet_res[1]; + hpet_res->flags = IORESOURCE_MEM; + snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE, "HPET %u", + hpet_tbl->sequence); + + hpet_res->start = hpet_address; + hpet_res->end = hpet_address + (1 * 1024) - 1; + return 0; } + +/* + * hpet_insert_resource inserts the HPET resources used into the resource + * tree. 
+ */ +static __init int hpet_insert_resource(void) +{ + if (!hpet_res) + return 1; + + return insert_resource(&iomem_resource, hpet_res); +} + +late_initcall(hpet_insert_resource); + #else #define acpi_parse_hpet NULL #endif @@ -950,14 +986,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { }, { .callback = force_acpi_ht, - .ident = "DELL GX240", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Dell Computer Corporation"), - DMI_MATCH(DMI_BOARD_NAME, "OptiPlex GX240"), - }, - }, - { - .callback = force_acpi_ht, .ident = "HP VISUALIZE NT Workstation", .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), diff --git a/arch/i386/kernel/acpi/sleep.c b/arch/i386/kernel/acpi/sleep.c index 4ee83577bf61..c42b5ab49deb 100644 --- a/arch/i386/kernel/acpi/sleep.c +++ b/arch/i386/kernel/acpi/sleep.c @@ -14,7 +14,7 @@ /* address in low memory of the wakeup routine. */ unsigned long acpi_wakeup_address = 0; -unsigned long acpi_video_flags; +unsigned long acpi_realmode_flags; extern char wakeup_start, wakeup_end; extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); @@ -68,9 +68,11 @@ static int __init acpi_sleep_setup(char *str) { while ((str != NULL) && (*str != '\0')) { if (strncmp(str, "s3_bios", 7) == 0) - acpi_video_flags = 1; + acpi_realmode_flags |= 1; if (strncmp(str, "s3_mode", 7) == 0) - acpi_video_flags |= 2; + acpi_realmode_flags |= 2; + if (strncmp(str, "s3_beep", 7) == 0) + acpi_realmode_flags |= 4; str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); @@ -80,9 +82,11 @@ static int __init acpi_sleep_setup(char *str) __setup("acpi_sleep=", acpi_sleep_setup); +/* Ouch, we want to delete this. We already have better version in userspace, in + s2ram from suspend.sf.net project */ static __init int reset_videomode_after_s3(struct dmi_system_id *d) { - acpi_video_flags |= 2; + acpi_realmode_flags |= 2; return 0; } diff --git a/arch/i386/kernel/acpi/wakeup.S b/arch/i386/kernel/acpi/wakeup.S index a2295a34b2c7..ed0a0f2c1597 100644 --- a/arch/i386/kernel/acpi/wakeup.S +++ b/arch/i386/kernel/acpi/wakeup.S @@ -13,6 +13,21 @@ # cs = 0x1234, eip = 0x05 # +#define BEEP \ + inb $97, %al; \ + outb %al, $0x80; \ + movb $3, %al; \ + outb %al, $97; \ + outb %al, $0x80; \ + movb $-74, %al; \ + outb %al, $67; \ + outb %al, $0x80; \ + movb $-119, %al; \ + outb %al, $66; \ + outb %al, $0x80; \ + movb $15, %al; \ + outb %al, $66; + ALIGN .align 4096 ENTRY(wakeup_start) @@ -31,6 +46,11 @@ wakeup_code: movw %cs, %ax movw %ax, %ds # Make ds:0 point to wakeup_start movw %ax, %ss + + testl $4, realmode_flags - wakeup_code + jz 1f + BEEP +1: mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board movw $0x0e00 + 'S', %fs:(0x12) @@ -41,7 +61,7 @@ wakeup_code: cmpl $0x12345678, %eax jne bogus_real_magic - testl $1, video_flags - wakeup_code + testl $1, realmode_flags - wakeup_code jz 1f lcall $0xc000,$3 movw %cs, %ax @@ -49,7 +69,7 @@ wakeup_code: movw %ax, %ss 1: - testl $2, video_flags - wakeup_code + testl $2, realmode_flags - wakeup_code jz 1f mov video_mode - wakeup_code, %ax call mode_set @@ -88,7 +108,11 @@ wakeup_code: cmpl $0x12345678, %eax jne bogus_real_magic - ljmpl $__KERNEL_CS,$wakeup_pmode_return + testl $8, realmode_flags - wakeup_code + jz 1f + BEEP +1: + ljmpl $__KERNEL_CS, $wakeup_pmode_return real_save_gdt: .word 0 .long 0 @@ -97,7 +121,8 @@ real_save_cr3: .long 0 real_save_cr4: .long 0 real_magic: .long 0 video_mode: .long 0 -video_flags: .long 0 +realmode_flags: .long 0 +beep_flags: .long 0 real_efer_save_restore: .long 0 
real_save_efer_edx: .long 0 real_save_efer_eax: .long 0 @@ -260,8 +285,8 @@ ENTRY(acpi_copy_wakeup_routine) movl saved_videomode, %edx movl %edx, video_mode - wakeup_start (%eax) - movl acpi_video_flags, %edx - movl %edx, video_flags - wakeup_start (%eax) + movl acpi_realmode_flags, %edx + movl %edx, realmode_flags - wakeup_start (%eax) movl $0x12345678, real_magic - wakeup_start (%eax) movl $0x12345678, saved_magic popl %ebx diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index d8cda14fff8b..c3750c2c4113 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -2,12 +2,17 @@ #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/list.h> +#include <linux/kprobes.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> #include <asm/alternative.h> #include <asm/sections.h> +#include <asm/pgtable.h> +#include <asm/mce.h> +#include <asm/nmi.h> -static int noreplace_smp = 0; -static int smp_alt_once = 0; -static int debug_alternative = 0; +#ifdef CONFIG_HOTPLUG_CPU +static int smp_alt_once; static int __init bootonly(char *str) { @@ -15,6 +20,11 @@ static int __init bootonly(char *str) return 1; } __setup("smp-alt-boot", bootonly); +#else +#define smp_alt_once 1 +#endif + +static int debug_alternative; static int __init debug_alt(char *str) { @@ -23,6 +33,8 @@ static int __init debug_alt(char *str) } __setup("debug-alternative", debug_alt); +static int noreplace_smp; + static int __init setup_noreplace_smp(char *str) { noreplace_smp = 1; @@ -144,7 +156,7 @@ static void nop_out(void *insns, unsigned int len) unsigned int noplen = len; if (noplen > ASM_NOP_MAX) noplen = ASM_NOP_MAX; - memcpy(insns, noptable[noplen], noplen); + text_poke(insns, noptable[noplen], noplen); insns += noplen; len -= noplen; } @@ -196,7 +208,7 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) continue; if (*ptr > text_end) continue; - **ptr = 0xf0; /* lock prefix */ + text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */ }; } @@ -354,10 +366,6 @@ void apply_paravirt(struct paravirt_patch_site *start, /* Pad the rest with nops */ nop_out(p->instr + used, p->len - used); } - - /* Sync to be conservative, in case we patched following - * instructions */ - sync_core(); } extern struct paravirt_patch_site __start_parainstructions[], __stop_parainstructions[]; @@ -367,6 +375,14 @@ void __init alternative_instructions(void) { unsigned long flags; + /* The patching is not fully atomic, so try to avoid local interruptions + that might execute the to be patched code. + Other CPUs are not running. */ + stop_nmi(); +#ifdef CONFIG_MCE + stop_mce(); +#endif + local_irq_save(flags); apply_alternatives(__alt_instructions, __alt_instructions_end); @@ -376,8 +392,6 @@ void __init alternative_instructions(void) #ifdef CONFIG_HOTPLUG_CPU if (num_possible_cpus() < 2) smp_alt_once = 1; -#else - smp_alt_once = 1; #endif #ifdef CONFIG_SMP @@ -401,4 +415,37 @@ void __init alternative_instructions(void) #endif apply_paravirt(__parainstructions, __parainstructions_end); local_irq_restore(flags); + + restart_nmi(); +#ifdef CONFIG_MCE + restart_mce(); +#endif +} + +/* + * Warning: + * When you use this code to patch more than one byte of an instruction + * you need to make sure that other CPUs cannot execute this code in parallel. + * Also no thread must be currently preempted in the middle of these instructions. 
+ * And on the local CPU you need to be protected again NMI or MCE handlers + * seeing an inconsistent instruction while you patch. + */ +void __kprobes text_poke(void *oaddr, unsigned char *opcode, int len) +{ + u8 *addr = oaddr; + if (!pte_write(*lookup_address((unsigned long)addr))) { + struct page *p[2] = { virt_to_page(addr), virt_to_page(addr+PAGE_SIZE) }; + addr = vmap(p, 2, VM_MAP, PAGE_KERNEL); + if (!addr) + return; + addr += ((unsigned long)oaddr) % PAGE_SIZE; + } + memcpy(addr, opcode, len); + sync_core(); + /* Not strictly needed, but can speed CPU recovery up. Ignore cross cacheline + case. */ + if (cpu_has_clflush) + asm("clflush (%0) " :: "r" (oaddr) : "memory"); + if (addr != oaddr) + vunmap(addr); } diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 67824f3bb974..bfc6cb7df7e7 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -263,6 +263,9 @@ static void lapic_timer_setup(enum clock_event_mode mode, v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); apic_write_around(APIC_LVTT, v); break; + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here */ + break; } local_irq_restore(flags); @@ -315,7 +318,7 @@ static void __devinit setup_APIC_timer(void) #define LAPIC_CAL_LOOPS (HZ/10) -static __initdata volatile int lapic_cal_loops = -1; +static __initdata int lapic_cal_loops = -1; static __initdata long lapic_cal_t1, lapic_cal_t2; static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; @@ -485,7 +488,7 @@ void __init setup_boot_APIC_clock(void) /* Let the interrupts run */ local_irq_enable(); - while(lapic_cal_loops <= LAPIC_CAL_LOOPS) + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) cpu_relax(); local_irq_disable(); @@ -521,6 +524,9 @@ void __init setup_boot_APIC_clock(void) */ if (nmi_watchdog != NMI_IO_APIC) lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + else + printk(KERN_WARNING "APIC timer registered as dummy," + " due to nmi_watchdog=1!\n"); } /* Setup the lapic or request the broadcast */ diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index 4112afe712b9..47001d50a083 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -222,6 +222,7 @@ #include <linux/capability.h> #include <linux/device.h> #include <linux/kernel.h> +#include <linux/freezer.h> #include <linux/smp.h> #include <linux/dmi.h> #include <linux/suspend.h> @@ -2311,7 +2312,6 @@ static int __init apm_init(void) remove_proc_entry("apm", NULL); return err; } - kapmd_task->flags |= PF_NOFREEZE; wake_up_process(kapmd_task); if (num_online_cpus() > 1 && !smp ) { diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 27a776c9044d..7288ac88d746 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -17,6 +17,13 @@ #include <asm/thread_info.h> #include <asm/elf.h> +#include <xen/interface/xen.h> + +#ifdef CONFIG_LGUEST_GUEST +#include <linux/lguest.h> +#include "../../../drivers/lguest/lg.h" +#endif + #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -59,6 +66,7 @@ void foo(void) OFFSET(TI_addr_limit, thread_info, addr_limit); OFFSET(TI_restart_block, thread_info, restart_block); OFFSET(TI_sysenter_return, thread_info, sysenter_return); + OFFSET(TI_cpu, thread_info, cpu); BLANK(); OFFSET(GDS_size, Xgt_desc_struct, size); @@ -115,4 +123,25 @@ void foo(void) OFFSET(PARAVIRT_iret, paravirt_ops, iret); OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); #endif + +#ifdef CONFIG_XEN + BLANK(); + 
OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); + OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); +#endif + +#ifdef CONFIG_LGUEST_GUEST + BLANK(); + OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); + OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); + OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); + OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3); + OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp); + OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc); + OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc); + OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt); + OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum); + OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); + OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); +#endif } diff --git a/arch/i386/kernel/cpu/Makefile b/arch/i386/kernel/cpu/Makefile index 0b6a8551e9e2..778396c78d65 100644 --- a/arch/i386/kernel/cpu/Makefile +++ b/arch/i386/kernel/cpu/Makefile @@ -9,7 +9,6 @@ obj-y += cyrix.o obj-y += centaur.o obj-y += transmeta.o obj-y += intel.o intel_cacheinfo.o addon_cpuid_features.o -obj-y += rise.o obj-y += nexgen.o obj-y += umc.o diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index 6f47eeeb93ea..c7ba455d5ac7 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -231,6 +231,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) switch (c->x86) { case 15: + /* Use K8 tuning for Fam10h and Fam11h */ + case 0x10: + case 0x11: set_bit(X86_FEATURE_K8, c->x86_capability); break; case 6: @@ -272,8 +275,12 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } #endif - if (cpuid_eax(0x80000000) >= 0x80000006) - num_cache_leaves = 3; + if (cpuid_eax(0x80000000) >= 0x80000006) { + if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000)) + num_cache_leaves = 4; + else + num_cache_leaves = 3; + } if (amd_apic_timer_broken()) set_bit(X86_FEATURE_LAPIC_TIMER_BROKEN, c->x86_capability); diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index e5419a9dec88..d506201d397c 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -606,7 +606,6 @@ extern int nsc_init_cpu(void); extern int amd_init_cpu(void); extern int centaur_init_cpu(void); extern int transmeta_init_cpu(void); -extern int rise_init_cpu(void); extern int nexgen_init_cpu(void); extern int umc_init_cpu(void); @@ -618,7 +617,6 @@ void __init early_cpu_init(void) amd_init_cpu(); centaur_init_cpu(); transmeta_init_cpu(); - rise_init_cpu(); nexgen_init_cpu(); umc_init_cpu(); early_cpu_detect(); diff --git a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c index 18c8b67ea3a7..6f846bee2103 100644 --- a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -665,8 +665,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) data->max_freq = perf->states[0].core_frequency * 1000; /* table init */ for (i=0; i<perf->state_count; i++) { - if (i>0 && perf->states[i].core_frequency == - perf->states[i-1].core_frequency) + if (i>0 && perf->states[i].core_frequency >= + data->freq_table[valid_states-1].frequency / 1000) continue; data->freq_table[valid_states].index = i; diff --git a/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c b/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c index 194144539a6f..461dabc4e495 100644 --- 
a/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c @@ -79,7 +79,7 @@ #include <linux/smp.h> #include <linux/cpufreq.h> #include <linux/pci.h> -#include <asm/processor.h> +#include <asm/processor-cyrix.h> #include <asm/errno.h> /* PCI config registers, all at F0 */ diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index e88d2fba156b..122d2d75aa9f 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -4,7 +4,7 @@ #include <linux/pci.h> #include <asm/dma.h> #include <asm/io.h> -#include <asm/processor.h> +#include <asm/processor-cyrix.h> #include <asm/timer.h> #include <asm/pci-direct.h> #include <asm/tsc.h> diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c index e5be819492ef..d5a456d27d82 100644 --- a/arch/i386/kernel/cpu/intel_cacheinfo.c +++ b/arch/i386/kernel/cpu/intel_cacheinfo.c @@ -4,7 +4,7 @@ * Changes: * Venkatesh Pallipadi : Adding cache identification through cpuid(4) * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. - * Andi Kleen : CPUID4 emulation on AMD. + * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. */ #include <linux/init.h> @@ -135,7 +135,7 @@ unsigned short num_cache_leaves; /* AMD doesn't have CPUID4. Emulate it here to report the same information to the user. This makes some assumptions about the machine: - No L3, L2 not shared, no SMT etc. that is currently true on AMD CPUs. + L2 not shared, no SMT etc. that is currently true on AMD CPUs. In theory the TLBs could be reported as fake type (they are in "dummy"). Maybe later */ @@ -159,13 +159,26 @@ union l2_cache { unsigned val; }; +union l3_cache { + struct { + unsigned line_size : 8; + unsigned lines_per_tag : 4; + unsigned assoc : 4; + unsigned res : 2; + unsigned size_encoded : 14; + }; + unsigned val; +}; + static const unsigned short assocs[] = { [1] = 1, [2] = 2, [4] = 4, [6] = 8, - [8] = 16, + [8] = 16, [0xa] = 32, [0xb] = 48, + [0xc] = 64, [0xf] = 0xffff // ?? - }; -static const unsigned char levels[] = { 1, 1, 2 }; -static const unsigned char types[] = { 1, 2, 3 }; +}; + +static const unsigned char levels[] = { 1, 1, 2, 3 }; +static const unsigned char types[] = { 1, 2, 3, 3 }; static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, union _cpuid4_leaf_ebx *ebx, @@ -175,37 +188,58 @@ static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, unsigned line_size, lines_per_tag, assoc, size_in_kb; union l1_cache l1i, l1d; union l2_cache l2; + union l3_cache l3; + union l1_cache *l1 = &l1d; eax->full = 0; ebx->full = 0; ecx->full = 0; cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val); - cpuid(0x80000006, &dummy, &dummy, &l2.val, &dummy); - - if (leaf > 2 || !l1d.val || !l1i.val || !l2.val) - return; - - eax->split.is_self_initializing = 1; - eax->split.type = types[leaf]; - eax->split.level = levels[leaf]; - eax->split.num_threads_sharing = 0; - eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; - - if (leaf <= 1) { - union l1_cache *l1 = leaf == 0 ? 
&l1d : &l1i; + cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val); + + switch (leaf) { + case 1: + l1 = &l1i; + case 0: + if (!l1->val) + return; assoc = l1->assoc; line_size = l1->line_size; lines_per_tag = l1->lines_per_tag; size_in_kb = l1->size_in_kb; - } else { + break; + case 2: + if (!l2.val) + return; assoc = l2.assoc; line_size = l2.line_size; lines_per_tag = l2.lines_per_tag; /* cpu_data has errata corrections for K7 applied */ size_in_kb = current_cpu_data.x86_cache_size; + break; + case 3: + if (!l3.val) + return; + assoc = l3.assoc; + line_size = l3.line_size; + lines_per_tag = l3.lines_per_tag; + size_in_kb = l3.size_encoded * 512; + break; + default: + return; } + eax->split.is_self_initializing = 1; + eax->split.type = types[leaf]; + eax->split.level = levels[leaf]; + if (leaf == 3) + eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1; + else + eax->split.num_threads_sharing = 0; + eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; + + if (assoc == 0xf) eax->split.is_fully_associative = 1; ebx->split.coherency_line_size = line_size - 1; @@ -239,8 +273,7 @@ static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_le return 0; } -/* will only be called once; __init is safe here */ -static int __init find_num_cache_leaves(void) +static int __cpuinit find_num_cache_leaves(void) { unsigned int eax, ebx, ecx, edx; union _cpuid4_leaf_eax cache_eax; @@ -710,7 +743,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) return retval; } -static void __cpuexit cache_remove_dev(struct sys_device * sys_dev) +static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) { unsigned int cpu = sys_dev->id; unsigned long i; diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c index 56cd485b127c..34c781eddee4 100644 --- a/arch/i386/kernel/cpu/mcheck/mce.c +++ b/arch/i386/kernel/cpu/mcheck/mce.c @@ -60,6 +60,20 @@ void mcheck_init(struct cpuinfo_x86 *c) } } +static unsigned long old_cr4 __initdata; + +void __init stop_mce(void) +{ + old_cr4 = read_cr4(); + clear_in_cr4(X86_CR4_MCE); +} + +void __init restart_mce(void) +{ + if (old_cr4 & X86_CR4_MCE) + set_in_cr4(X86_CR4_MCE); +} + static int __init mcheck_disable(char *str) { mce_disabled = 1; diff --git a/arch/i386/kernel/cpu/mcheck/non-fatal.c b/arch/i386/kernel/cpu/mcheck/non-fatal.c index 6b5d3518a1c0..bf39409b3838 100644 --- a/arch/i386/kernel/cpu/mcheck/non-fatal.c +++ b/arch/i386/kernel/cpu/mcheck/non-fatal.c @@ -57,7 +57,7 @@ static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); static void mce_work_fn(struct work_struct *work) { on_each_cpu(mce_checkregs, NULL, 1, 1); - schedule_delayed_work(&mce_work, MCE_RATE); + schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); } static int __init init_nonfatal_mce_checker(void) @@ -82,7 +82,7 @@ static int __init init_nonfatal_mce_checker(void) /* * Check for non-fatal errors every MCE_RATE s */ - schedule_delayed_work(&mce_work, MCE_RATE); + schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); printk(KERN_INFO "Machine check exception polling timer started.\n"); return 0; } diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c index 7ba7c3abd3a4..1203dc5ab87a 100644 --- a/arch/i386/kernel/cpu/mcheck/therm_throt.c +++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c @@ -134,19 +134,21 @@ static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, int err; sys_dev = get_cpu_sysdev(cpu); - 
mutex_lock(&therm_cpu_lock); switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: + mutex_lock(&therm_cpu_lock); err = thermal_throttle_add_dev(sys_dev); + mutex_unlock(&therm_cpu_lock); WARN_ON(err); break; case CPU_DEAD: case CPU_DEAD_FROZEN: + mutex_lock(&therm_cpu_lock); thermal_throttle_remove_dev(sys_dev); + mutex_unlock(&therm_cpu_lock); break; } - mutex_unlock(&therm_cpu_lock); return NOTIFY_OK; } diff --git a/arch/i386/kernel/cpu/mtrr/cyrix.c b/arch/i386/kernel/cpu/mtrr/cyrix.c index 1001f1e0fe6d..2287d4863a8a 100644 --- a/arch/i386/kernel/cpu/mtrr/cyrix.c +++ b/arch/i386/kernel/cpu/mtrr/cyrix.c @@ -3,6 +3,7 @@ #include <asm/mtrr.h> #include <asm/msr.h> #include <asm/io.h> +#include <asm/processor-cyrix.h> #include "mtrr.h" int arr3_protected; diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c index f6e46943e6ef..56f64e34829f 100644 --- a/arch/i386/kernel/cpu/mtrr/generic.c +++ b/arch/i386/kernel/cpu/mtrr/generic.c @@ -79,7 +79,7 @@ static void print_fixed(unsigned base, unsigned step, const mtrr_type*types) } /* Grab all of the MTRR state for this CPU into *state */ -void get_mtrr_state(void) +void __init get_mtrr_state(void) { unsigned int i; struct mtrr_var_range *vrs; diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index 75dc6d5214bc..c48b6fea5ab4 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -643,7 +643,7 @@ static struct sysdev_driver mtrr_sysdev_driver = { * initialized (i.e. before smp_init()). * */ -__init void mtrr_bp_init(void) +void __init mtrr_bp_init(void) { init_ifs(); diff --git a/arch/i386/kernel/cpu/mtrr/state.c b/arch/i386/kernel/cpu/mtrr/state.c index 7b39a2f954d9..c9014ca4a575 100644 --- a/arch/i386/kernel/cpu/mtrr/state.c +++ b/arch/i386/kernel/cpu/mtrr/state.c @@ -3,6 +3,7 @@ #include <asm/io.h> #include <asm/mtrr.h> #include <asm/msr.h> +#include <asm-i386/processor-cyrix.h> #include "mtrr.h" diff --git a/arch/i386/kernel/cpu/perfctr-watchdog.c b/arch/i386/kernel/cpu/perfctr-watchdog.c index 4d26d514c56f..4be488e73bee 100644 --- a/arch/i386/kernel/cpu/perfctr-watchdog.c +++ b/arch/i386/kernel/cpu/perfctr-watchdog.c @@ -325,7 +325,7 @@ static struct wd_ops k7_wd_ops = { .stop = single_msr_stop_watchdog, .perfctr = MSR_K7_PERFCTR0, .evntsel = MSR_K7_EVNTSEL0, - .checkbit = 1ULL<<63, + .checkbit = 1ULL<<47, }; /* Intel Model 6 (PPro+,P2,P3,P-M,Core1) */ @@ -346,7 +346,9 @@ static int setup_p6_watchdog(unsigned nmi_hz) perfctr_msr = MSR_P6_PERFCTR0; evntsel_msr = MSR_P6_EVNTSEL0; - wrmsrl(perfctr_msr, 0UL); + /* KVM doesn't implement this MSR */ + if (wrmsr_safe(perfctr_msr, 0, 0) < 0) + return 0; evntsel = P6_EVNTSEL_INT | P6_EVNTSEL_OS @@ -599,8 +601,8 @@ static struct wd_ops intel_arch_wd_ops = { .setup = setup_intel_arch_watchdog, .rearm = p6_rearm, .stop = single_msr_stop_watchdog, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .evntsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR1, + .evntsel = MSR_ARCH_PERFMON_EVENTSEL1, }; static void probe_nmi_watchdog(void) diff --git a/arch/i386/kernel/cpu/rise.c b/arch/i386/kernel/cpu/rise.c deleted file mode 100644 index 50076f22e90f..000000000000 --- a/arch/i386/kernel/cpu/rise.c +++ /dev/null @@ -1,52 +0,0 @@ -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/bitops.h> -#include <asm/processor.h> - -#include "cpu.h" - -static void __cpuinit init_rise(struct cpuinfo_x86 *c) -{ - printk("CPU: Rise iDragon"); - if (c->x86_model > 2) - printk(" II"); - printk("\n"); 
- - /* Unhide possibly hidden capability flags - The mp6 iDragon family don't have MSRs. - We switch on extra features with this cpuid weirdness: */ - __asm__ ( - "movl $0x6363452a, %%eax\n\t" - "movl $0x3231206c, %%ecx\n\t" - "movl $0x2a32313a, %%edx\n\t" - "cpuid\n\t" - "movl $0x63634523, %%eax\n\t" - "movl $0x32315f6c, %%ecx\n\t" - "movl $0x2333313a, %%edx\n\t" - "cpuid\n\t" : : : "eax", "ebx", "ecx", "edx" - ); - set_bit(X86_FEATURE_CX8, c->x86_capability); -} - -static struct cpu_dev rise_cpu_dev __cpuinitdata = { - .c_vendor = "Rise", - .c_ident = { "RiseRiseRise" }, - .c_models = { - { .vendor = X86_VENDOR_RISE, .family = 5, .model_names = - { - [0] = "iDragon", - [2] = "iDragon", - [8] = "iDragon II", - [9] = "iDragon II" - } - }, - }, - .c_init = init_rise, -}; - -int __init rise_init_cpu(void) -{ - cpu_devs[X86_VENDOR_RISE] = &rise_cpu_dev; - return 0; -} - diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index fc822a46897a..e60cddbc4cfb 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -10,6 +10,7 @@ #include <linux/efi.h> #include <linux/pfn.h> #include <linux/uaccess.h> +#include <linux/suspend.h> #include <asm/pgtable.h> #include <asm/page.h> @@ -320,6 +321,37 @@ static int __init request_standard_resources(void) subsys_initcall(request_standard_resources); +#if defined(CONFIG_PM) && defined(CONFIG_SOFTWARE_SUSPEND) +/** + * e820_mark_nosave_regions - Find the ranges of physical addresses that do not + * correspond to e820 RAM areas and mark the corresponding pages as nosave for + * hibernation. + * + * This function requires the e820 map to be sorted and without any + * overlapping entries and assumes the first e820 area to be RAM. + */ +void __init e820_mark_nosave_regions(void) +{ + int i; + unsigned long pfn; + + pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size); + for (i = 1; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (pfn < PFN_UP(ei->addr)) + register_nosave_region(pfn, PFN_UP(ei->addr)); + + pfn = PFN_DOWN(ei->addr + ei->size); + if (ei->type != E820_RAM) + register_nosave_region(PFN_UP(ei->addr), pfn); + + if (pfn >= max_low_pfn) + break; + } +} +#endif + void __init add_memory_region(unsigned long long start, unsigned long long size, int type) { diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c index a1808022ea19..2452c6fbe992 100644 --- a/arch/i386/kernel/efi.c +++ b/arch/i386/kernel/efi.c @@ -278,7 +278,7 @@ void efi_memmap_walk(efi_freemem_callback_t callback, void *arg) struct range { unsigned long start; unsigned long end; - } prev, curr; + } uninitialized_var(prev), curr; efi_memory_desc_t *md; unsigned long start, end; void *p; diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 3c3c220488c9..a714d6b43506 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -409,8 +409,6 @@ restore_nocheck_notrace: 1: INTERRUPT_RETURN .section .fixup,"ax" iret_exc: - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) pushl $0 # no error code pushl $do_iret_error jmp error_code @@ -1023,6 +1021,91 @@ ENTRY(kernel_thread_helper) CFI_ENDPROC ENDPROC(kernel_thread_helper) +#ifdef CONFIG_XEN +ENTRY(xen_hypervisor_callback) + CFI_STARTPROC + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + + /* Check to see if we got the event in the critical + region in xen_iret_direct, after we've reenabled + events and checked for pending events. This simulates + iret instruction's behaviour where it delivers a + pending interrupt when enabling interrupts. 
*/ + movl PT_EIP(%esp),%eax + cmpl $xen_iret_start_crit,%eax + jb 1f + cmpl $xen_iret_end_crit,%eax + jae 1f + + call xen_iret_crit_fixup + +1: mov %esp, %eax + call xen_evtchn_do_upcall + jmp ret_from_intr + CFI_ENDPROC +ENDPROC(xen_hypervisor_callback) + +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we fix up by reattempting the load, and zeroing the segment +# register if the load fails. +# Category 2 we fix up by jumping to do_iret_error. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by maintaining a status value in EAX. +ENTRY(xen_failsafe_callback) + CFI_STARTPROC + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl $1,%eax +1: mov 4(%esp),%ds +2: mov 8(%esp),%es +3: mov 12(%esp),%fs +4: mov 16(%esp),%gs + testl %eax,%eax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + lea 16(%esp),%esp + CFI_ADJUST_CFA_OFFSET -16 + jz 5f + addl $16,%esp + jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) +5: pushl $0 # EAX == 0 => Category 1 (Bad segment) + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + jmp ret_from_exception + CFI_ENDPROC + +.section .fixup,"ax" +6: xorl %eax,%eax + movl %eax,4(%esp) + jmp 1b +7: xorl %eax,%eax + movl %eax,8(%esp) + jmp 2b +8: xorl %eax,%eax + movl %eax,12(%esp) + jmp 3b +9: xorl %eax,%eax + movl %eax,16(%esp) + jmp 4b +.previous +.section __ex_table,"a" + .align 4 + .long 1b,6b + .long 2b,7b + .long 3b,8b + .long 4b,9b +.previous +ENDPROC(xen_failsafe_callback) + +#endif /* CONFIG_XEN */ + .section .rodata,"a" #include "syscall_table.S" diff --git a/arch/i386/kernel/geode.c b/arch/i386/kernel/geode.c new file mode 100644 index 000000000000..41e8aec4c61d --- /dev/null +++ b/arch/i386/kernel/geode.c @@ -0,0 +1,155 @@ +/* + * AMD Geode southbridge support code + * Copyright (C) 2006, Advanced Micro Devices, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. 
+ */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/ioport.h> +#include <linux/io.h> +#include <asm/msr.h> +#include <asm/geode.h> + +static struct { + char *name; + u32 msr; + int size; + u32 base; +} lbars[] = { + { "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 }, + { "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 }, + { "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 }, + { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 } +}; + +static void __init init_lbars(void) +{ + u32 lo, hi; + int i; + + for (i = 0; i < ARRAY_SIZE(lbars); i++) { + rdmsr(lbars[i].msr, lo, hi); + if (hi & 0x01) + lbars[i].base = lo & 0x0000ffff; + + if (lbars[i].base == 0) + printk(KERN_ERR "geode: Couldn't initialize '%s'\n", + lbars[i].name); + } +} + +int geode_get_dev_base(unsigned int dev) +{ + BUG_ON(dev >= ARRAY_SIZE(lbars)); + return lbars[dev].base; +} +EXPORT_SYMBOL_GPL(geode_get_dev_base); + +/* === GPIO API === */ + +void geode_gpio_set(unsigned int gpio, unsigned int reg) +{ + u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + + if (!base) + return; + + if (gpio < 16) + outl(1 << gpio, base + reg); + else + outl(1 << (gpio - 16), base + 0x80 + reg); +} +EXPORT_SYMBOL_GPL(geode_gpio_set); + +void geode_gpio_clear(unsigned int gpio, unsigned int reg) +{ + u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + + if (!base) + return; + + if (gpio < 16) + outl(1 << (gpio + 16), base + reg); + else + outl(1 << gpio, base + 0x80 + reg); +} +EXPORT_SYMBOL_GPL(geode_gpio_clear); + +int geode_gpio_isset(unsigned int gpio, unsigned int reg) +{ + u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + + if (!base) + return 0; + + if (gpio < 16) + return (inl(base + reg) & (1 << gpio)) ? 1 : 0; + else + return (inl(base + 0x80 + reg) & (1 << (gpio - 16))) ? 1 : 0; +} +EXPORT_SYMBOL_GPL(geode_gpio_isset); + +void geode_gpio_set_irq(unsigned int group, unsigned int irq) +{ + u32 lo, hi; + + if (group > 7 || irq > 15) + return; + + rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi); + + lo &= ~(0xF << (group * 4)); + lo |= (irq & 0xF) << (group * 4); + + wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi); +} +EXPORT_SYMBOL_GPL(geode_gpio_set_irq); + +void geode_gpio_setup_event(unsigned int gpio, int pair, int pme) +{ + u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + u32 offset, shift, val; + + if (gpio >= 24) + offset = GPIO_MAP_W; + else if (gpio >= 16) + offset = GPIO_MAP_Z; + else if (gpio >= 8) + offset = GPIO_MAP_Y; + else + offset = GPIO_MAP_X; + + shift = (gpio % 8) * 4; + + val = inl(base + offset); + + /* Clear whatever was there before */ + val &= ~(0xF << shift); + + /* And set the new value */ + + val |= ((pair & 7) << shift); + + /* Set the PME bit if this is a PME event */ + + if (pme) + val |= (1 << (shift + 3)); + + outl(val, base + offset); +} +EXPORT_SYMBOL_GPL(geode_gpio_setup_event); + +static int __init geode_southbridge_init(void) +{ + if (!is_geode()) + return -ENODEV; + + init_lbars(); + return 0; +} + +postcore_initcall(geode_southbridge_init); diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index f74dfc419b56..7c52b222207e 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -168,6 +168,12 @@ page_pde_offset = (__PAGE_OFFSET >> 20); .section .init.text,"ax",@progbits #endif + /* Do an early initialization of the fixmap area */ + movl $(swapper_pg_dir - __PAGE_OFFSET), %edx + movl $(swapper_pg_pmd - __PAGE_OFFSET), %eax + addl $0x007, %eax /* 0x007 = PRESENT+RW+USER */ + movl %eax, 4092(%edx) + #ifdef CONFIG_SMP ENTRY(startup_32_smp) cld @@ -504,9 +510,12 @@ ENTRY(_stext) /* * BSS 
section */ -.section ".bss.page_aligned","w" +.section ".bss.page_aligned","wa" + .align PAGE_SIZE_asm ENTRY(swapper_pg_dir) .fill 1024,4,0 +ENTRY(swapper_pg_pmd) + .fill 1024,4,0 ENTRY(empty_zero_page) .fill 4096,1,0 @@ -530,6 +539,8 @@ fault_msg: .ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n" .asciz "Stack: %p %p %p %p %p %p %p %p\n" +#include "../xen/xen-head.S" + /* * The IDT and GDT 'descriptors' are a strange 48-bit object * only used by the lidt and lgdt instructions. They are not diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c index 17d73459fc5f..533d4932bc79 100644 --- a/arch/i386/kernel/hpet.c +++ b/arch/i386/kernel/hpet.c @@ -5,6 +5,7 @@ #include <linux/init.h> #include <linux/sysdev.h> #include <linux/pm.h> +#include <linux/delay.h> #include <asm/hpet.h> #include <asm/io.h> @@ -187,6 +188,10 @@ static void hpet_set_mode(enum clock_event_mode mode, cfg &= ~HPET_TN_ENABLE; hpet_writel(cfg, HPET_T0_CFG); break; + + case CLOCK_EVT_MODE_RESUME: + hpet_enable_int(); + break; } } @@ -217,6 +222,7 @@ static struct clocksource clocksource_hpet = { .mask = HPET_MASK, .shift = HPET_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .resume = hpet_start_counter, }; /* @@ -226,7 +232,8 @@ int __init hpet_enable(void) { unsigned long id; uint64_t hpet_freq; - u64 tmp; + u64 tmp, start, now; + cycle_t t1; if (!is_hpet_capable()) return 0; @@ -273,6 +280,27 @@ int __init hpet_enable(void) /* Start the counter */ hpet_start_counter(); + /* Verify whether hpet counter works */ + t1 = read_hpet(); + rdtscll(start); + + /* + * We don't know the TSC frequency yet, but waiting for + * 200000 TSC cycles is safe: + * 4 GHz == 50us + * 1 GHz == 200us + */ + do { + rep_nop(); + rdtscll(now); + } while ((now - start) < 200000UL); + + if (t1 == read_hpet()) { + printk(KERN_WARNING + "HPET counter not counting. HPET disabled\n"); + goto out_nohpet; + } + /* Initialize and register HPET clocksource * * hpet period is in femto seconds per cycle @@ -291,7 +319,6 @@ int __init hpet_enable(void) clocksource_register(&clocksource_hpet); - if (id & HPET_ID_LEGSUP) { hpet_enable_int(); hpet_reserve_platform_timers(id); @@ -299,7 +326,7 @@ int __init hpet_enable(void) * Start hpet with the boot cpu mask and make it * global after the IO_APIC has been initialized. 
*/ - hpet_clockevent.cpumask =cpumask_of_cpu(0); + hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); clockevents_register_device(&hpet_clockevent); global_clock_event = &hpet_clockevent; return 1; @@ -524,68 +551,3 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } #endif - - -/* - * Suspend/resume part - */ - -#ifdef CONFIG_PM - -static int hpet_suspend(struct sys_device *sys_device, pm_message_t state) -{ - unsigned long cfg = hpet_readl(HPET_CFG); - - cfg &= ~(HPET_CFG_ENABLE|HPET_CFG_LEGACY); - hpet_writel(cfg, HPET_CFG); - - return 0; -} - -static int hpet_resume(struct sys_device *sys_device) -{ - unsigned int id; - - hpet_start_counter(); - - id = hpet_readl(HPET_ID); - - if (id & HPET_ID_LEGSUP) - hpet_enable_int(); - - return 0; -} - -static struct sysdev_class hpet_class = { - set_kset_name("hpet"), - .suspend = hpet_suspend, - .resume = hpet_resume, -}; - -static struct sys_device hpet_device = { - .id = 0, - .cls = &hpet_class, -}; - - -static __init int hpet_register_sysfs(void) -{ - int err; - - if (!is_hpet_capable()) - return 0; - - err = sysdev_class_register(&hpet_class); - - if (!err) { - err = sysdev_register(&hpet_device); - if (err) - sysdev_class_unregister(&hpet_class); - } - - return err; -} - -device_initcall(hpet_register_sysfs); - -#endif diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c index f8a3c4054c70..6d839f2f1b1a 100644 --- a/arch/i386/kernel/i8253.c +++ b/arch/i386/kernel/i8253.c @@ -3,18 +3,17 @@ * */ #include <linux/clockchips.h> -#include <linux/spinlock.h> +#include <linux/init.h> +#include <linux/interrupt.h> #include <linux/jiffies.h> -#include <linux/sysdev.h> #include <linux/module.h> -#include <linux/init.h> +#include <linux/spinlock.h> #include <asm/smp.h> #include <asm/delay.h> #include <asm/i8253.h> #include <asm/io.h> - -#include "io_ports.h" +#include <asm/timer.h> DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); @@ -41,26 +40,27 @@ static void init_pit_timer(enum clock_event_mode mode, case CLOCK_EVT_MODE_PERIODIC: /* binary, mode 2, LSB/MSB, ch 0 */ outb_p(0x34, PIT_MODE); - udelay(10); outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - udelay(10); outb(LATCH >> 8 , PIT_CH0); /* MSB */ break; - /* - * Avoid unnecessary state transitions, as it confuses - * Geode / Cyrix based boxen. - */ case CLOCK_EVT_MODE_SHUTDOWN: - if (evt->mode == CLOCK_EVT_MODE_UNUSED) - break; case CLOCK_EVT_MODE_UNUSED: - if (evt->mode == CLOCK_EVT_MODE_SHUTDOWN) - break; + if (evt->mode == CLOCK_EVT_MODE_PERIODIC || + evt->mode == CLOCK_EVT_MODE_ONESHOT) { + outb_p(0x30, PIT_MODE); + outb_p(0, PIT_CH0); + outb_p(0, PIT_CH0); + } + break; + case CLOCK_EVT_MODE_ONESHOT: /* One shot setup */ outb_p(0x38, PIT_MODE); - udelay(10); + break; + + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here */ break; } spin_unlock_irqrestore(&i8253_lock, flags); diff --git a/arch/i386/kernel/init_task.c b/arch/i386/kernel/init_task.c index cff95d10a4d8..d26fc063a760 100644 --- a/arch/i386/kernel/init_task.c +++ b/arch/i386/kernel/init_task.c @@ -42,5 +42,5 @@ EXPORT_SYMBOL(init_task); * per-CPU TSS segments. Threads are completely 'soft' on Linux, * no more per-task TSS's. 
*/ -DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS; +DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 7f8b7af2b95f..893df8280756 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -353,14 +353,6 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) # include <linux/slab.h> /* kmalloc() */ # include <linux/timer.h> /* time_after() */ -#ifdef CONFIG_BALANCED_IRQ_DEBUG -# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0) -# define Dprintk(x...) do { TDprintk(x); } while (0) -# else -# define TDprintk(x...) -# define Dprintk(x...) -# endif - #define IRQBALANCE_CHECK_ARCH -999 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ) #define MIN_BALANCED_IRQ_INTERVAL (HZ/2) @@ -443,7 +435,7 @@ static inline void balance_irq(int cpu, int irq) static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) { int i, j; - Dprintk("Rotating IRQs among CPUs.\n"); + for_each_online_cpu(i) { for (j = 0; j < NR_IRQS; j++) { if (!irq_desc[j].action) @@ -560,19 +552,11 @@ tryanothercpu: max_loaded = tmp_loaded; /* processor */ imbalance = (max_cpu_irq - min_cpu_irq) / 2; - Dprintk("max_loaded cpu = %d\n", max_loaded); - Dprintk("min_loaded cpu = %d\n", min_loaded); - Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq); - Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq); - Dprintk("load imbalance = %lu\n", imbalance); - /* if imbalance is less than approx 10% of max load, then * observe diminishing returns action. - quit */ - if (imbalance < (max_cpu_irq >> 3)) { - Dprintk("Imbalance too trivial\n"); + if (imbalance < (max_cpu_irq >> 3)) goto not_worth_the_effort; - } tryanotherirq: /* if we select an IRQ to move that can't go where we want, then @@ -629,9 +613,6 @@ tryanotherirq: cpus_and(tmp, target_cpu_mask, allowed_mask); if (!cpus_empty(tmp)) { - - Dprintk("irq = %d moved to cpu = %d\n", - selected_irq, min_loaded); /* mark for change destination */ set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); @@ -651,7 +632,6 @@ not_worth_the_effort: */ balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); - Dprintk("IRQ worth rotating not found\n"); return; } @@ -667,6 +647,7 @@ static int balanced_irq(void *unused) set_pending_irq(i, cpumask_of_cpu(0)); } + set_freezable(); for ( ; ; ) { time_remaining = schedule_timeout_interruptible(time_remaining); try_to_freeze(); @@ -1901,7 +1882,7 @@ __setup("no_timer_check", notimercheck); * - if this function detects that timer IRQs are defunct, then we fall * back to ISA timer IRQs */ -int __init timer_irq_works(void) +static int __init timer_irq_works(void) { unsigned long t1 = jiffies; diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index d2daf672f4a2..dd2b97fc00b2 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -21,7 +21,7 @@ #include <asm/apic.h> #include <asm/uaccess.h> -DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; +DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); EXPORT_PER_CPU_SYMBOL(irq_stat); DEFINE_PER_CPU(struct pt_regs *, irq_regs); @@ -149,15 +149,11 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs) #ifdef CONFIG_4KSTACKS -/* - * These should really be __section__(".bss.page_aligned") as well, but - * gcc's 3.0 and earlier don't handle that correctly. 
- */ static char softirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__aligned__(THREAD_SIZE))); + __attribute__((__section__(".bss.page_aligned"))); static char hardirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__aligned__(THREAD_SIZE))); + __attribute__((__section__(".bss.page_aligned"))); /* * allocate per-cpu stacks for hardirq and for softirq processing diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index dde828a333c3..448a50b1324c 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -35,6 +35,7 @@ #include <asm/cacheflush.h> #include <asm/desc.h> #include <asm/uaccess.h> +#include <asm/alternative.h> void jprobe_return_end(void); @@ -169,16 +170,12 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) void __kprobes arch_arm_kprobe(struct kprobe *p) { - *p->addr = BREAKPOINT_INSTRUCTION; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); } void __kprobes arch_disarm_kprobe(struct kprobe *p) { - *p->addr = p->opcode; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + text_poke(p->addr, &p->opcode, 1); } void __kprobes arch_remove_kprobe(struct kprobe *p) diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index fba121f7973f..99beac7f96ce 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -295,7 +295,7 @@ static unsigned int last_irq_sums [NR_CPUS], alert_counter [NR_CPUS]; -void touch_nmi_watchdog (void) +void touch_nmi_watchdog(void) { if (nmi_watchdog > 0) { unsigned cpu; @@ -304,8 +304,10 @@ void touch_nmi_watchdog (void) * Just reset the alert counters, (other CPUs might be * spinning on locks we hold): */ - for_each_present_cpu (cpu) - alert_counter[cpu] = 0; + for_each_present_cpu(cpu) { + if (alert_counter[cpu]) + alert_counter[cpu] = 0; + } } /* @@ -351,7 +353,7 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) * Take the local apic timer and PIT/HPET into account. 
We don't * know which one is active, when we have highres/dyntick on */ - sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0); + sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_cpu(cpu).irqs[0]; /* if the none of the timers isn't firing, this cpu isn't doing much */ if (!touched && last_irq_sums[cpu] == sum) { diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index faab09abca5e..ea962c0667d5 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -124,20 +124,28 @@ unsigned paravirt_patch_ignore(unsigned len) return len; } +struct branch { + unsigned char opcode; + u32 delta; +} __attribute__((packed)); + unsigned paravirt_patch_call(void *target, u16 tgt_clobbers, void *site, u16 site_clobbers, unsigned len) { unsigned char *call = site; unsigned long delta = (unsigned long)target - (unsigned long)(call+5); + struct branch b; if (tgt_clobbers & ~site_clobbers) return len; /* target would clobber too much for this site */ if (len < 5) return len; /* call too long for patch site */ - *call++ = 0xe8; /* call */ - *(unsigned long *)call = delta; + b.opcode = 0xe8; /* call */ + b.delta = delta; + BUILD_BUG_ON(sizeof(b) != 5); + text_poke(call, (unsigned char *)&b, 5); return 5; } @@ -146,12 +154,14 @@ unsigned paravirt_patch_jmp(void *target, void *site, unsigned len) { unsigned char *jmp = site; unsigned long delta = (unsigned long)target - (unsigned long)(jmp+5); + struct branch b; if (len < 5) return len; /* call too long for patch site */ - *jmp++ = 0xe9; /* jmp */ - *(unsigned long *)jmp = delta; + b.opcode = 0xe9; /* jmp */ + b.delta = delta; + text_poke(jmp, (unsigned char *)&b, 5); return 5; } @@ -228,6 +238,41 @@ static int __init print_banner(void) } core_initcall(print_banner); +static struct resource reserve_ioports = { + .start = 0, + .end = IO_SPACE_LIMIT, + .name = "paravirt-ioport", + .flags = IORESOURCE_IO | IORESOURCE_BUSY, +}; + +static struct resource reserve_iomem = { + .start = 0, + .end = -1, + .name = "paravirt-iomem", + .flags = IORESOURCE_MEM | IORESOURCE_BUSY, +}; + +/* + * Reserve the whole legacy IO space to prevent any legacy drivers + * from wasting time probing for their hardware. This is a fairly + * brute-force approach to disabling all non-virtual drivers. + * + * Note that this must be called very early to have any effect. 
+ */ +int paravirt_disable_iospace(void) +{ + int ret; + + ret = request_resource(&ioport_resource, &reserve_ioports); + if (ret == 0) { + ret = request_resource(&iomem_resource, &reserve_iomem); + if (ret) + release_resource(&reserve_ioports); + } + + return ret; +} + struct paravirt_ops paravirt_ops = { .name = "bare hardware", .paravirt_enabled = 0, @@ -267,7 +312,7 @@ struct paravirt_ops paravirt_ops = { .write_msr = native_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, - .get_scheduled_cycles = native_read_tsc, + .sched_clock = native_sched_clock, .get_cpu_khz = native_calculate_cpu_khz, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 06dfa65ad180..84664710b784 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -300,6 +300,7 @@ early_param("idle", idle_setup); void show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; + unsigned long d0, d1, d2, d3, d6, d7; printk("\n"); printk("Pid: %d, comm: %20s\n", current->pid, current->comm); @@ -324,6 +325,17 @@ void show_regs(struct pt_regs * regs) cr3 = read_cr3(); cr4 = read_cr4_safe(); printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); + + get_debugreg(d0, 0); + get_debugreg(d1, 1); + get_debugreg(d2, 2); + get_debugreg(d3, 3); + printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", + d0, d1, d2, d3); + get_debugreg(d6, 6); + get_debugreg(d7, 7); + printk("DR6: %08lx DR7: %08lx\n", d6, d7); + show_trace(NULL, regs, ®s->esp); } @@ -538,8 +550,31 @@ int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) return 1; } -static noinline void __switch_to_xtra(struct task_struct *next_p, - struct tss_struct *tss) +#ifdef CONFIG_SECCOMP +void hard_disable_TSC(void) +{ + write_cr4(read_cr4() | X86_CR4_TSD); +} +void disable_TSC(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. + */ + hard_disable_TSC(); + preempt_enable(); +} +void hard_enable_TSC(void) +{ + write_cr4(read_cr4() & ~X86_CR4_TSD); +} +#endif /* CONFIG_SECCOMP */ + +static noinline void +__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + struct tss_struct *tss) { struct thread_struct *next; @@ -555,6 +590,17 @@ static noinline void __switch_to_xtra(struct task_struct *next_p, set_debugreg(next->debugreg[7], 7); } +#ifdef CONFIG_SECCOMP + if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ + test_tsk_thread_flag(next_p, TIF_NOTSC)) { + /* prev and next are different */ + if (test_tsk_thread_flag(next_p, TIF_NOTSC)) + hard_disable_TSC(); + else + hard_enable_TSC(); + } +#endif + if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { /* * Disable the bitmap via an invalid offset. We still cache @@ -586,33 +632,6 @@ static noinline void __switch_to_xtra(struct task_struct *next_p, } /* - * This function selects if the context switch from prev to next - * has to tweak the TSC disable bit in the cr4. - */ -static inline void disable_tsc(struct task_struct *prev_p, - struct task_struct *next_p) -{ - struct thread_info *prev, *next; - - /* - * gcc should eliminate the ->thread_info dereference if - * has_secure_computing returns 0 at compile time (SECCOMP=n). 
- */ - prev = task_thread_info(prev_p); - next = task_thread_info(next_p); - - if (has_secure_computing(prev) || has_secure_computing(next)) { - /* slow path here */ - if (has_secure_computing(prev) && - !has_secure_computing(next)) { - write_cr4(read_cr4() & ~X86_CR4_TSD); - } else if (!has_secure_computing(prev) && - has_secure_computing(next)) - write_cr4(read_cr4() | X86_CR4_TSD); - } -} - -/* * switch_to(x,yn) should switch tasks from x to y. * * We fsave/fwait so that an exception goes off at the right time @@ -689,11 +708,9 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas /* * Now maybe handle debug registers and/or IO bitmaps */ - if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW) - || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))) - __switch_to_xtra(next_p, tss); - - disable_tsc(prev_p, next_p); + if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || + task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) + __switch_to_xtra(prev_p, next_p, tss); /* * Leave lazy mode, flushing any hypercalls made here. diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index 0c0ceec5de00..0c8f00e69c4d 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -164,14 +164,22 @@ static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_ u32 *desc; unsigned long base; - down(&child->mm->context.sem); - desc = child->mm->context.ldt + (seg & ~7); - base = (desc[0] >> 16) | ((desc[1] & 0xff) << 16) | (desc[1] & 0xff000000); + seg &= ~7UL; - /* 16-bit code segment? */ - if (!((desc[1] >> 22) & 1)) - addr &= 0xffff; - addr += base; + down(&child->mm->context.sem); + if (unlikely((seg >> 3) >= child->mm->context.size)) + addr = -1L; /* bogus selector, access would fault */ + else { + desc = child->mm->context.ldt + seg; + base = ((desc[0] >> 16) | + ((desc[1] & 0xff) << 16) | + (desc[1] & 0xff000000)); + + /* 16-bit code segment? */ + if (!((desc[1] >> 22) & 1)) + addr &= 0xffff; + addr += base; + } up(&child->mm->context.sem); } return addr; @@ -358,17 +366,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) switch (request) { /* when I and D space are separate, these will need to be fixed. */ case PTRACE_PEEKTEXT: /* read word at location addr. */ - case PTRACE_PEEKDATA: { - unsigned long tmp; - int copied; - - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); - ret = -EIO; - if (copied != sizeof(tmp)) - break; - ret = put_user(tmp, datap); + case PTRACE_PEEKDATA: + ret = generic_ptrace_peekdata(child, addr, data); break; - } /* read the word at location addr in the USER area. */ case PTRACE_PEEKUSR: { @@ -395,10 +395,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) /* when I and D space are separate, this will have to be fixed. */ case PTRACE_POKETEXT: /* write the word at location addr. 
*/ case PTRACE_POKEDATA: - ret = 0; - if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data)) - break; - ret = -EIO; + ret = generic_ptrace_pokedata(child, addr, data); break; case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c index 5513f8d5b5be..0d796248866c 100644 --- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -113,6 +113,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"), }, }, + { /* Handle problems with rebooting on Dell Optiplex 745's SFF*/ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 745", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), + DMI_MATCH(DMI_BOARD_NAME, "0WF810"), + }, + }, { /* Handle problems with rebooting on Dell 2400's */ .callback = set_bios_reboot, .ident = "Dell PowerEdge 2400", diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 2d61e65eeb50..d474cd639bcb 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -273,18 +273,18 @@ unsigned long __init find_max_low_pfn(void) printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); if (max_pfn > MAX_NONPAE_PFN) - printk(KERN_WARNING "Use a PAE enabled kernel.\n"); + printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); else printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); max_pfn = MAXMEM_PFN; #else /* !CONFIG_HIGHMEM */ -#ifndef CONFIG_X86_PAE +#ifndef CONFIG_HIGHMEM64G if (max_pfn > MAX_NONPAE_PFN) { max_pfn = MAX_NONPAE_PFN; printk(KERN_WARNING "Warning only 4GB will be used.\n"); - printk(KERN_WARNING "Use a PAE enabled kernel.\n"); + printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); } -#endif /* !CONFIG_X86_PAE */ +#endif /* !CONFIG_HIGHMEM64G */ #endif /* !CONFIG_HIGHMEM */ } else { if (highmem_pages == -1) @@ -466,7 +466,7 @@ void __init setup_bootmem_allocator(void) * * This should all compile down to nothing when NUMA is off. */ -void __init remapped_pgdat_init(void) +static void __init remapped_pgdat_init(void) { int nid; @@ -601,6 +601,8 @@ void __init setup_arch(char **cmdline_p) * NOTE: at this point the bootmem allocator is fully available. */ + paravirt_post_allocator_init(); + dmi_scan_machine(); #ifdef CONFIG_X86_GENERICARCH @@ -638,6 +640,7 @@ void __init setup_arch(char **cmdline_p) #endif e820_register_memory(); + e820_mark_nosave_regions(); #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index d574e38f0f77..f5dd85656c18 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -199,6 +199,13 @@ asmlinkage int sys_sigreturn(unsigned long __unused) return eax; badframe: + if (show_unhandled_signals && printk_ratelimit()) + printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx" + " esp:%lx oeax:%lx\n", + current->pid > 1 ? 
KERN_INFO : KERN_EMERG, + current->comm, current->pid, frame, regs->eip, + regs->esp, regs->orig_eax); + force_sig(SIGSEGV, current); return 0; } diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 6299c080f6e2..2d35d8502029 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -22,6 +22,7 @@ #include <asm/mtrr.h> #include <asm/tlbflush.h> +#include <asm/mmu_context.h> #include <mach_apic.h> /* @@ -249,13 +250,13 @@ static unsigned long flush_va; static DEFINE_SPINLOCK(tlbstate_lock); /* - * We cannot call mmdrop() because we are in interrupt context, + * We cannot call mmdrop() because we are in interrupt context, * instead update mm->cpu_vm_mask. * * We need to reload %cr3 since the page tables may be going * away from under us.. */ -static inline void leave_mm (unsigned long cpu) +void leave_mm(unsigned long cpu) { if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) BUG(); diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 0b2954534b8e..e4f61d1c6248 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -148,7 +148,7 @@ void __init smp_alloc_memory(void) * a given CPU */ -static void __cpuinit smp_store_cpu_info(int id) +void __cpuinit smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = cpu_data + id; @@ -308,8 +308,7 @@ cpumask_t cpu_coregroup_map(int cpu) /* representing cpus for which sibling maps can be computed */ static cpumask_t cpu_sibling_setup_map; -static inline void -set_cpu_sibling_map(int cpu) +void __cpuinit set_cpu_sibling_map(int cpu) { int i; struct cpuinfo_x86 *c = cpu_data; @@ -1144,8 +1143,7 @@ void __init native_smp_prepare_boot_cpu(void) } #ifdef CONFIG_HOTPLUG_CPU -static void -remove_siblinginfo(int cpu) +void remove_siblinginfo(int cpu) { int sibling; struct cpuinfo_x86 *c = cpu_data; diff --git a/arch/i386/kernel/smpcommon.c b/arch/i386/kernel/smpcommon.c index 1868ae18eb4d..bbfe85a0f699 100644 --- a/arch/i386/kernel/smpcommon.c +++ b/arch/i386/kernel/smpcommon.c @@ -47,7 +47,7 @@ int smp_call_function(void (*func) (void *info), void *info, int nonatomic, EXPORT_SYMBOL(smp_call_function); /** - * smp_call_function_single - Run a function on another CPU + * smp_call_function_single - Run a function on a specific CPU * @cpu: The target CPU. Cannot be the calling CPU. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. 
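A minimal usage sketch of smp_call_function_single() as it behaves after the next hunk, which makes a call targeting the local CPU run the function directly (with interrupts disabled) instead of warning and returning -EBUSY. This is illustration only, not part of the patch; the callback, counter, and caller names are hypothetical.

#include <linux/smp.h>
#include <asm/atomic.h>

/* Hypothetical callback: must be fast and non-blocking. */
static void bump_counter(void *info)
{
	atomic_inc((atomic_t *)info);
}

/*
 * Hypothetical caller: nonatomic = 0, wait = 1 blocks until @cpu has run
 * the callback.  With the change in the hunk below, cpu == smp_processor_id()
 * is no longer an error; the callback simply runs locally with IRQs disabled.
 */
static void poke_cpu(int cpu, atomic_t *counter)
{
	smp_call_function_single(cpu, bump_counter, counter, 0, 1);
}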
@@ -66,9 +66,11 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int ret; int me = get_cpu(); if (cpu == me) { - WARN_ON(1); + local_irq_disable(); + func(info); + local_irq_enable(); put_cpu(); - return -EBUSY; + return 0; } ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait); diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index bf6adce52267..8344c70adf61 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -323,3 +323,4 @@ ENTRY(sys_call_table) .long sys_signalfd .long sys_timerfd .long sys_eventfd + .long sys_fallocate diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index ff4ee6f3326b..6deb159d08e0 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -336,7 +336,9 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk) int in_gate_area(struct task_struct *task, unsigned long addr) { - return 0; + const struct vm_area_struct *vma = get_gate_vma(task); + + return vma && addr >= vma->vm_start && addr < vma->vm_end; } int in_gate_area_no_task(unsigned long addr) diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index a665df61f08c..19a6c678d02e 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -207,55 +207,9 @@ unsigned long read_persistent_clock(void) return retval; } -static void sync_cmos_clock(unsigned long dummy); - -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); -int no_sync_cmos_clock; - -static void sync_cmos_clock(unsigned long dummy) -{ - struct timeval now, next; - int fail = 1; - - /* - * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - * This code is run on a timer. If the clock is set, that timer - * may not expire at the correct time. Thus, we adjust... - */ - if (!ntp_synced()) - /* - * Not synced, exit, do not restart a timer (if one is - * running, let it run out). 
- */ - return; - - do_gettimeofday(&now); - if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && - now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) - fail = set_rtc_mmss(now.tv_sec); - - next.tv_usec = USEC_AFTER - now.tv_usec; - if (next.tv_usec <= 0) - next.tv_usec += USEC_PER_SEC; - - if (!fail) - next.tv_sec = 659; - else - next.tv_sec = 0; - - if (next.tv_usec >= USEC_PER_SEC) { - next.tv_sec++; - next.tv_usec -= USEC_PER_SEC; - } - mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next)); -} - -void notify_arch_cmos_timer(void) +int update_persistent_clock(struct timespec now) { - if (!no_sync_cmos_clock) - mod_timer(&sync_cmos_timer, jiffies + 1); + return set_rtc_mmss(now.tv_sec); } extern void (*late_time_init)(void); diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 90da0575fcff..cfffe3dd9e83 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -41,6 +41,10 @@ #include <linux/mca.h> #endif +#if defined(CONFIG_EDAC) +#include <linux/edac.h> +#endif + #include <asm/processor.h> #include <asm/system.h> #include <asm/io.h> @@ -148,7 +152,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (!stack) { unsigned long dummy; stack = &dummy; - if (task && task != current) + if (task != current) stack = (unsigned long *)task->thread.esp; } @@ -207,6 +211,7 @@ static void print_trace_address(void *data, unsigned long addr) { printk("%s [<%08lx>] ", (char *)data, addr); print_symbol("%s\n", addr); + touch_nmi_watchdog(); } static struct stacktrace_ops print_trace_ops = { @@ -390,7 +395,7 @@ void die(const char * str, struct pt_regs * regs, long err) unsigned long esp; unsigned short ss; - report_bug(regs->eip); + report_bug(regs->eip, regs); printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); #ifdef CONFIG_PREEMPT @@ -433,6 +438,7 @@ void die(const char * str, struct pt_regs * regs, long err) bust_spinlocks(0); die.lock_owner = -1; + add_taint(TAINT_DIE); spin_unlock_irqrestore(&die.lock, flags); if (!regs) @@ -517,10 +523,12 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ } -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ fastcall void do_##name(struct pt_regs * regs, long error_code) \ { \ siginfo_t info; \ + if (irq) \ + local_irq_enable(); \ info.si_signo = signr; \ info.si_errno = 0; \ info.si_code = sicode; \ @@ -560,13 +568,13 @@ DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) #endif DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip) +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0) DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR(12, SIGBUS, "stack segment", stack_segment) -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) -DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0) +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) +DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) fastcall void __kprobes do_general_protection(struct pt_regs * regs, long error_code) @@ -610,6 +618,13 @@ fastcall void __kprobes 
do_general_protection(struct pt_regs * regs, current->thread.error_code = error_code; current->thread.trap_no = 13; + if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && + printk_ratelimit()) + printk(KERN_INFO + "%s[%d] general protection eip:%lx esp:%lx error:%lx\n", + current->comm, current->pid, + regs->eip, regs->esp, error_code); + force_sig(SIGSEGV, current); return; @@ -635,6 +650,14 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs) printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " "CPU %d.\n", reason, smp_processor_id()); printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); + +#if defined(CONFIG_EDAC) + if(edac_handler_set()) { + edac_atomic_assert_error(); + return; + } +#endif + if (panic_on_unrecovered_nmi) panic("NMI: Not continuing"); @@ -752,6 +775,8 @@ static __kprobes void default_do_nmi(struct pt_regs * regs) reassert_nmi(); } +static int ignore_nmis; + fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) { int cpu; @@ -762,11 +787,24 @@ fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) ++nmi_count(cpu); - default_do_nmi(regs); + if (!ignore_nmis) + default_do_nmi(regs); nmi_exit(); } +void stop_nmi(void) +{ + acpi_nmi_disable(); + ignore_nmis++; +} + +void restart_nmi(void) +{ + ignore_nmis--; + acpi_nmi_enable(); +} + #ifdef CONFIG_KPROBES fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code) { @@ -1053,6 +1091,7 @@ asmlinkage void math_state_restore(void) thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ tsk->fpu_counter++; } +EXPORT_SYMBOL_GPL(math_state_restore); #ifndef CONFIG_MATH_EMULATION diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index ea63a30ca3e8..debd7dbb4158 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -27,6 +27,7 @@ static int tsc_enabled; * an extra value to store the TSC freq */ unsigned int tsc_khz; +EXPORT_SYMBOL_GPL(tsc_khz); int tsc_disable; @@ -58,10 +59,11 @@ __setup("notsc", tsc_setup); */ static int tsc_unstable; -static inline int check_tsc_unstable(void) +int check_tsc_unstable(void) { return tsc_unstable; } +EXPORT_SYMBOL_GPL(check_tsc_unstable); /* Accellerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) @@ -84,7 +86,7 @@ static inline int check_tsc_unstable(void) * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static unsigned long cyc2ns_scale __read_mostly; +unsigned long cyc2ns_scale __read_mostly; #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ @@ -93,15 +95,10 @@ static inline void set_cyc2ns_scale(unsigned long cpu_khz) cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; } -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - /* * Scheduler clock - returns current time in nanosec units. 
*/ -unsigned long long sched_clock(void) +unsigned long long native_sched_clock(void) { unsigned long long this_offset; @@ -118,12 +115,24 @@ unsigned long long sched_clock(void) return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); /* read the Time Stamp Counter: */ - get_scheduled_cycles(this_offset); + rdtscll(this_offset); /* return the value in ns */ return cycles_2_ns(this_offset); } +/* We need to define a real function for sched_clock, to override the + weak default version */ +#ifdef CONFIG_PARAVIRT +unsigned long long sched_clock(void) +{ + return paravirt_sched_clock(); +} +#else +unsigned long long sched_clock(void) + __attribute__((alias("native_sched_clock"))); +#endif + unsigned long native_calculate_cpu_khz(void) { unsigned long long start, end; diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c index c12720d7cbc5..72042bb7ec94 100644 --- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -362,7 +362,7 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) } #endif -static void vmi_allocate_pt(u32 pfn) +static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn) { vmi_set_page_type(pfn, VMI_PAGE_L1); vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); @@ -891,7 +891,7 @@ static inline int __init activate_vmi(void) paravirt_ops.setup_boot_clock = vmi_time_bsp_init; paravirt_ops.setup_secondary_clock = vmi_time_ap_init; #endif - paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles; + paravirt_ops.sched_clock = vmi_sched_clock; paravirt_ops.get_cpu_khz = vmi_cpu_khz; /* We have true wallclock functions; disable CMOS clock sync */ diff --git a/arch/i386/kernel/vmiclock.c b/arch/i386/kernel/vmiclock.c index 26a37f8a8762..b1b5ab08b26e 100644 --- a/arch/i386/kernel/vmiclock.c +++ b/arch/i386/kernel/vmiclock.c @@ -32,6 +32,7 @@ #include <asm/apicdef.h> #include <asm/apic.h> #include <asm/timer.h> +#include <asm/i8253.h> #include <irq_vectors.h> #include "io_ports.h" @@ -64,10 +65,10 @@ int vmi_set_wallclock(unsigned long now) return 0; } -/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */ -unsigned long long vmi_get_sched_cycles(void) +/* paravirt_ops.sched_clock = vmi_sched_clock */ +unsigned long long vmi_sched_clock(void) { - return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE); + return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); } /* paravirt_ops.get_cpu_khz = vmi_cpu_khz */ @@ -142,6 +143,7 @@ static void vmi_timer_set_mode(enum clock_event_mode mode, switch (mode) { case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_RESUME: break; case CLOCK_EVT_MODE_PERIODIC: cycles_per_hz = vmi_timer_ops.get_cycle_frequency(); diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index aa87b06c7c82..7d72cce00529 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -60,7 +60,9 @@ SECTIONS __stop___ex_table = .; } - BUG_TABLE + NOTES :text :note + + BUG_TABLE :text . = ALIGN(4); .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { @@ -88,6 +90,7 @@ SECTIONS . = ALIGN(4096); .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.page_aligned) *(.data.idt) } @@ -180,6 +183,7 @@ SECTIONS .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { __per_cpu_start = .; *(.data.percpu) + *(.data.percpu.shared_aligned) __per_cpu_end = .; } . 
= ALIGN(4096); @@ -206,6 +210,4 @@ SECTIONS STABS_DEBUG DWARF_DEBUG - - NOTES } diff --git a/arch/i386/kernel/vsyscall-note.S b/arch/i386/kernel/vsyscall-note.S index d4b5be4f3d5f..07c0daf78237 100644 --- a/arch/i386/kernel/vsyscall-note.S +++ b/arch/i386/kernel/vsyscall-note.S @@ -3,23 +3,43 @@ * Here we can supply some information useful to userland. */ -#include <linux/uts.h> #include <linux/version.h> +#include <linux/elfnote.h> -#define ASM_ELF_NOTE_BEGIN(name, flags, vendor, type) \ - .section name, flags; \ - .balign 4; \ - .long 1f - 0f; /* name length */ \ - .long 3f - 2f; /* data length */ \ - .long type; /* note type */ \ -0: .asciz vendor; /* vendor name */ \ -1: .balign 4; \ -2: +/* Ideally this would use UTS_NAME, but using a quoted string here + doesn't work. Remember to change this when changing the + kernel's name. */ +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END -#define ASM_ELF_NOTE_END \ -3: .balign 4; /* pad out section */ \ - .previous +#ifdef CONFIG_XEN +/* + * Add a special note telling glibc's dynamic linker a fake hardware + * flavor that it will use to choose the search path for libraries in the + * same way it uses real hardware capabilities like "mmx". + * We supply "nosegneg" as the fake capability, to indicate that we + * do not like negative offsets in instructions using segment overrides, + * since we implement those inefficiently. This makes it possible to + * install libraries optimized to avoid those access patterns in someplace + * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file + * corresponding to the bits here is needed to make ldconfig work right. + * It should contain: + * hwcap 1 nosegneg + * to match the mapping of bit to name that we give here. + * + * At runtime, the fake hardware feature will be considered to be present + * if its bit is set in the mask word. So, we start with the mask 0, and + * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen. + */ - ASM_ELF_NOTE_BEGIN(".note.kernel-version", "a", UTS_SYSNAME, 0) - .long LINUX_VERSION_CODE - ASM_ELF_NOTE_END +#include "../xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */ + + .globl VDSO_NOTE_MASK +ELFNOTE_START(GNU, 2, "a") + .long 1 /* ncaps */ +VDSO_NOTE_MASK: + .long 0 /* mask */ + .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */ +ELFNOTE_END +#endif diff --git a/arch/i386/lib/Makefile b/arch/i386/lib/Makefile index 22d8ac5815f0..4d105fdfe817 100644 --- a/arch/i386/lib/Makefile +++ b/arch/i386/lib/Makefile @@ -4,7 +4,7 @@ lib-y = checksum.o delay.o usercopy.o getuser.o putuser.o memcpy.o strstr.o \ - bitops.o semaphore.o + bitops.o semaphore.o string.o lib-$(CONFIG_X86_USE_3DNOW) += mmx.o diff --git a/arch/i386/lib/string.c b/arch/i386/lib/string.c new file mode 100644 index 000000000000..2c773fefa3dd --- /dev/null +++ b/arch/i386/lib/string.c @@ -0,0 +1,257 @@ +/* + * Most of the string-functions are rather heavily hand-optimized, + * see especially strsep,strstr,str[c]spn. They should work, but are not + * very easy to understand. Everything is done entirely within the register + * set, making the functions fast and clean. String instructions have been + * used through-out, making for "slightly" unclear code :-) + * + * AK: On P4 and K7 using non string instruction implementations might be faster + * for large memory blocks. But most of them are unlikely to be used on large + * strings. 
+ */ + +#include <linux/string.h> +#include <linux/module.h> + +#ifdef __HAVE_ARCH_STRCPY +char *strcpy(char * dest,const char *src) +{ + int d0, d1, d2; + asm volatile( "1:\tlodsb\n\t" + "stosb\n\t" + "testb %%al,%%al\n\t" + "jne 1b" + : "=&S" (d0), "=&D" (d1), "=&a" (d2) + :"0" (src),"1" (dest) : "memory"); + return dest; +} +EXPORT_SYMBOL(strcpy); +#endif + +#ifdef __HAVE_ARCH_STRNCPY +char *strncpy(char * dest,const char *src,size_t count) +{ + int d0, d1, d2, d3; + asm volatile( "1:\tdecl %2\n\t" + "js 2f\n\t" + "lodsb\n\t" + "stosb\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n\t" + "rep\n\t" + "stosb\n" + "2:" + : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3) + :"0" (src),"1" (dest),"2" (count) : "memory"); + return dest; +} +EXPORT_SYMBOL(strncpy); +#endif + +#ifdef __HAVE_ARCH_STRCAT +char *strcat(char * dest,const char * src) +{ + int d0, d1, d2, d3; + asm volatile( "repne\n\t" + "scasb\n\t" + "decl %1\n" + "1:\tlodsb\n\t" + "stosb\n\t" + "testb %%al,%%al\n\t" + "jne 1b" + : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3) + : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory"); + return dest; +} +EXPORT_SYMBOL(strcat); +#endif + +#ifdef __HAVE_ARCH_STRNCAT +char *strncat(char * dest,const char * src,size_t count) +{ + int d0, d1, d2, d3; + asm volatile( "repne\n\t" + "scasb\n\t" + "decl %1\n\t" + "movl %8,%3\n" + "1:\tdecl %3\n\t" + "js 2f\n\t" + "lodsb\n\t" + "stosb\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n" + "2:\txorl %2,%2\n\t" + "stosb" + : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3) + : "0" (src),"1" (dest),"2" (0),"3" (0xffffffffu), "g" (count) + : "memory"); + return dest; +} +EXPORT_SYMBOL(strncat); +#endif + +#ifdef __HAVE_ARCH_STRCMP +int strcmp(const char * cs,const char * ct) +{ + int d0, d1; + int res; + asm volatile( "1:\tlodsb\n\t" + "scasb\n\t" + "jne 2f\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n\t" + "xorl %%eax,%%eax\n\t" + "jmp 3f\n" + "2:\tsbbl %%eax,%%eax\n\t" + "orb $1,%%al\n" + "3:" + :"=a" (res), "=&S" (d0), "=&D" (d1) + :"1" (cs),"2" (ct) + :"memory"); + return res; +} +EXPORT_SYMBOL(strcmp); +#endif + +#ifdef __HAVE_ARCH_STRNCMP +int strncmp(const char * cs,const char * ct,size_t count) +{ + int res; + int d0, d1, d2; + asm volatile( "1:\tdecl %3\n\t" + "js 2f\n\t" + "lodsb\n\t" + "scasb\n\t" + "jne 3f\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n" + "2:\txorl %%eax,%%eax\n\t" + "jmp 4f\n" + "3:\tsbbl %%eax,%%eax\n\t" + "orb $1,%%al\n" + "4:" + :"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2) + :"1" (cs),"2" (ct),"3" (count) + :"memory"); + return res; +} +EXPORT_SYMBOL(strncmp); +#endif + +#ifdef __HAVE_ARCH_STRCHR +char *strchr(const char * s, int c) +{ + int d0; + char * res; + asm volatile( "movb %%al,%%ah\n" + "1:\tlodsb\n\t" + "cmpb %%ah,%%al\n\t" + "je 2f\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n\t" + "movl $1,%1\n" + "2:\tmovl %1,%0\n\t" + "decl %0" + :"=a" (res), "=&S" (d0) + :"1" (s),"0" (c) + :"memory"); + return res; +} +EXPORT_SYMBOL(strchr); +#endif + +#ifdef __HAVE_ARCH_STRRCHR +char *strrchr(const char * s, int c) +{ + int d0, d1; + char * res; + asm volatile( "movb %%al,%%ah\n" + "1:\tlodsb\n\t" + "cmpb %%ah,%%al\n\t" + "jne 2f\n\t" + "leal -1(%%esi),%0\n" + "2:\ttestb %%al,%%al\n\t" + "jne 1b" + :"=g" (res), "=&S" (d0), "=&a" (d1) + :"0" (0),"1" (s),"2" (c) + :"memory"); + return res; +} +EXPORT_SYMBOL(strrchr); +#endif + +#ifdef __HAVE_ARCH_STRLEN +size_t strlen(const char * s) +{ + int d0; + int res; + asm volatile( "repne\n\t" + "scasb\n\t" + "notl %0\n\t" + "decl %0" + :"=c" (res), "=&D" (d0) + :"1" (s),"a" (0), "0" (0xffffffffu) + 
:"memory"); + return res; +} +EXPORT_SYMBOL(strlen); +#endif + +#ifdef __HAVE_ARCH_MEMCHR +void *memchr(const void *cs,int c,size_t count) +{ + int d0; + void *res; + if (!count) + return NULL; + asm volatile( "repne\n\t" + "scasb\n\t" + "je 1f\n\t" + "movl $1,%0\n" + "1:\tdecl %0" + :"=D" (res), "=&c" (d0) + :"a" (c),"0" (cs),"1" (count) + :"memory"); + return res; +} +EXPORT_SYMBOL(memchr); +#endif + +#ifdef __HAVE_ARCH_MEMSCAN +void *memscan(void * addr, int c, size_t size) +{ + if (!size) + return addr; + asm volatile("repnz; scasb\n\t" + "jnz 1f\n\t" + "dec %%edi\n" + "1:" + : "=D" (addr), "=c" (size) + : "0" (addr), "1" (size), "a" (c) + : "memory"); + return addr; +} +EXPORT_SYMBOL(memscan); +#endif + +#ifdef __HAVE_ARCH_STRNLEN +size_t strnlen(const char *s, size_t count) +{ + int d0; + int res; + asm volatile( "movl %2,%0\n\t" + "jmp 2f\n" + "1:\tcmpb $0,(%0)\n\t" + "je 3f\n\t" + "incl %0\n" + "2:\tdecl %1\n\t" + "cmpl $-1,%1\n\t" + "jne 1b\n" + "3:\tsubl %2,%0" + :"=a" (res), "=&d" (d0) + :"c" (s),"1" (count) + :"memory"); + return res; +} +EXPORT_SYMBOL(strnlen); +#endif diff --git a/arch/i386/mach-generic/es7000.c b/arch/i386/mach-generic/es7000.c index b47f951c0ec2..4742626f08c4 100644 --- a/arch/i386/mach-generic/es7000.c +++ b/arch/i386/mach-generic/es7000.c @@ -66,4 +66,4 @@ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) } #endif -struct genapic apic_es7000 = APIC_INIT("es7000", probe_es7000); +struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000); diff --git a/arch/i386/mach-voyager/voyager_thread.c b/arch/i386/mach-voyager/voyager_thread.c index b4b24e0e45e1..f9d595338159 100644 --- a/arch/i386/mach-voyager/voyager_thread.c +++ b/arch/i386/mach-voyager/voyager_thread.c @@ -52,7 +52,7 @@ execute(const char *string) NULL, }; - if ((ret = call_usermodehelper(argv[0], argv, envp, 1)) != 0) { + if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) { printk(KERN_ERR "Voyager failed to run \"%s\": %i\n", string, ret); } diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index 1ecb3e43b523..01ffdd4964f0 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -283,6 +283,8 @@ static inline int vmalloc_fault(unsigned long address) return 0; } +int show_unhandled_signals = 1; + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate @@ -303,6 +305,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, struct vm_area_struct * vma; unsigned long address; int write, si_code; + int fault; /* get the address */ address = read_cr2(); @@ -422,20 +425,18 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - switch (handle_mm_fault(mm, vma, address, write)) { - case VM_FAULT_MINOR: - tsk->min_flt++; - break; - case VM_FAULT_MAJOR: - tsk->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto do_sigbus; - case VM_FAULT_OOM: + fault = handle_mm_fault(mm, vma, address, write); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) goto out_of_memory; - default: - BUG(); + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; + BUG(); } + if (fault & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; /* * Did it hit the DOS screen memory VA from vm86 mode? 
@@ -470,6 +471,14 @@ bad_area_nosemaphore: if (is_prefetch(regs, address, error_code)) return; + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk("%s%s[%d]: segfault at %08lx eip %08lx " + "esp %08lx error %lx\n", + tsk->pid > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, tsk->pid, address, regs->eip, + regs->esp, error_code); + } tsk->thread.cr2 = address; /* Kernel addresses are always protection faults */ tsk->thread.error_code = error_code | (address >= TASK_SIZE); diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 7135946d3663..c3b9905af2d5 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -87,7 +87,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); - paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT); + paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); BUG_ON(page_table != pte_offset_kernel(pmd, 0)); } @@ -471,8 +471,13 @@ void zap_low_mappings (void) flush_tlb_all(); } +int nx_enabled = 0; + +#ifdef CONFIG_X86_PAE + static int disable_nx __initdata = 0; u64 __supported_pte_mask __read_mostly = ~_PAGE_NX; +EXPORT_SYMBOL_GPL(__supported_pte_mask); /* * noexec = on|off @@ -499,9 +504,6 @@ static int __init noexec_setup(char *str) } early_param("noexec", noexec_setup); -int nx_enabled = 0; -#ifdef CONFIG_X86_PAE - static void __init set_nx(void) { unsigned int v[4], l, h; @@ -751,8 +753,7 @@ void __init pgtable_cache_init(void) PTRS_PER_PMD*sizeof(pmd_t), PTRS_PER_PMD*sizeof(pmd_t), SLAB_PANIC, - pmd_ctor, - NULL); + pmd_ctor); if (!SHARED_KERNEL_PMD) { /* If we're in PAE mode and have a non-shared kernel pmd, then the pgd size must be a @@ -799,17 +800,9 @@ void mark_rodata_ro(void) unsigned long start = PFN_ALIGN(_text); unsigned long size = PFN_ALIGN(_etext) - start; -#ifndef CONFIG_KPROBES -#ifdef CONFIG_HOTPLUG_CPU - /* It must still be possible to apply SMP alternatives. */ - if (num_possible_cpus() <= 1) -#endif - { - change_page_attr(virt_to_page(start), - size >> PAGE_SHIFT, PAGE_KERNEL_RX); - printk("Write protecting the kernel text: %luk\n", size >> 10); - } -#endif + change_page_attr(virt_to_page(start), + size >> PAGE_SHIFT, PAGE_KERNEL_RX); + printk("Write protecting the kernel text: %luk\n", size >> 10); start += size; size = (unsigned long)__end_rodata - start; change_page_attr(virt_to_page(start), diff --git a/arch/i386/mm/ioremap.c b/arch/i386/mm/ioremap.c index fff08ae7b5ed..0b278315d737 100644 --- a/arch/i386/mm/ioremap.c +++ b/arch/i386/mm/ioremap.c @@ -196,7 +196,7 @@ void iounmap(volatile void __iomem *addr) /* Reset the direct mapping. Can block */ if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) { change_page_attr(virt_to_page(__va(p->phys_addr)), - p->size >> PAGE_SHIFT, + get_vm_area_size(p) >> PAGE_SHIFT, PAGE_KERNEL); global_flush_tlb(); } diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index 2eb14a73be9c..8927222b3ab2 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -60,7 +60,7 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); - paravirt_alloc_pt(page_to_pfn(base)); + paravirt_alloc_pt(&init_mm, page_to_pfn(base)); for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, addr == address ? 
prot : ref_prot)); @@ -82,7 +82,7 @@ static void flush_kernel_map(void *arg) struct page *p; /* High level code is not ready for clflush yet */ - if (0 && cpu_has_clflush) { + if (cpu_has_clflush) { list_for_each_entry (p, lh, lru) cache_flush_page(p); } else if (boot_cpu_data.x86_model >= 4) @@ -136,6 +136,12 @@ static inline void revert_page(struct page *kpte_page, unsigned long address) ref_prot)); } +static inline void save_page(struct page *kpte_page) +{ + if (!test_and_set_bit(PG_arch_1, &kpte_page->flags)) + list_add(&kpte_page->lru, &df_list); +} + static int __change_page_attr(struct page *page, pgprot_t prot) { @@ -150,6 +156,9 @@ __change_page_attr(struct page *page, pgprot_t prot) if (!kpte) return -EINVAL; kpte_page = virt_to_page(kpte); + BUG_ON(PageLRU(kpte_page)); + BUG_ON(PageCompound(kpte_page)); + if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { if (!pte_huge(*kpte)) { set_pte_atomic(kpte, mk_pte(page, prot)); @@ -179,11 +188,11 @@ __change_page_attr(struct page *page, pgprot_t prot) * time (not via split_large_page) and in turn we must not * replace it with a largepage. */ + + save_page(kpte_page); if (!PageReserved(kpte_page)) { if (cpu_has_pse && (page_private(kpte_page) == 0)) { - ClearPagePrivate(kpte_page); paravirt_release_pt(page_to_pfn(kpte_page)); - list_add(&kpte_page->lru, &df_list); revert_page(kpte_page, address); } } @@ -236,6 +245,11 @@ void global_flush_tlb(void) spin_unlock_irq(&cpa_lock); flush_map(&l); list_for_each_entry_safe(pg, next, &l, lru) { + list_del(&pg->lru); + clear_bit(PG_arch_1, &pg->flags); + if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0) + continue; + ClearPagePrivate(pg); __free_page(pg); } } diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 8d7c0864cc04..01437c46baae 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -235,7 +235,7 @@ static inline void pgd_list_del(pgd_t *pgd) #if (PTRS_PER_PMD == 1) /* Non-PAE pgd constructor */ -void pgd_ctor(void *pgd) +static void pgd_ctor(void *pgd) { unsigned long flags; @@ -257,7 +257,7 @@ void pgd_ctor(void *pgd) } #else /* PTRS_PER_PMD > 1 */ /* PAE pgd constructor */ -void pgd_ctor(void *pgd) +static void pgd_ctor(void *pgd) { /* PAE, kernel PMD may be shared */ @@ -276,7 +276,7 @@ void pgd_ctor(void *pgd) } #endif /* PTRS_PER_PMD */ -void pgd_dtor(void *pgd) +static void pgd_dtor(void *pgd) { unsigned long flags; /* can be called from interrupt context */ diff --git a/arch/i386/pci/acpi.c b/arch/i386/pci/acpi.c index b33aea845f58..bc8a44bddaa7 100644 --- a/arch/i386/pci/acpi.c +++ b/arch/i386/pci/acpi.c @@ -8,20 +8,42 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int domain, int busnum) { struct pci_bus *bus; + struct pci_sysdata *sd; + int pxm; + + /* Allocate per-root-bus (not per bus) arch-specific data. + * TODO: leak; this memory is never freed. + * It's arguable whether it's worth the trouble to care. 
+ */ + sd = kzalloc(sizeof(*sd), GFP_KERNEL); + if (!sd) { + printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum); + return NULL; + } if (domain != 0) { printk(KERN_WARNING "PCI: Multiple domains not supported\n"); + kfree(sd); return NULL; } - bus = pcibios_scan_root(busnum); + sd->node = -1; + + pxm = acpi_get_pxm(device->handle); +#ifdef CONFIG_ACPI_NUMA + if (pxm >= 0) + sd->node = pxm_to_node(pxm); +#endif + + bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); + if (!bus) + kfree(sd); + #ifdef CONFIG_ACPI_NUMA if (bus != NULL) { - int pxm = acpi_get_pxm(device->handle); if (pxm >= 0) { - bus->sysdata = (void *)(unsigned long)pxm_to_node(pxm); - printk("bus %d -> pxm %d -> node %ld\n", - busnum, pxm, (long)(bus->sysdata)); + printk("bus %d -> pxm %d -> node %d\n", + busnum, pxm, sd->node); } } #endif diff --git a/arch/i386/pci/common.c b/arch/i386/pci/common.c index 3f78d4d8ecf3..85503deeda46 100644 --- a/arch/i386/pci/common.c +++ b/arch/i386/pci/common.c @@ -293,6 +293,7 @@ static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = { struct pci_bus * __devinit pcibios_scan_root(int busnum) { struct pci_bus *bus = NULL; + struct pci_sysdata *sd; dmi_check_system(pciprobe_dmi_table); @@ -303,9 +304,19 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum) } } + /* Allocate per-root-bus (not per bus) arch-specific data. + * TODO: leak; this memory is never freed. + * It's arguable whether it's worth the trouble to care. + */ + sd = kzalloc(sizeof(*sd), GFP_KERNEL); + if (!sd) { + printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum); + return NULL; + } + printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum); - return pci_scan_bus_parented(NULL, busnum, &pci_root_ops, NULL); + return pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); } extern u8 pci_cache_line_size; diff --git a/arch/i386/pci/mmconfig-shared.c b/arch/i386/pci/mmconfig-shared.c index c7cabeed4d7b..4df637e34f81 100644 --- a/arch/i386/pci/mmconfig-shared.c +++ b/arch/i386/pci/mmconfig-shared.c @@ -24,6 +24,9 @@ DECLARE_BITMAP(pci_mmcfg_fallback_slots, 32*PCI_MMCFG_MAX_CHECK_BUS); +/* Indicate if the mmcfg resources have been placed into the resource table. */ +static int __initdata pci_mmcfg_resources_inserted; + /* K8 systems have some devices (typically in the builtin northbridge) that are only accessible using type1 Normally this can be expressed in the MCFG by not listing them @@ -170,7 +173,7 @@ static int __init pci_mmcfg_check_hostbridge(void) return name != NULL; } -static void __init pci_mmcfg_insert_resources(void) +static void __init pci_mmcfg_insert_resources(unsigned long resource_flags) { #define PCI_MMCFG_RESOURCE_NAME_LEN 19 int i; @@ -194,10 +197,13 @@ static void __init pci_mmcfg_insert_resources(void) cfg->pci_segment); res->start = cfg->address; res->end = res->start + (num_buses << 20) - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res->flags = IORESOURCE_MEM | resource_flags; insert_resource(&iomem_resource, res); names += PCI_MMCFG_RESOURCE_NAME_LEN; } + + /* Mark that the resources have been inserted. 
*/ + pci_mmcfg_resources_inserted = 1; } static void __init pci_mmcfg_reject_broken(int type) @@ -267,7 +273,43 @@ void __init pci_mmcfg_init(int type) if (type == 1) unreachable_devices(); if (known_bridge) - pci_mmcfg_insert_resources(); + pci_mmcfg_insert_resources(IORESOURCE_BUSY); pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; + } else { + /* + * Signal not to attempt to insert mmcfg resources because + * the architecture mmcfg setup could not initialize. + */ + pci_mmcfg_resources_inserted = 1; } } + +static int __init pci_mmcfg_late_insert_resources(void) +{ + /* + * If resources are already inserted or we are not using MMCONFIG, + * don't insert the resources. + */ + if ((pci_mmcfg_resources_inserted == 1) || + (pci_probe & PCI_PROBE_MMCONF) == 0 || + (pci_mmcfg_config_num == 0) || + (pci_mmcfg_config == NULL) || + (pci_mmcfg_config[0].address == 0)) + return 1; + + /* + * Attempt to insert the mmcfg resources but not with the busy flag + * marked so it won't cause request errors when __request_region is + * called. + */ + pci_mmcfg_insert_resources(0); + + return 0; +} + +/* + * Perform MMCONFIG resource insertion after PCI initialization to allow for + * misprogrammed MCFG tables that state larger sizes but actually conflict + * with other system resources. + */ +late_initcall(pci_mmcfg_late_insert_resources); diff --git a/arch/i386/video/Makefile b/arch/i386/video/Makefile new file mode 100644 index 000000000000..2c447c94adcc --- /dev/null +++ b/arch/i386/video/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_FB) += fbdev.o diff --git a/arch/i386/video/fbdev.c b/arch/i386/video/fbdev.c new file mode 100644 index 000000000000..48fb38d7d2c0 --- /dev/null +++ b/arch/i386/video/fbdev.c @@ -0,0 +1,32 @@ +/* + * arch/i386/video/fbdev.c - i386 Framebuffer + * + * Copyright (C) 2007 Antonino Daplas <adaplas@gmail.com> + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive + * for more details. + * + */ +#include <linux/fb.h> +#include <linux/pci.h> + +int fb_is_primary_device(struct fb_info *info) +{ + struct device *device = info->device; + struct pci_dev *pci_dev = NULL; + struct resource *res = NULL; + int retval = 0; + + if (device) + pci_dev = to_pci_dev(device); + + if (pci_dev) + res = &pci_dev->resource[PCI_ROM_RESOURCE]; + + if (res && res->flags & IORESOURCE_ROM_SHADOW) + retval = 1; + + return retval; +} +EXPORT_SYMBOL(fb_is_primary_device); diff --git a/arch/i386/xen/Kconfig b/arch/i386/xen/Kconfig new file mode 100644 index 000000000000..9df99e1885a4 --- /dev/null +++ b/arch/i386/xen/Kconfig @@ -0,0 +1,11 @@ +# +# This Kconfig describes xen options +# + +config XEN + bool "Enable support for Xen hypervisor" + depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES + help + This is the Linux Xen port. Enabling this will allow the + kernel to boot in a paravirtualized environment under the + Xen hypervisor. diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile new file mode 100644 index 000000000000..343df246bd3e --- /dev/null +++ b/arch/i386/xen/Makefile @@ -0,0 +1,4 @@ +obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \ + events.o time.o manage.o xen-asm.o + +obj-$(CONFIG_SMP) += smp.o diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c new file mode 100644 index 000000000000..9a8c1181c001 --- /dev/null +++ b/arch/i386/xen/enlighten.c @@ -0,0 +1,1144 @@ +/* + * Core of Xen paravirt_ops implementation. 
+ * + * This file contains the xen_paravirt_ops structure itself, and the + * implementations for: + * - privileged instructions + * - interrupt flags + * - segment operations + * - booting and setup + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/preempt.h> +#include <linux/hardirq.h> +#include <linux/percpu.h> +#include <linux/delay.h> +#include <linux/start_kernel.h> +#include <linux/sched.h> +#include <linux/bootmem.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/page-flags.h> +#include <linux/highmem.h> +#include <linux/smp.h> + +#include <xen/interface/xen.h> +#include <xen/interface/physdev.h> +#include <xen/interface/vcpu.h> +#include <xen/interface/sched.h> +#include <xen/features.h> +#include <xen/page.h> + +#include <asm/paravirt.h> +#include <asm/page.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> +#include <asm/fixmap.h> +#include <asm/processor.h> +#include <asm/setup.h> +#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/reboot.h> + +#include "xen-ops.h" +#include "mmu.h" +#include "multicalls.h" + +EXPORT_SYMBOL_GPL(hypercall_page); + +DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); + +DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); +DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); +DEFINE_PER_CPU(unsigned long, xen_cr3); + +struct start_info *xen_start_info; +EXPORT_SYMBOL_GPL(xen_start_info); + +static /* __initdata */ struct shared_info dummy_shared_info; + +/* + * Point at some empty memory to start with. We map the real shared_info + * page as soon as fixmap is up and running. + */ +struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info; + +/* + * Flag to determine whether vcpu info placement is available on all + * VCPUs. We assume it is to start with, and then set it to zero on + * the first failure. This is because it can succeed on some VCPUs + * and not others, since it can involve hypervisor memory allocation, + * or because the guest failed to guarantee all the appropriate + * constraints on all VCPUs (ie buffer can't cross a page boundary). + * + * Note that any particular CPU may be using a placed vcpu structure, + * but we can only optimise if the all are. + * + * 0: not available, 1: available + */ +static int have_vcpu_info_placement = 1; + +static void __init xen_vcpu_setup(int cpu) +{ + struct vcpu_register_vcpu_info info; + int err; + struct vcpu_info *vcpup; + + per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + + if (!have_vcpu_info_placement) + return; /* already tested, not available */ + + vcpup = &per_cpu(xen_vcpu_info, cpu); + + info.mfn = virt_to_mfn(vcpup); + info.offset = offset_in_page(vcpup); + + printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n", + cpu, vcpup, info.mfn, info.offset); + + /* Check to see if the hypervisor will put the vcpu_info + structure where we want it, which allows direct access via + a percpu-variable. */ + err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info); + + if (err) { + printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); + have_vcpu_info_placement = 0; + } else { + /* This cpu is using the registered vcpu info, even if + later ones fail to. 
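+		   (With placement, the *_direct variants installed later by
+		   xen_setup_vcpu_info_placement() can touch the upcall flags
+		   with a single percpu access, roughly the equivalent of
+			__get_cpu_var(xen_vcpu_info).evtchn_upcall_mask = 1;
+		   instead of first loading the xen_vcpu pointer and then
+		   dereferencing it. Illustrative description only; the real
+		   fast paths live in the xen-asm stubs.)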
*/ + per_cpu(xen_vcpu, cpu) = vcpup; + + printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n", + cpu, vcpup); + } +} + +static void __init xen_banner(void) +{ + printk(KERN_INFO "Booting paravirtualized kernel on %s\n", + paravirt_ops.name); + printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); +} + +static void xen_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + unsigned maskedx = ~0; + + /* + * Mask out inconvenient features, to try and disable as many + * unsupported kernel subsystems as possible. + */ + if (*eax == 1) + maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ + (1 << X86_FEATURE_ACPI) | /* disable ACPI */ + (1 << X86_FEATURE_ACC)); /* thermal monitoring */ + + asm(XEN_EMULATE_PREFIX "cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); + *edx &= maskedx; +} + +static void xen_set_debugreg(int reg, unsigned long val) +{ + HYPERVISOR_set_debugreg(reg, val); +} + +static unsigned long xen_get_debugreg(int reg) +{ + return HYPERVISOR_get_debugreg(reg); +} + +static unsigned long xen_save_fl(void) +{ + struct vcpu_info *vcpu; + unsigned long flags; + + vcpu = x86_read_percpu(xen_vcpu); + + /* flag has opposite sense of mask */ + flags = !vcpu->evtchn_upcall_mask; + + /* convert to IF type flag + -0 -> 0x00000000 + -1 -> 0xffffffff + */ + return (-flags) & X86_EFLAGS_IF; +} + +static void xen_restore_fl(unsigned long flags) +{ + struct vcpu_info *vcpu; + + /* convert from IF type flag */ + flags = !(flags & X86_EFLAGS_IF); + + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); + vcpu = x86_read_percpu(xen_vcpu); + vcpu->evtchn_upcall_mask = flags; + preempt_enable_no_resched(); + + /* Doesn't matter if we get preempted here, because any + pending event will get dealt with anyway. */ + + if (flags == 0) { + preempt_check_resched(); + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + force_evtchn_callback(); + } +} + +static void xen_irq_disable(void) +{ + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); + x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; + preempt_enable_no_resched(); +} + +static void xen_irq_enable(void) +{ + struct vcpu_info *vcpu; + + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); + vcpu = x86_read_percpu(xen_vcpu); + vcpu->evtchn_upcall_mask = 0; + preempt_enable_no_resched(); + + /* Doesn't matter if we get preempted here, because any + pending event will get dealt with anyway. */ + + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + force_evtchn_callback(); +} + +static void xen_safe_halt(void) +{ + /* Blocking includes an implicit local_irq_enable(). 
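+	   A rough usage sketch (illustrative, not taken from this patch):
+	   an idle path can do
+		local_irq_disable();
+		if (!need_resched())
+			safe_halt();	/* paravirt -> xen_safe_halt(),
+					   returns with IRQs enabled */
+		else
+			local_irq_enable();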
*/ + if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0) + BUG(); +} + +static void xen_halt(void) +{ + if (irqs_disabled()) + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); + else + xen_safe_halt(); +} + +static void xen_set_lazy_mode(enum paravirt_lazy_mode mode) +{ + BUG_ON(preemptible()); + + switch (mode) { + case PARAVIRT_LAZY_NONE: + BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE); + break; + + case PARAVIRT_LAZY_MMU: + case PARAVIRT_LAZY_CPU: + BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE); + break; + + case PARAVIRT_LAZY_FLUSH: + /* flush if necessary, but don't change state */ + if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE) + xen_mc_flush(); + return; + } + + xen_mc_flush(); + x86_write_percpu(xen_lazy_mode, mode); +} + +static unsigned long xen_store_tr(void) +{ + return 0; +} + +static void xen_set_ldt(const void *addr, unsigned entries) +{ + unsigned long linear_addr = (unsigned long)addr; + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_SET_LDT; + if (linear_addr) { + /* ldt my be vmalloced, use arbitrary_virt_to_machine */ + xmaddr_t maddr; + maddr = arbitrary_virt_to_machine((unsigned long)addr); + linear_addr = (unsigned long)maddr.maddr; + } + op->arg1.linear_addr = linear_addr; + op->arg2.nr_ents = entries; + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + +static void xen_load_gdt(const struct Xgt_desc_struct *dtr) +{ + unsigned long *frames; + unsigned long va = dtr->address; + unsigned int size = dtr->size + 1; + unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + int f; + struct multicall_space mcs; + + /* A GDT can be up to 64k in size, which corresponds to 8192 + 8-byte entries, or 16 4k pages.. */ + + BUG_ON(size > 65536); + BUG_ON(va & ~PAGE_MASK); + + mcs = xen_mc_entry(sizeof(*frames) * pages); + frames = mcs.args; + + for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { + frames[f] = virt_to_mfn(va); + make_lowmem_page_readonly((void *)va); + } + + MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); + + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + +static void load_TLS_descriptor(struct thread_struct *t, + unsigned int cpu, unsigned int i) +{ + struct desc_struct *gdt = get_cpu_gdt_table(cpu); + xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); + struct multicall_space mc = __xen_mc_entry(0); + + MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); +} + +static void xen_load_tls(struct thread_struct *t, unsigned int cpu) +{ + xen_mc_batch(); + + load_TLS_descriptor(t, cpu, 0); + load_TLS_descriptor(t, cpu, 1); + load_TLS_descriptor(t, cpu, 2); + + xen_mc_issue(PARAVIRT_LAZY_CPU); + + /* + * XXX sleazy hack: If we're being called in a lazy-cpu zone, + * it means we're in a context switch, and %gs has just been + * saved. This means we can zero it out to prevent faults on + * exit from the hypervisor if the next process has no %gs. + * Either way, it has been saved, and the new value will get + * loaded properly. This will go away as soon as Xen has been + * modified to not save/restore %gs for normal hypercalls. 
+ */ + if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU) + loadsegment(gs, 0); +} + +static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, + u32 low, u32 high) +{ + unsigned long lp = (unsigned long)&dt[entrynum]; + xmaddr_t mach_lp = virt_to_machine(lp); + u64 entry = (u64)high << 32 | low; + + preempt_disable(); + + xen_mc_flush(); + if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry)) + BUG(); + + preempt_enable(); +} + +static int cvt_gate_to_trap(int vector, u32 low, u32 high, + struct trap_info *info) +{ + u8 type, dpl; + + type = (high >> 8) & 0x1f; + dpl = (high >> 13) & 3; + + if (type != 0xf && type != 0xe) + return 0; + + info->vector = vector; + info->address = (high & 0xffff0000) | (low & 0x0000ffff); + info->cs = low >> 16; + info->flags = dpl; + /* interrupt gates clear IF */ + if (type == 0xe) + info->flags |= 4; + + return 1; +} + +/* Locations of each CPU's IDT */ +static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc); + +/* Set an IDT entry. If the entry is part of the current IDT, then + also update Xen. */ +static void xen_write_idt_entry(struct desc_struct *dt, int entrynum, + u32 low, u32 high) +{ + unsigned long p = (unsigned long)&dt[entrynum]; + unsigned long start, end; + + preempt_disable(); + + start = __get_cpu_var(idt_desc).address; + end = start + __get_cpu_var(idt_desc).size + 1; + + xen_mc_flush(); + + write_dt_entry(dt, entrynum, low, high); + + if (p >= start && (p + 8) <= end) { + struct trap_info info[2]; + + info[1].address = 0; + + if (cvt_gate_to_trap(entrynum, low, high, &info[0])) + if (HYPERVISOR_set_trap_table(info)) + BUG(); + } + + preempt_enable(); +} + +static void xen_convert_trap_info(const struct Xgt_desc_struct *desc, + struct trap_info *traps) +{ + unsigned in, out, count; + + count = (desc->size+1) / 8; + BUG_ON(count > 256); + + for (in = out = 0; in < count; in++) { + const u32 *entry = (u32 *)(desc->address + in * 8); + + if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) + out++; + } + traps[out].address = 0; +} + +void xen_copy_trap_info(struct trap_info *traps) +{ + const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc); + + xen_convert_trap_info(desc, traps); +} + +/* Load a new IDT into Xen. In principle this can be per-CPU, so we + hold a spinlock to protect the static traps[] array (static because + it avoids allocation, and saves stack space). */ +static void xen_load_idt(const struct Xgt_desc_struct *desc) +{ + static DEFINE_SPINLOCK(lock); + static struct trap_info traps[257]; + + spin_lock(&lock); + + __get_cpu_var(idt_desc) = *desc; + + xen_convert_trap_info(desc, traps); + + xen_mc_flush(); + if (HYPERVISOR_set_trap_table(traps)) + BUG(); + + spin_unlock(&lock); +} + +/* Write a GDT descriptor entry. Ignore LDT descriptors, since + they're handled differently. 
*/ +static void xen_write_gdt_entry(struct desc_struct *dt, int entry, + u32 low, u32 high) +{ + preempt_disable(); + + switch ((high >> 8) & 0xff) { + case DESCTYPE_LDT: + case DESCTYPE_TSS: + /* ignore */ + break; + + default: { + xmaddr_t maddr = virt_to_machine(&dt[entry]); + u64 desc = (u64)high << 32 | low; + + xen_mc_flush(); + if (HYPERVISOR_update_descriptor(maddr.maddr, desc)) + BUG(); + } + + } + + preempt_enable(); +} + +static void xen_load_esp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + struct multicall_space mcs = xen_mc_entry(0); + MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0); + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + +static void xen_set_iopl_mask(unsigned mask) +{ + struct physdev_set_iopl set_iopl; + + /* Force the change at ring 0. */ + set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; + HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); +} + +static void xen_io_delay(void) +{ +} + +#ifdef CONFIG_X86_LOCAL_APIC +static unsigned long xen_apic_read(unsigned long reg) +{ + return 0; +} + +static void xen_apic_write(unsigned long reg, unsigned long val) +{ + /* Warn to see if there's any stray references */ + WARN_ON(1); +} +#endif + +static void xen_flush_tlb(void) +{ + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_TLB_FLUSH_LOCAL; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} + +static void xen_flush_tlb_single(unsigned long addr) +{ + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_INVLPG_LOCAL; + op->arg1.linear_addr = addr & PAGE_MASK; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} + +static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm, + unsigned long va) +{ + struct { + struct mmuext_op op; + cpumask_t mask; + } *args; + cpumask_t cpumask = *cpus; + struct multicall_space mcs; + + /* + * A couple of (to be removed) sanity checks: + * + * - current CPU must not be in mask + * - mask must exist :) + */ + BUG_ON(cpus_empty(cpumask)); + BUG_ON(cpu_isset(smp_processor_id(), cpumask)); + BUG_ON(!mm); + + /* If a CPU which we ran on has gone down, OK. 
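+	   Note that the vcpu mask is copied into the multicall argument
+	   space (args->mask below) rather than pointed at on the stack,
+	   presumably so it stays valid until the batched MMUEXT hypercall
+	   is actually issued by the multicall machinery.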
*/ + cpus_and(cpumask, cpumask, cpu_online_map); + if (cpus_empty(cpumask)) + return; + + mcs = xen_mc_entry(sizeof(*args)); + args = mcs.args; + args->mask = cpumask; + args->op.arg2.vcpumask = &args->mask; + + if (va == TLB_FLUSH_ALL) { + args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; + } else { + args->op.cmd = MMUEXT_INVLPG_MULTI; + args->op.arg1.linear_addr = va; + } + + MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} + +static void xen_write_cr2(unsigned long cr2) +{ + x86_read_percpu(xen_vcpu)->arch.cr2 = cr2; +} + +static unsigned long xen_read_cr2(void) +{ + return x86_read_percpu(xen_vcpu)->arch.cr2; +} + +static unsigned long xen_read_cr2_direct(void) +{ + return x86_read_percpu(xen_vcpu_info.arch.cr2); +} + +static void xen_write_cr4(unsigned long cr4) +{ + /* never allow TSC to be disabled */ + native_write_cr4(cr4 & ~X86_CR4_TSD); +} + +static unsigned long xen_read_cr3(void) +{ + return x86_read_percpu(xen_cr3); +} + +static void xen_write_cr3(unsigned long cr3) +{ + BUG_ON(preemptible()); + + if (cr3 == x86_read_percpu(xen_cr3)) { + /* just a simple tlb flush */ + xen_flush_tlb(); + return; + } + + x86_write_percpu(xen_cr3, cr3); + + + { + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); + + op = mcs.args; + op->cmd = MMUEXT_NEW_BASEPTR; + op->arg1.mfn = mfn; + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_CPU); + } +} + +/* Early in boot, while setting up the initial pagetable, assume + everything is pinned. */ +static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) +{ + BUG_ON(mem_map); /* should only be used early */ + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); +} + +/* This needs to make sure the new pte page is pinned iff its being + attached to a pinned pagetable. */ +static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) +{ + struct page *page = pfn_to_page(pfn); + + if (PagePinned(virt_to_page(mm->pgd))) { + SetPagePinned(page); + + if (!PageHighMem(page)) + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); + else + /* make sure there are no stray mappings of + this page */ + kmap_flush_unused(); + } +} + +/* This should never happen until we're OK to use struct page */ +static void xen_release_pt(u32 pfn) +{ + struct page *page = pfn_to_page(pfn); + + if (PagePinned(page)) { + if (!PageHighMem(page)) + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + } +} + +#ifdef CONFIG_HIGHPTE +static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) +{ + pgprot_t prot = PAGE_KERNEL; + + if (PagePinned(page)) + prot = PAGE_KERNEL_RO; + + if (0 && PageHighMem(page)) + printk("mapping highpte %lx type %d prot %s\n", + page_to_pfn(page), type, + (unsigned long)pgprot_val(prot) & _PAGE_RW ? 
"WRITE" : "READ"); + + return kmap_atomic_prot(page, type, prot); +} +#endif + +static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) +{ + /* If there's an existing pte, then don't allow _PAGE_RW to be set */ + if (pte_val_ma(*ptep) & _PAGE_PRESENT) + pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & + pte_val_ma(pte)); + + return pte; +} + +/* Init-time set_pte while constructing initial pagetables, which + doesn't allow RO pagetable pages to be remapped RW */ +static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) +{ + pte = mask_rw_pte(ptep, pte); + + xen_set_pte(ptep, pte); +} + +static __init void xen_pagetable_setup_start(pgd_t *base) +{ + pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; + + /* special set_pte for pagetable initialization */ + paravirt_ops.set_pte = xen_set_pte_init; + + init_mm.pgd = base; + /* + * copy top-level of Xen-supplied pagetable into place. For + * !PAE we can use this as-is, but for PAE it is a stand-in + * while we copy the pmd pages. + */ + memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); + + if (PTRS_PER_PMD > 1) { + int i; + /* + * For PAE, need to allocate new pmds, rather than + * share Xen's, since Xen doesn't like pmd's being + * shared between address spaces. + */ + for (i = 0; i < PTRS_PER_PGD; i++) { + if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { + pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); + + memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), + PAGE_SIZE); + + make_lowmem_page_readonly(pmd); + + set_pgd(&base[i], __pgd(1 + __pa(pmd))); + } else + pgd_clear(&base[i]); + } + } + + /* make sure zero_page is mapped RO so we can use it in pagetables */ + make_lowmem_page_readonly(empty_zero_page); + make_lowmem_page_readonly(base); + /* + * Switch to new pagetable. This is done before + * pagetable_init has done anything so that the new pages + * added to the table can be prepared properly for Xen. + */ + xen_write_cr3(__pa(base)); +} + +static __init void xen_pagetable_setup_done(pgd_t *base) +{ + /* This will work as long as patching hasn't happened yet + (which it hasn't) */ + paravirt_ops.alloc_pt = xen_alloc_pt; + paravirt_ops.set_pte = xen_set_pte; + + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* + * Create a mapping for the shared info page. + * Should be set_fixmap(), but shared_info is a machine + * address with no corresponding pseudo-phys address. + */ + set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP), + PFN_DOWN(xen_start_info->shared_info), + PAGE_KERNEL); + + HYPERVISOR_shared_info = + (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); + + } else + HYPERVISOR_shared_info = + (struct shared_info *)__va(xen_start_info->shared_info); + + /* Actually pin the pagetable down, but we can't set PG_pinned + yet because the page structures don't exist yet. 
*/ + { + struct mmuext_op op; +#ifdef CONFIG_X86_PAE + op.cmd = MMUEXT_PIN_L3_TABLE; +#else + op.cmd = MMUEXT_PIN_L3_TABLE; +#endif + op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base))); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); + } +} + +/* This is called once we have the cpu_possible_map */ +void __init xen_setup_vcpu_info_placement(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + xen_vcpu_setup(cpu); + + /* xen_vcpu_setup managed to place the vcpu_info within the + percpu area for all cpus, so make use of it */ + if (have_vcpu_info_placement) { + printk(KERN_INFO "Xen: using vcpu_info placement\n"); + + paravirt_ops.save_fl = xen_save_fl_direct; + paravirt_ops.restore_fl = xen_restore_fl_direct; + paravirt_ops.irq_disable = xen_irq_disable_direct; + paravirt_ops.irq_enable = xen_irq_enable_direct; + paravirt_ops.read_cr2 = xen_read_cr2_direct; + paravirt_ops.iret = xen_iret_direct; + } +} + +static unsigned xen_patch(u8 type, u16 clobbers, void *insns, unsigned len) +{ + char *start, *end, *reloc; + unsigned ret; + + start = end = reloc = NULL; + +#define SITE(x) \ + case PARAVIRT_PATCH(x): \ + if (have_vcpu_info_placement) { \ + start = (char *)xen_##x##_direct; \ + end = xen_##x##_direct_end; \ + reloc = xen_##x##_direct_reloc; \ + } \ + goto patch_site + + switch (type) { + SITE(irq_enable); + SITE(irq_disable); + SITE(save_fl); + SITE(restore_fl); +#undef SITE + + patch_site: + if (start == NULL || (end-start) > len) + goto default_patch; + + ret = paravirt_patch_insns(insns, len, start, end); + + /* Note: because reloc is assigned from something that + appears to be an array, gcc assumes it's non-null, + but doesn't know its relationship with start and + end. */ + if (reloc > start && reloc < end) { + int reloc_off = reloc - start; + long *relocp = (long *)(insns + reloc_off); + long delta = start - (char *)insns; + + *relocp += delta; + } + break; + + default_patch: + default: + ret = paravirt_patch_default(type, clobbers, insns, len); + break; + } + + return ret; +} + +static const struct paravirt_ops xen_paravirt_ops __initdata = { + .paravirt_enabled = 1, + .shared_kernel_pmd = 0, + + .name = "Xen", + .banner = xen_banner, + + .patch = xen_patch, + + .memory_setup = xen_memory_setup, + .arch_setup = xen_arch_setup, + .init_IRQ = xen_init_IRQ, + .post_allocator_init = xen_mark_init_mm_pinned, + + .time_init = xen_time_init, + .set_wallclock = xen_set_wallclock, + .get_wallclock = xen_get_wallclock, + .get_cpu_khz = xen_cpu_khz, + .sched_clock = xen_sched_clock, + + .cpuid = xen_cpuid, + + .set_debugreg = xen_set_debugreg, + .get_debugreg = xen_get_debugreg, + + .clts = native_clts, + + .read_cr0 = native_read_cr0, + .write_cr0 = native_write_cr0, + + .read_cr2 = xen_read_cr2, + .write_cr2 = xen_write_cr2, + + .read_cr3 = xen_read_cr3, + .write_cr3 = xen_write_cr3, + + .read_cr4 = native_read_cr4, + .read_cr4_safe = native_read_cr4_safe, + .write_cr4 = xen_write_cr4, + + .save_fl = xen_save_fl, + .restore_fl = xen_restore_fl, + .irq_disable = xen_irq_disable, + .irq_enable = xen_irq_enable, + .safe_halt = xen_safe_halt, + .halt = xen_halt, + .wbinvd = native_wbinvd, + + .read_msr = native_read_msr_safe, + .write_msr = native_write_msr_safe, + .read_tsc = native_read_tsc, + .read_pmc = native_read_pmc, + + .iret = (void *)&hypercall_page[__HYPERVISOR_iret], + .irq_enable_sysexit = NULL, /* never called */ + + .load_tr_desc = paravirt_nop, + .set_ldt = xen_set_ldt, + .load_gdt = xen_load_gdt, + .load_idt = xen_load_idt, + .load_tls = xen_load_tls, + + .store_gdt 
= native_store_gdt, + .store_idt = native_store_idt, + .store_tr = xen_store_tr, + + .write_ldt_entry = xen_write_ldt_entry, + .write_gdt_entry = xen_write_gdt_entry, + .write_idt_entry = xen_write_idt_entry, + .load_esp0 = xen_load_esp0, + + .set_iopl_mask = xen_set_iopl_mask, + .io_delay = xen_io_delay, + +#ifdef CONFIG_X86_LOCAL_APIC + .apic_write = xen_apic_write, + .apic_write_atomic = xen_apic_write, + .apic_read = xen_apic_read, + .setup_boot_clock = paravirt_nop, + .setup_secondary_clock = paravirt_nop, + .startup_ipi_hook = paravirt_nop, +#endif + + .flush_tlb_user = xen_flush_tlb, + .flush_tlb_kernel = xen_flush_tlb, + .flush_tlb_single = xen_flush_tlb_single, + .flush_tlb_others = xen_flush_tlb_others, + + .pte_update = paravirt_nop, + .pte_update_defer = paravirt_nop, + + .pagetable_setup_start = xen_pagetable_setup_start, + .pagetable_setup_done = xen_pagetable_setup_done, + + .alloc_pt = xen_alloc_pt_init, + .release_pt = xen_release_pt, + .alloc_pd = paravirt_nop, + .alloc_pd_clone = paravirt_nop, + .release_pd = paravirt_nop, + +#ifdef CONFIG_HIGHPTE + .kmap_atomic_pte = xen_kmap_atomic_pte, +#endif + + .set_pte = NULL, /* see xen_pagetable_setup_* */ + .set_pte_at = xen_set_pte_at, + .set_pmd = xen_set_pmd, + + .pte_val = xen_pte_val, + .pgd_val = xen_pgd_val, + + .make_pte = xen_make_pte, + .make_pgd = xen_make_pgd, + +#ifdef CONFIG_X86_PAE + .set_pte_atomic = xen_set_pte_atomic, + .set_pte_present = xen_set_pte_at, + .set_pud = xen_set_pud, + .pte_clear = xen_pte_clear, + .pmd_clear = xen_pmd_clear, + + .make_pmd = xen_make_pmd, + .pmd_val = xen_pmd_val, +#endif /* PAE */ + + .activate_mm = xen_activate_mm, + .dup_mmap = xen_dup_mmap, + .exit_mmap = xen_exit_mmap, + + .set_lazy_mode = xen_set_lazy_mode, +}; + +#ifdef CONFIG_SMP +static const struct smp_ops xen_smp_ops __initdata = { + .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, + .smp_prepare_cpus = xen_smp_prepare_cpus, + .cpu_up = xen_cpu_up, + .smp_cpus_done = xen_smp_cpus_done, + + .smp_send_stop = xen_smp_send_stop, + .smp_send_reschedule = xen_smp_send_reschedule, + .smp_call_function_mask = xen_smp_call_function_mask, +}; +#endif /* CONFIG_SMP */ + +static void xen_reboot(int reason) +{ +#ifdef CONFIG_SMP + smp_send_stop(); +#endif + + if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason)) + BUG(); +} + +static void xen_restart(char *msg) +{ + xen_reboot(SHUTDOWN_reboot); +} + +static void xen_emergency_restart(void) +{ + xen_reboot(SHUTDOWN_reboot); +} + +static void xen_machine_halt(void) +{ + xen_reboot(SHUTDOWN_poweroff); +} + +static void xen_crash_shutdown(struct pt_regs *regs) +{ + xen_reboot(SHUTDOWN_crash); +} + +static const struct machine_ops __initdata xen_machine_ops = { + .restart = xen_restart, + .halt = xen_machine_halt, + .power_off = xen_machine_halt, + .shutdown = xen_machine_halt, + .crash_shutdown = xen_crash_shutdown, + .emergency_restart = xen_emergency_restart, +}; + + +/* First C function to be called on Xen boot */ +asmlinkage void __init xen_start_kernel(void) +{ + pgd_t *pgd; + + if (!xen_start_info) + return; + + BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0); + + /* Install Xen paravirt ops */ + paravirt_ops = xen_paravirt_ops; + machine_ops = xen_machine_ops; + +#ifdef CONFIG_SMP + smp_ops = xen_smp_ops; +#endif + + xen_setup_features(); + + /* Get mfn list */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) + phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list; + + pgd = (pgd_t *)xen_start_info->pt_base; + + init_pg_tables_end = __pa(pgd) + 
xen_start_info->nr_pt_frames*PAGE_SIZE; + + init_mm.pgd = pgd; /* use the Xen pagetables to start */ + + /* keep using Xen gdt for now; no urgent need to change it */ + + x86_write_percpu(xen_cr3, __pa(pgd)); + +#ifdef CONFIG_SMP + /* Don't do the full vcpu_info placement stuff until we have a + possible map. */ + per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; +#else + /* May as well do it now, since there's no good time to call + it later on UP. */ + xen_setup_vcpu_info_placement(); +#endif + + paravirt_ops.kernel_rpl = 1; + if (xen_feature(XENFEAT_supervisor_mode_kernel)) + paravirt_ops.kernel_rpl = 0; + + /* set the limit of our address space */ + reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE); + + /* set up basic CPUID stuff */ + cpu_detect(&new_cpu_data); + new_cpu_data.hard_math = 1; + new_cpu_data.x86_capability[0] = cpuid_edx(1); + + /* Poke various useful things into boot_params */ + LOADER_TYPE = (9 << 4) | 0; + INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0; + INITRD_SIZE = xen_start_info->mod_len; + + /* Start the world */ + start_kernel(); +} diff --git a/arch/i386/xen/events.c b/arch/i386/xen/events.c new file mode 100644 index 000000000000..da1b173547a1 --- /dev/null +++ b/arch/i386/xen/events.c @@ -0,0 +1,591 @@ +/* + * Xen event channels + * + * Xen models interrupts with abstract event channels. Because each + * domain gets 1024 event channels, but NR_IRQ is not that large, we + * must dynamically map irqs<->event channels. The event channels + * interface with the rest of the kernel by defining a xen interrupt + * chip. When an event is recieved, it is mapped to an irq and sent + * through the normal interrupt processing path. + * + * There are four kinds of events which can be mapped to an event + * channel: + * + * 1. Inter-domain notifications. This includes all the virtual + * device events, since they're driven by front-ends in another domain + * (typically dom0). + * 2. VIRQs, typically used for timers. These are per-cpu events. + * 3. IPIs. + * 4. Hardware interrupts. Not supported at present. + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/string.h> + +#include <asm/ptrace.h> +#include <asm/irq.h> +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> + +#include "xen-ops.h" + +/* + * This lock protects updates to the following mapping and reference-count + * arrays. The lock does not need to be acquired to read the mapping tables. + */ +static DEFINE_SPINLOCK(irq_mapping_update_lock); + +/* IRQ <-> VIRQ mapping. */ +static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; + +/* IRQ <-> IPI mapping */ +static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; + +/* Packed IRQ information: binding type, sub-type index, and event channel. */ +struct packed_irq +{ + unsigned short evtchn; + unsigned char index; + unsigned char type; +}; + +static struct packed_irq irq_info[NR_IRQS]; + +/* Binding types. */ +enum { + IRQT_UNBOUND, + IRQT_PIRQ, + IRQT_VIRQ, + IRQT_IPI, + IRQT_EVTCHN +}; + +/* Convenient shorthand for packed representation of an unbound IRQ. 
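+ * For example (values chosen purely for illustration), a timer VIRQ bound
+ * on this vcpu to event channel 3 ends up recorded as
+ *	irq_info[irq] = mk_irq_info(IRQT_VIRQ, VIRQ_TIMER, 3);
+ * while an unused slot keeps the IRQ_UNBOUND value defined below.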
*/ +#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) + +static int evtchn_to_irq[NR_EVENT_CHANNELS] = { + [0 ... NR_EVENT_CHANNELS-1] = -1 +}; +static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; +static u8 cpu_evtchn[NR_EVENT_CHANNELS]; + +/* Reference counts for bindings to IRQs. */ +static int irq_bindcount[NR_IRQS]; + +/* Xen will never allocate port zero for any purpose. */ +#define VALID_EVTCHN(chn) ((chn) != 0) + +/* + * Force a proper event-channel callback from Xen after clearing the + * callback mask. We do this in a very simple manner, by making a call + * down into Xen. The pending flag will be checked by Xen on return. + */ +void force_evtchn_callback(void) +{ + (void)HYPERVISOR_xen_version(0, NULL); +} +EXPORT_SYMBOL_GPL(force_evtchn_callback); + +static struct irq_chip xen_dynamic_chip; + +/* Constructor for packed IRQ information. */ +static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn) +{ + return (struct packed_irq) { evtchn, index, type }; +} + +/* + * Accessors for packed IRQ information. + */ +static inline unsigned int evtchn_from_irq(int irq) +{ + return irq_info[irq].evtchn; +} + +static inline unsigned int index_from_irq(int irq) +{ + return irq_info[irq].index; +} + +static inline unsigned int type_from_irq(int irq) +{ + return irq_info[irq].type; +} + +static inline unsigned long active_evtchns(unsigned int cpu, + struct shared_info *sh, + unsigned int idx) +{ + return (sh->evtchn_pending[idx] & + cpu_evtchn_mask[cpu][idx] & + ~sh->evtchn_mask[idx]); +} + +static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) +{ + int irq = evtchn_to_irq[chn]; + + BUG_ON(irq == -1); +#ifdef CONFIG_SMP + irq_desc[irq].affinity = cpumask_of_cpu(cpu); +#endif + + __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]); + __set_bit(chn, cpu_evtchn_mask[cpu]); + + cpu_evtchn[chn] = cpu; +} + +static void init_evtchn_cpu_bindings(void) +{ +#ifdef CONFIG_SMP + int i; + /* By default all event channels notify CPU#0. */ + for (i = 0; i < NR_IRQS; i++) + irq_desc[i].affinity = cpumask_of_cpu(0); +#endif + + memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); + memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); +} + +static inline unsigned int cpu_from_evtchn(unsigned int evtchn) +{ + return cpu_evtchn[evtchn]; +} + +static inline void clear_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_clear_bit(port, &s->evtchn_pending[0]); +} + +static inline void set_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, &s->evtchn_pending[0]); +} + + +/** + * notify_remote_via_irq - send event to remote end of event channel via irq + * @irq: irq of event channel to send event to + * + * Unlike notify_remote_via_evtchn(), this is safe to use across + * save/restore. Notifications on a broken connection are silently + * dropped. + */ +void notify_remote_via_irq(int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + notify_remote_via_evtchn(evtchn); +} +EXPORT_SYMBOL_GPL(notify_remote_via_irq); + +static void mask_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, &s->evtchn_mask[0]); +} + +static void unmask_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + unsigned int cpu = get_cpu(); + + BUG_ON(!irqs_disabled()); + + /* Slow path (hypercall) if this is a non-local port. 
*/ + if (unlikely(cpu != cpu_from_evtchn(port))) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } else { + struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); + + sync_clear_bit(port, &s->evtchn_mask[0]); + + /* + * The following is basically the equivalent of + * 'hw_resend_irq'. Just like a real IO-APIC we 'lose + * the interrupt edge' if the channel is masked. + */ + if (sync_test_bit(port, &s->evtchn_pending[0]) && + !sync_test_and_set_bit(port / BITS_PER_LONG, + &vcpu_info->evtchn_pending_sel)) + vcpu_info->evtchn_upcall_pending = 1; + } + + put_cpu(); +} + +static int find_unbound_irq(void) +{ + int irq; + + /* Only allocate from dynirq range */ + for (irq = 0; irq < NR_IRQS; irq++) + if (irq_bindcount[irq] == 0) + break; + + if (irq == NR_IRQS) + panic("No available IRQ to bind to: increase NR_IRQS!\n"); + + return irq; +} + +int bind_evtchn_to_irq(unsigned int evtchn) +{ + int irq; + + spin_lock(&irq_mapping_update_lock); + + irq = evtchn_to_irq[evtchn]; + + if (irq == -1) { + irq = find_unbound_irq(); + + dynamic_irq_init(irq); + set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, + handle_level_irq, "event"); + + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn); + } + + irq_bindcount[irq]++; + + spin_unlock(&irq_mapping_update_lock); + + return irq; +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); + +static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) +{ + struct evtchn_bind_ipi bind_ipi; + int evtchn, irq; + + spin_lock(&irq_mapping_update_lock); + + irq = per_cpu(ipi_to_irq, cpu)[ipi]; + if (irq == -1) { + irq = find_unbound_irq(); + if (irq < 0) + goto out; + + dynamic_irq_init(irq); + set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, + handle_level_irq, "ipi"); + + bind_ipi.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, + &bind_ipi) != 0) + BUG(); + evtchn = bind_ipi.port; + + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); + + per_cpu(ipi_to_irq, cpu)[ipi] = irq; + + bind_evtchn_to_cpu(evtchn, cpu); + } + + irq_bindcount[irq]++; + + out: + spin_unlock(&irq_mapping_update_lock); + return irq; +} + + +static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) +{ + struct evtchn_bind_virq bind_virq; + int evtchn, irq; + + spin_lock(&irq_mapping_update_lock); + + irq = per_cpu(virq_to_irq, cpu)[virq]; + + if (irq == -1) { + bind_virq.virq = virq; + bind_virq.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq) != 0) + BUG(); + evtchn = bind_virq.port; + + irq = find_unbound_irq(); + + dynamic_irq_init(irq); + set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, + handle_level_irq, "virq"); + + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); + + per_cpu(virq_to_irq, cpu)[virq] = irq; + + bind_evtchn_to_cpu(evtchn, cpu); + } + + irq_bindcount[irq]++; + + spin_unlock(&irq_mapping_update_lock); + + return irq; +} + +static void unbind_from_irq(unsigned int irq) +{ + struct evtchn_close close; + int evtchn = evtchn_from_irq(irq); + + spin_lock(&irq_mapping_update_lock); + + if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) { + close.port = evtchn; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); + + switch (type_from_irq(irq)) { + case IRQT_VIRQ: + per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) + [index_from_irq(irq)] = -1; + break; + default: + break; + } + + /* Closed ports are implicitly re-bound to VCPU0. 
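+	   (Typical usage of the public helpers defined below; the handler
+	   name and cookie are placeholders for illustration:
+		irq = bind_evtchn_to_irqhandler(evtchn, my_handler, 0,
+						"mydev", dev);
+		...
+		unbind_from_irqhandler(irq, dev);
+	   which pairs request_irq()/free_irq() with this bind/unbind path.)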
*/ + bind_evtchn_to_cpu(evtchn, 0); + + evtchn_to_irq[evtchn] = -1; + irq_info[irq] = IRQ_UNBOUND; + + dynamic_irq_init(irq); + } + + spin_unlock(&irq_mapping_update_lock); +} + +int bind_evtchn_to_irqhandler(unsigned int evtchn, + irqreturn_t (*handler)(int, void *), + unsigned long irqflags, + const char *devname, void *dev_id) +{ + unsigned int irq; + int retval; + + irq = bind_evtchn_to_irq(evtchn); + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); + +int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, + irqreturn_t (*handler)(int, void *), + unsigned long irqflags, const char *devname, void *dev_id) +{ + unsigned int irq; + int retval; + + irq = bind_virq_to_irq(virq, cpu); + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); + +int bind_ipi_to_irqhandler(enum ipi_vector ipi, + unsigned int cpu, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + int irq, retval; + + irq = bind_ipi_to_irq(ipi, cpu); + if (irq < 0) + return irq; + + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} + +void unbind_from_irqhandler(unsigned int irq, void *dev_id) +{ + free_irq(irq, dev_id); + unbind_from_irq(irq); +} +EXPORT_SYMBOL_GPL(unbind_from_irqhandler); + +void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) +{ + int irq = per_cpu(ipi_to_irq, cpu)[vector]; + BUG_ON(irq < 0); + notify_remote_via_irq(irq); +} + + +/* + * Search the CPUs pending events bitmasks. For each one found, map + * the event number to an irq, and feed it into do_IRQ() for + * handling. + * + * Xen uses a two-level bitmap to speed searching. The first level is + * a bitset of words which contain pending event bits. The second + * level is a bitset of pending events themselves. + */ +fastcall void xen_evtchn_do_upcall(struct pt_regs *regs) +{ + int cpu = get_cpu(); + struct shared_info *s = HYPERVISOR_shared_info; + struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); + unsigned long pending_words; + + vcpu_info->evtchn_upcall_pending = 0; + + /* NB. No need for a barrier here -- XCHG is a barrier on x86. */ + pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); + while (pending_words != 0) { + unsigned long pending_bits; + int word_idx = __ffs(pending_words); + pending_words &= ~(1UL << word_idx); + + while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) { + int bit_idx = __ffs(pending_bits); + int port = (word_idx * BITS_PER_LONG) + bit_idx; + int irq = evtchn_to_irq[port]; + + if (irq != -1) { + regs->orig_eax = ~irq; + do_IRQ(regs); + } + } + } + + put_cpu(); +} + +/* Rebind an evtchn so that it gets delivered to a specific cpu */ +static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) +{ + struct evtchn_bind_vcpu bind_vcpu; + int evtchn = evtchn_from_irq(irq); + + if (!VALID_EVTCHN(evtchn)) + return; + + /* Send future instances of this interrupt to other vcpu. */ + bind_vcpu.port = evtchn; + bind_vcpu.vcpu = tcpu; + + /* + * If this fails, it usually just indicates that we're dealing with a + * virq or IPI channel, which don't actually need to be rebound. Ignore + * it, but don't do the xenlinux-level rebind in that case. 
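+ *
+ * (Worked example for the two-level pending-event scan in
+ * xen_evtchn_do_upcall() above, with made-up numbers: if BITS_PER_LONG
+ * is 32, evtchn_pending_sel has bit 2 set and evtchn_pending[2] == 0x9,
+ * then ports 2*32+0 = 64 and 2*32+3 = 67 are handled, in that order,
+ * provided they are unmasked and routed to this cpu.)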
+ */ + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) + bind_evtchn_to_cpu(evtchn, tcpu); +} + + +static void set_affinity_irq(unsigned irq, cpumask_t dest) +{ + unsigned tcpu = first_cpu(dest); + rebind_irq_to_cpu(irq, tcpu); +} + +static void enable_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + unmask_evtchn(evtchn); +} + +static void disable_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + mask_evtchn(evtchn); +} + +static void ack_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + move_native_irq(irq); + + if (VALID_EVTCHN(evtchn)) + clear_evtchn(evtchn); +} + +static int retrigger_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + int ret = 0; + + if (VALID_EVTCHN(evtchn)) { + set_evtchn(evtchn); + ret = 1; + } + + return ret; +} + +static struct irq_chip xen_dynamic_chip __read_mostly = { + .name = "xen-dyn", + .mask = disable_dynirq, + .unmask = enable_dynirq, + .ack = ack_dynirq, + .set_affinity = set_affinity_irq, + .retrigger = retrigger_dynirq, +}; + +void __init xen_init_IRQ(void) +{ + int i; + + init_evtchn_cpu_bindings(); + + /* No event channels are 'live' right now. */ + for (i = 0; i < NR_EVENT_CHANNELS; i++) + mask_evtchn(i); + + /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ + for (i = 0; i < NR_IRQS; i++) + irq_bindcount[i] = 0; + + irq_ctx_init(smp_processor_id()); +} diff --git a/arch/i386/xen/features.c b/arch/i386/xen/features.c new file mode 100644 index 000000000000..0707714e40d6 --- /dev/null +++ b/arch/i386/xen/features.c @@ -0,0 +1,29 @@ +/****************************************************************************** + * features.c + * + * Xen feature flags. + * + * Copyright (c) 2006, Ian Campbell, XenSource Inc. + */ +#include <linux/types.h> +#include <linux/cache.h> +#include <linux/module.h> +#include <asm/xen/hypervisor.h> +#include <xen/features.h> + +u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; +EXPORT_SYMBOL_GPL(xen_features); + +void xen_setup_features(void) +{ + struct xen_feature_info fi; + int i, j; + + for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) { + fi.submap_idx = i; + if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0) + break; + for (j = 0; j < 32; j++) + xen_features[i * 32 + j] = !!(fi.submap & 1<<j); + } +} diff --git a/arch/i386/xen/manage.c b/arch/i386/xen/manage.c new file mode 100644 index 000000000000..aa7af9e6abc0 --- /dev/null +++ b/arch/i386/xen/manage.c @@ -0,0 +1,143 @@ +/* + * Handle extern requests for shutdown, reboot and sysrq + */ +#include <linux/kernel.h> +#include <linux/err.h> +#include <linux/reboot.h> +#include <linux/sysrq.h> + +#include <xen/xenbus.h> + +#define SHUTDOWN_INVALID -1 +#define SHUTDOWN_POWEROFF 0 +#define SHUTDOWN_SUSPEND 2 +/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only + * report a crash, not be instructed to crash! + * HALT is the same as POWEROFF, as far as we're concerned. The tools use + * the distinction when we return the reason code to them. + */ +#define SHUTDOWN_HALT 4 + +/* Ignore multiple shutdown requests. 
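+ * The "control/shutdown" xenstore node watched below is written by the
+ * dom0 toolstack with values such as "poweroff", "halt" or "reboot";
+ * shutdown_handler() reads the value, clears the node and then acts on it.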
*/ +static int shutting_down = SHUTDOWN_INVALID; + +static void shutdown_handler(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + char *str; + struct xenbus_transaction xbt; + int err; + + if (shutting_down != SHUTDOWN_INVALID) + return; + + again: + err = xenbus_transaction_start(&xbt); + if (err) + return; + + str = (char *)xenbus_read(xbt, "control", "shutdown", NULL); + /* Ignore read errors and empty reads. */ + if (XENBUS_IS_ERR_READ(str)) { + xenbus_transaction_end(xbt, 1); + return; + } + + xenbus_write(xbt, "control", "shutdown", ""); + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) { + kfree(str); + goto again; + } + + if (strcmp(str, "poweroff") == 0 || + strcmp(str, "halt") == 0) + orderly_poweroff(false); + else if (strcmp(str, "reboot") == 0) + ctrl_alt_del(); + else { + printk(KERN_INFO "Ignoring shutdown request: %s\n", str); + shutting_down = SHUTDOWN_INVALID; + } + + kfree(str); +} + +static void sysrq_handler(struct xenbus_watch *watch, const char **vec, + unsigned int len) +{ + char sysrq_key = '\0'; + struct xenbus_transaction xbt; + int err; + + again: + err = xenbus_transaction_start(&xbt); + if (err) + return; + if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) { + printk(KERN_ERR "Unable to read sysrq code in " + "control/sysrq\n"); + xenbus_transaction_end(xbt, 1); + return; + } + + if (sysrq_key != '\0') + xenbus_printf(xbt, "control", "sysrq", "%c", '\0'); + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) + goto again; + + if (sysrq_key != '\0') + handle_sysrq(sysrq_key, NULL); +} + +static struct xenbus_watch shutdown_watch = { + .node = "control/shutdown", + .callback = shutdown_handler +}; + +static struct xenbus_watch sysrq_watch = { + .node = "control/sysrq", + .callback = sysrq_handler +}; + +static int setup_shutdown_watcher(void) +{ + int err; + + err = register_xenbus_watch(&shutdown_watch); + if (err) { + printk(KERN_ERR "Failed to set shutdown watcher\n"); + return err; + } + + err = register_xenbus_watch(&sysrq_watch); + if (err) { + printk(KERN_ERR "Failed to set sysrq watcher\n"); + return err; + } + + return 0; +} + +static int shutdown_event(struct notifier_block *notifier, + unsigned long event, + void *data) +{ + setup_shutdown_watcher(); + return NOTIFY_DONE; +} + +static int __init setup_shutdown_event(void) +{ + static struct notifier_block xenstore_notifier = { + .notifier_call = shutdown_event + }; + register_xenstore_notifier(&xenstore_notifier); + + return 0; +} + +subsys_initcall(setup_shutdown_event); diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c new file mode 100644 index 000000000000..4ae038aa6c24 --- /dev/null +++ b/arch/i386/xen/mmu.c @@ -0,0 +1,564 @@ +/* + * Xen mmu operations + * + * This file contains the various mmu fetch and update operations. + * The most important job they must perform is the mapping between the + * domain's pfn and the overall machine mfns. + * + * Xen allows guests to directly update the pagetable, in a controlled + * fashion. In other words, the guest modifies the same pagetable + * that the CPU actually uses, which eliminates the overhead of having + * a separate shadow pagetable. + * + * In order to allow this, it falls on the guest domain to map its + * notion of a "physical" pfn - which is just a domain-local linear + * address - into a real "machine address" which the CPU's MMU can + * use. + * + * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be + * inserted directly into the pagetable. 
When creating a new + * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely, + * when reading the content back with __(pgd|pmd|pte)_val, it converts + * the mfn back into a pfn. + * + * The other constraint is that all pages which make up a pagetable + * must be mapped read-only in the guest. This prevents uncontrolled + * guest updates to the pagetable. Xen strictly enforces this, and + * will disallow any pagetable update which will end up mapping a + * pagetable page RW, and will disallow using any writable page as a + * pagetable. + * + * Naively, when loading %cr3 with the base of a new pagetable, Xen + * would need to validate the whole pagetable before going on. + * Naturally, this is quite slow. The solution is to "pin" a + * pagetable, which enforces all the constraints on the pagetable even + * when it is not actively in use. This menas that Xen can be assured + * that it is still valid when you do load it into %cr3, and doesn't + * need to revalidate it. + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ +#include <linux/sched.h> +#include <linux/highmem.h> +#include <linux/bug.h> +#include <linux/sched.h> + +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> +#include <asm/paravirt.h> + +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/page.h> +#include <xen/interface/xen.h> + +#include "multicalls.h" +#include "mmu.h" + +xmaddr_t arbitrary_virt_to_machine(unsigned long address) +{ + pte_t *pte = lookup_address(address); + unsigned offset = address & PAGE_MASK; + + BUG_ON(pte == NULL); + + return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); +} + +void make_lowmem_page_readonly(void *vaddr) +{ + pte_t *pte, ptev; + unsigned long address = (unsigned long)vaddr; + + pte = lookup_address(address); + BUG_ON(pte == NULL); + + ptev = pte_wrprotect(*pte); + + if (HYPERVISOR_update_va_mapping(address, ptev, 0)) + BUG(); +} + +void make_lowmem_page_readwrite(void *vaddr) +{ + pte_t *pte, ptev; + unsigned long address = (unsigned long)vaddr; + + pte = lookup_address(address); + BUG_ON(pte == NULL); + + ptev = pte_mkwrite(*pte); + + if (HYPERVISOR_update_va_mapping(address, ptev, 0)) + BUG(); +} + + +void xen_set_pmd(pmd_t *ptr, pmd_t val) +{ + struct multicall_space mcs; + struct mmu_update *u; + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*u)); + u = mcs.args; + u->ptr = virt_to_machine(ptr).maddr; + u->val = pmd_val_ma(val); + MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +/* + * Associate a virtual page frame with a given physical page frame + * and protection flags for that frame. + */ +void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = swapper_pg_dir + pgd_index(vaddr); + if (pgd_none(*pgd)) { + BUG(); + return; + } + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + BUG(); + return; + } + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + BUG(); + return; + } + pte = pte_offset_kernel(pmd, vaddr); + /* <mfn,flags> stored as-is, to permit clearing entries */ + xen_set_pte(pte, mfn_pte(mfn, flags)); + + /* + * It's enough to flush this one mapping. 
+ * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval) +{ + if (mm == current->mm || mm == &init_mm) { + if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) { + struct multicall_space mcs; + mcs = xen_mc_entry(0); + + MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); + xen_mc_issue(PARAVIRT_LAZY_MMU); + return; + } else + if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0) + return; + } + xen_set_pte(ptep, pteval); +} + +#ifdef CONFIG_X86_PAE +void xen_set_pud(pud_t *ptr, pud_t val) +{ + struct multicall_space mcs; + struct mmu_update *u; + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*u)); + u = mcs.args; + u->ptr = virt_to_machine(ptr).maddr; + u->val = pud_val_ma(val); + MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +void xen_set_pte(pte_t *ptep, pte_t pte) +{ + ptep->pte_high = pte.pte_high; + smp_wmb(); + ptep->pte_low = pte.pte_low; +} + +void xen_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + set_64bit((u64 *)ptep, pte_val_ma(pte)); +} + +void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + ptep->pte_low = 0; + smp_wmb(); /* make sure low gets written first */ + ptep->pte_high = 0; +} + +void xen_pmd_clear(pmd_t *pmdp) +{ + xen_set_pmd(pmdp, __pmd(0)); +} + +unsigned long long xen_pte_val(pte_t pte) +{ + unsigned long long ret = 0; + + if (pte.pte_low) { + ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low; + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + } + + return ret; +} + +unsigned long long xen_pmd_val(pmd_t pmd) +{ + unsigned long long ret = pmd.pmd; + if (ret) + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + return ret; +} + +unsigned long long xen_pgd_val(pgd_t pgd) +{ + unsigned long long ret = pgd.pgd; + if (ret) + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + return ret; +} + +pte_t xen_make_pte(unsigned long long pte) +{ + if (pte & 1) + pte = phys_to_machine(XPADDR(pte)).maddr; + + return (pte_t){ pte, pte >> 32 }; +} + +pmd_t xen_make_pmd(unsigned long long pmd) +{ + if (pmd & 1) + pmd = phys_to_machine(XPADDR(pmd)).maddr; + + return (pmd_t){ pmd }; +} + +pgd_t xen_make_pgd(unsigned long long pgd) +{ + if (pgd & _PAGE_PRESENT) + pgd = phys_to_machine(XPADDR(pgd)).maddr; + + return (pgd_t){ pgd }; +} +#else /* !PAE */ +void xen_set_pte(pte_t *ptep, pte_t pte) +{ + *ptep = pte; +} + +unsigned long xen_pte_val(pte_t pte) +{ + unsigned long ret = pte.pte_low; + + if (ret & _PAGE_PRESENT) + ret = machine_to_phys(XMADDR(ret)).paddr; + + return ret; +} + +unsigned long xen_pgd_val(pgd_t pgd) +{ + unsigned long ret = pgd.pgd; + if (ret) + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + return ret; +} + +pte_t xen_make_pte(unsigned long pte) +{ + if (pte & _PAGE_PRESENT) + pte = phys_to_machine(XPADDR(pte)).maddr; + + return (pte_t){ pte }; +} + +pgd_t xen_make_pgd(unsigned long pgd) +{ + if (pgd & _PAGE_PRESENT) + pgd = phys_to_machine(XPADDR(pgd)).maddr; + + return (pgd_t){ pgd }; +} +#endif /* CONFIG_X86_PAE */ + + + +/* + (Yet another) pagetable walker. This one is intended for pinning a + pagetable. This means that it walks a pagetable and calls the + callback function on each page it finds making up the page table, + at every level. It walks the entire pagetable, but it only bothers + pinning pte pages which are below pte_limit. In the normal case + this will be TASK_SIZE, but at boot we need to pin up to + FIXADDR_TOP. 
But the important bit is that we don't pin beyond + there, because then we start getting into Xen's ptes. +*/ +static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), + unsigned long limit) +{ + pgd_t *pgd = pgd_base; + int flush = 0; + unsigned long addr = 0; + unsigned long pgd_next; + + BUG_ON(limit > FIXADDR_TOP); + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 0; + + for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { + pud_t *pud; + unsigned long pud_limit, pud_next; + + pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); + + if (!pgd_val(*pgd)) + continue; + + pud = pud_offset(pgd, 0); + + if (PTRS_PER_PUD > 1) /* not folded */ + flush |= (*func)(virt_to_page(pud), 0); + + for (; addr != pud_limit; pud++, addr = pud_next) { + pmd_t *pmd; + unsigned long pmd_limit; + + pud_next = pud_addr_end(addr, pud_limit); + + if (pud_next < limit) + pmd_limit = pud_next; + else + pmd_limit = limit; + + if (pud_none(*pud)) + continue; + + pmd = pmd_offset(pud, 0); + + if (PTRS_PER_PMD > 1) /* not folded */ + flush |= (*func)(virt_to_page(pmd), 0); + + for (; addr != pmd_limit; pmd++) { + addr += (PAGE_SIZE * PTRS_PER_PTE); + if ((pmd_limit-1) < (addr-1)) { + addr = pmd_limit; + break; + } + + if (pmd_none(*pmd)) + continue; + + flush |= (*func)(pmd_page(*pmd), 0); + } + } + } + + flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH); + + return flush; +} + +static int pin_page(struct page *page, unsigned flags) +{ + unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags); + int flush; + + if (pgfl) + flush = 0; /* already pinned */ + else if (PageHighMem(page)) + /* kmaps need flushing if we found an unpinned + highpage */ + flush = 1; + else { + void *pt = lowmem_page_address(page); + unsigned long pfn = page_to_pfn(page); + struct multicall_space mcs = __xen_mc_entry(0); + + flush = 0; + + MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, + pfn_pte(pfn, PAGE_KERNEL_RO), + flags); + } + + return flush; +} + +/* This is called just after a mm has been created, but it has not + been used yet. We need to make sure that its pagetable is all + read-only, and can be pinned. */ +void xen_pgd_pin(pgd_t *pgd) +{ + struct multicall_space mcs; + struct mmuext_op *op; + + xen_mc_batch(); + + if (pgd_walk(pgd, pin_page, TASK_SIZE)) { + /* re-enable interrupts for kmap_flush_unused */ + xen_mc_issue(0); + kmap_flush_unused(); + xen_mc_batch(); + } + + mcs = __xen_mc_entry(sizeof(*op)); + op = mcs.args; + +#ifdef CONFIG_X86_PAE + op->cmd = MMUEXT_PIN_L3_TABLE; +#else + op->cmd = MMUEXT_PIN_L2_TABLE; +#endif + op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(0); +} + +/* The init_mm pagetable is really pinned as soon as its created, but + that's before we have page structures to store the bits. So do all + the book-keeping now. 
*/ +static __init int mark_pinned(struct page *page, unsigned flags) +{ + SetPagePinned(page); + return 0; +} + +void __init xen_mark_init_mm_pinned(void) +{ + pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); +} + +static int unpin_page(struct page *page, unsigned flags) +{ + unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags); + + if (pgfl && !PageHighMem(page)) { + void *pt = lowmem_page_address(page); + unsigned long pfn = page_to_pfn(page); + struct multicall_space mcs = __xen_mc_entry(0); + + MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, + pfn_pte(pfn, PAGE_KERNEL), + flags); + } + + return 0; /* never need to flush on unpin */ +} + +/* Release a pagetables pages back as normal RW */ +static void xen_pgd_unpin(pgd_t *pgd) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + xen_mc_batch(); + + mcs = __xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_UNPIN_TABLE; + op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + pgd_walk(pgd, unpin_page, TASK_SIZE); + + xen_mc_issue(0); +} + +void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) +{ + spin_lock(&next->page_table_lock); + xen_pgd_pin(next->pgd); + spin_unlock(&next->page_table_lock); +} + +void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +{ + spin_lock(&mm->page_table_lock); + xen_pgd_pin(mm->pgd); + spin_unlock(&mm->page_table_lock); +} + + +#ifdef CONFIG_SMP +/* Another cpu may still have their %cr3 pointing at the pagetable, so + we need to repoint it somewhere else before we can unpin it. */ +static void drop_other_mm_ref(void *info) +{ + struct mm_struct *mm = info; + + if (__get_cpu_var(cpu_tlbstate).active_mm == mm) + leave_mm(smp_processor_id()); +} + +static void drop_mm_ref(struct mm_struct *mm) +{ + if (current->active_mm == mm) { + if (current->mm == mm) + load_cr3(swapper_pg_dir); + else + leave_mm(smp_processor_id()); + } + + if (!cpus_empty(mm->cpu_vm_mask)) + xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref, + mm, 1); +} +#else +static void drop_mm_ref(struct mm_struct *mm) +{ + if (current->active_mm == mm) + load_cr3(swapper_pg_dir); +} +#endif + +/* + * While a process runs, Xen pins its pagetables, which means that the + * hypervisor forces it to be read-only, and it controls all updates + * to it. This means that all pagetable updates have to go via the + * hypervisor, which is moderately expensive. + * + * Since we're pulling the pagetable down, we switch to use init_mm, + * unpin old process pagetable and mark it all read-write, which + * allows further operations on it to be simple memory accesses. + * + * The only subtle point is that another CPU may be still using the + * pagetable because of lazy tlb flushing. This means we need need to + * switch all CPUs off this pagetable before we can unpin it. + */ +void xen_exit_mmap(struct mm_struct *mm) +{ + get_cpu(); /* make sure we don't move around */ + drop_mm_ref(mm); + put_cpu(); + + spin_lock(&mm->page_table_lock); + xen_pgd_unpin(mm->pgd); + spin_unlock(&mm->page_table_lock); +} diff --git a/arch/i386/xen/mmu.h b/arch/i386/xen/mmu.h new file mode 100644 index 000000000000..c9ff27f3ac3a --- /dev/null +++ b/arch/i386/xen/mmu.h @@ -0,0 +1,60 @@ +#ifndef _XEN_MMU_H + +#include <linux/linkage.h> +#include <asm/page.h> + +/* + * Page-directory addresses above 4GB do not fit into architectural %cr3. 
+ * When accessing %cr3, or equivalent field in vcpu_guest_context, guests + * must use the following accessor macros to pack/unpack valid MFNs. + * + * Note that Xen is using the fact that the pagetable base is always + * page-aligned, and putting the 12 MSB of the address into the 12 LSB + * of cr3. + */ +#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20)) +#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20)) + + +void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); + +void xen_set_pte(pte_t *ptep, pte_t pteval); +void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval); +void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); + +void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); +void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); +void xen_exit_mmap(struct mm_struct *mm); + +void xen_pgd_pin(pgd_t *pgd); +//void xen_pgd_unpin(pgd_t *pgd); + +#ifdef CONFIG_X86_PAE +unsigned long long xen_pte_val(pte_t); +unsigned long long xen_pmd_val(pmd_t); +unsigned long long xen_pgd_val(pgd_t); + +pte_t xen_make_pte(unsigned long long); +pmd_t xen_make_pmd(unsigned long long); +pgd_t xen_make_pgd(unsigned long long); + +void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval); +void xen_set_pte_atomic(pte_t *ptep, pte_t pte); +void xen_set_pud(pud_t *ptr, pud_t val); +void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +void xen_pmd_clear(pmd_t *pmdp); + + +#else +unsigned long xen_pte_val(pte_t); +unsigned long xen_pmd_val(pmd_t); +unsigned long xen_pgd_val(pgd_t); + +pte_t xen_make_pte(unsigned long); +pmd_t xen_make_pmd(unsigned long); +pgd_t xen_make_pgd(unsigned long); +#endif + +#endif /* _XEN_MMU_H */ diff --git a/arch/i386/xen/multicalls.c b/arch/i386/xen/multicalls.c new file mode 100644 index 000000000000..c837e8e463db --- /dev/null +++ b/arch/i386/xen/multicalls.c @@ -0,0 +1,90 @@ +/* + * Xen hypercall batching. + * + * Xen allows multiple hypercalls to be issued at once, using the + * multicall interface. This allows the cost of trapping into the + * hypervisor to be amortized over several calls. + * + * This file implements a simple interface for multicalls. There's a + * per-cpu buffer of outstanding multicalls. When you want to queue a + * multicall for issuing, you can allocate a multicall slot for the + * call and its arguments, along with storage for space which is + * pointed to by the arguments (for passing pointers to structures, + * etc). When the multicall is actually issued, all the space for the + * commands and allocated memory is freed for reuse. + * + * Multicalls are flushed whenever any of the buffers get full, or + * when explicitly requested. There's no way to get per-multicall + * return results back. It will BUG if any of the multicalls fail. 
+ * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ +#include <linux/percpu.h> +#include <linux/hardirq.h> + +#include <asm/xen/hypercall.h> + +#include "multicalls.h" + +#define MC_BATCH 32 +#define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) + +struct mc_buffer { + struct multicall_entry entries[MC_BATCH]; + u64 args[MC_ARGS]; + unsigned mcidx, argidx; +}; + +static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); +DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); + +void xen_mc_flush(void) +{ + struct mc_buffer *b = &__get_cpu_var(mc_buffer); + int ret = 0; + unsigned long flags; + + BUG_ON(preemptible()); + + /* Disable interrupts in case someone comes in and queues + something in the middle */ + local_irq_save(flags); + + if (b->mcidx) { + int i; + + if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0) + BUG(); + for (i = 0; i < b->mcidx; i++) + if (b->entries[i].result < 0) + ret++; + b->mcidx = 0; + b->argidx = 0; + } else + BUG_ON(b->argidx != 0); + + local_irq_restore(flags); + + BUG_ON(ret); +} + +struct multicall_space __xen_mc_entry(size_t args) +{ + struct mc_buffer *b = &__get_cpu_var(mc_buffer); + struct multicall_space ret; + unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64); + + BUG_ON(preemptible()); + BUG_ON(argspace > MC_ARGS); + + if (b->mcidx == MC_BATCH || + (b->argidx + argspace) > MC_ARGS) + xen_mc_flush(); + + ret.mc = &b->entries[b->mcidx]; + b->mcidx++; + ret.args = &b->args[b->argidx]; + b->argidx += argspace; + + return ret; +} diff --git a/arch/i386/xen/multicalls.h b/arch/i386/xen/multicalls.h new file mode 100644 index 000000000000..e6f7530b156c --- /dev/null +++ b/arch/i386/xen/multicalls.h @@ -0,0 +1,45 @@ +#ifndef _XEN_MULTICALLS_H +#define _XEN_MULTICALLS_H + +#include "xen-ops.h" + +/* Multicalls */ +struct multicall_space +{ + struct multicall_entry *mc; + void *args; +}; + +/* Allocate room for a multicall and its args */ +struct multicall_space __xen_mc_entry(size_t args); + +DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags); + +/* Call to start a batch of multiple __xen_mc_entry()s. Must be + paired with xen_mc_issue() */ +static inline void xen_mc_batch(void) +{ + /* need to disable interrupts until this entry is complete */ + local_irq_save(__get_cpu_var(xen_mc_irq_flags)); +} + +static inline struct multicall_space xen_mc_entry(size_t args) +{ + xen_mc_batch(); + return __xen_mc_entry(args); +} + +/* Flush all pending multicalls */ +void xen_mc_flush(void); + +/* Issue a multicall if we're not in a lazy mode */ +static inline void xen_mc_issue(unsigned mode) +{ + if ((xen_get_lazy_mode() & mode) == 0) + xen_mc_flush(); + + /* restore flags saved in xen_mc_batch */ + local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); +} + +#endif /* _XEN_MULTICALLS_H */ diff --git a/arch/i386/xen/setup.c b/arch/i386/xen/setup.c new file mode 100644 index 000000000000..f84e77226646 --- /dev/null +++ b/arch/i386/xen/setup.c @@ -0,0 +1,111 @@ +/* + * Machine specific setup for xen + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/pm.h> + +#include <asm/elf.h> +#include <asm/e820.h> +#include <asm/setup.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> + +#include <xen/interface/physdev.h> +#include <xen/features.h> + +#include "xen-ops.h" +#include "vdso.h" + +/* These are code, but not functions. 
Defined in entry.S */ +extern const char xen_hypervisor_callback[]; +extern const char xen_failsafe_callback[]; + +unsigned long *phys_to_machine_mapping; +EXPORT_SYMBOL(phys_to_machine_mapping); + +/** + * machine_specific_memory_setup - Hook for machine specific memory setup. + **/ + +char * __init xen_memory_setup(void) +{ + unsigned long max_pfn = xen_start_info->nr_pages; + + e820.nr_map = 0; + add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM); + + return "Xen"; +} + +static void xen_idle(void) +{ + local_irq_disable(); + + if (need_resched()) + local_irq_enable(); + else { + current_thread_info()->status &= ~TS_POLLING; + smp_mb__after_clear_bit(); + safe_halt(); + current_thread_info()->status |= TS_POLLING; + } +} + +/* + * Set the bit indicating "nosegneg" library variants should be used. + */ +static void fiddle_vdso(void) +{ + extern u32 VDSO_NOTE_MASK; /* See ../kernel/vsyscall-note.S. */ + extern char vsyscall_int80_start; + u32 *mask = (u32 *) ((unsigned long) &VDSO_NOTE_MASK - VDSO_PRELINK + + &vsyscall_int80_start); + *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; +} + +void __init xen_arch_setup(void) +{ + struct physdev_set_iopl set_iopl; + int rc; + + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); + + HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, + __KERNEL_CS, (unsigned long)xen_failsafe_callback); + + set_iopl.iopl = 1; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); + if (rc != 0) + printk(KERN_INFO "physdev_op failed %d\n", rc); + +#ifdef CONFIG_ACPI + if (!(xen_start_info->flags & SIF_INITDOMAIN)) { + printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); + disable_acpi(); + } +#endif + + memcpy(boot_command_line, xen_start_info->cmd_line, + MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? + COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); + + pm_idle = xen_idle; + +#ifdef CONFIG_SMP + /* fill cpus_possible with all available cpus */ + xen_fill_possible_map(); +#endif + + paravirt_disable_iospace(); + + fiddle_vdso(); +} diff --git a/arch/i386/xen/smp.c b/arch/i386/xen/smp.c new file mode 100644 index 000000000000..557b8e24706a --- /dev/null +++ b/arch/i386/xen/smp.c @@ -0,0 +1,404 @@ +/* + * Xen SMP support + * + * This file implements the Xen versions of smp_ops. SMP under Xen is + * very straightforward. Bringing a CPU up is simply a matter of + * loading its initial context and setting it running. + * + * IPIs are handled through the Xen event mechanism. + * + * Because virtual CPUs can be scheduled onto any real CPU, there's no + * useful topology information for the kernel to make use of. As a + * result, all CPUs are treated as if they're single-core and + * single-threaded. + * + * This does not handle HOTPLUG_CPU yet. + */ +#include <linux/sched.h> +#include <linux/err.h> +#include <linux/smp.h> + +#include <asm/paravirt.h> +#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/cpu.h> + +#include <xen/interface/xen.h> +#include <xen/interface/vcpu.h> + +#include <asm/xen/interface.h> +#include <asm/xen/hypercall.h> + +#include <xen/page.h> +#include <xen/events.h> + +#include "xen-ops.h" +#include "mmu.h" + +static cpumask_t cpu_initialized_map; +static DEFINE_PER_CPU(int, resched_irq); +static DEFINE_PER_CPU(int, callfunc_irq); + +/* + * Structure and data for smp_call_function(). 
This is designed to minimise + * static memory requirements. It also looks cleaner. + */ +static DEFINE_SPINLOCK(call_lock); + +struct call_data_struct { + void (*func) (void *info); + void *info; + atomic_t started; + atomic_t finished; + int wait; +}; + +static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); + +static struct call_data_struct *call_data; + +/* + * Reschedule call back. Nothing to do, + * all the work is done automatically when + * we return from the interrupt. + */ +static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) +{ + return IRQ_HANDLED; +} + +static __cpuinit void cpu_bringup_and_idle(void) +{ + int cpu = smp_processor_id(); + + cpu_init(); + + preempt_disable(); + per_cpu(cpu_state, cpu) = CPU_ONLINE; + + xen_setup_cpu_clockevents(); + + /* We can take interrupts now: we're officially "up". */ + local_irq_enable(); + + wmb(); /* make sure everything is out */ + cpu_idle(); +} + +static int xen_smp_intr_init(unsigned int cpu) +{ + int rc; + const char *resched_name, *callfunc_name; + + per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1; + + resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); + rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, + cpu, + xen_reschedule_interrupt, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + resched_name, + NULL); + if (rc < 0) + goto fail; + per_cpu(resched_irq, cpu) = rc; + + callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu); + rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, + cpu, + xen_call_function_interrupt, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + callfunc_name, + NULL); + if (rc < 0) + goto fail; + per_cpu(callfunc_irq, cpu) = rc; + + return 0; + + fail: + if (per_cpu(resched_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); + if (per_cpu(callfunc_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); + return rc; +} + +void __init xen_fill_possible_map(void) +{ + int i, rc; + + for (i = 0; i < NR_CPUS; i++) { + rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); + if (rc >= 0) + cpu_set(i, cpu_possible_map); + } +} + +void __init xen_smp_prepare_boot_cpu(void) +{ + int cpu; + + BUG_ON(smp_processor_id() != 0); + native_smp_prepare_boot_cpu(); + + /* We've switched to the "real" per-cpu gdt, so make sure the + old memory can be recycled */ + make_lowmem_page_readwrite(&per_cpu__gdt_page); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + cpus_clear(cpu_sibling_map[cpu]); + cpus_clear(cpu_core_map[cpu]); + } + + xen_setup_vcpu_info_placement(); +} + +void __init xen_smp_prepare_cpus(unsigned int max_cpus) +{ + unsigned cpu; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + cpus_clear(cpu_sibling_map[cpu]); + cpus_clear(cpu_core_map[cpu]); + } + + smp_store_cpu_info(0); + set_cpu_sibling_map(0); + + if (xen_smp_intr_init(0)) + BUG(); + + cpu_initialized_map = cpumask_of_cpu(0); + + /* Restrict the possible_map according to max_cpus. 
*/ + while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { + for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--) + continue; + cpu_clear(cpu, cpu_possible_map); + } + + for_each_possible_cpu (cpu) { + struct task_struct *idle; + + if (cpu == 0) + continue; + + idle = fork_idle(cpu); + if (IS_ERR(idle)) + panic("failed fork for CPU %d", cpu); + + cpu_set(cpu, cpu_present_map); + } + + //init_xenbus_allowed_cpumask(); +} + +static __cpuinit int +cpu_initialize_context(unsigned int cpu, struct task_struct *idle) +{ + struct vcpu_guest_context *ctxt; + struct gdt_page *gdt = &per_cpu(gdt_page, cpu); + + if (cpu_test_and_set(cpu, cpu_initialized_map)) + return 0; + + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); + if (ctxt == NULL) + return -ENOMEM; + + ctxt->flags = VGCF_IN_KERNEL; + ctxt->user_regs.ds = __USER_DS; + ctxt->user_regs.es = __USER_DS; + ctxt->user_regs.fs = __KERNEL_PERCPU; + ctxt->user_regs.gs = 0; + ctxt->user_regs.ss = __KERNEL_DS; + ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; + ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ + + memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); + + xen_copy_trap_info(ctxt->trap_ctxt); + + ctxt->ldt_ents = 0; + + BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); + make_lowmem_page_readonly(gdt->gdt); + + ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); + ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); + + ctxt->user_regs.cs = __KERNEL_CS; + ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs); + + ctxt->kernel_ss = __KERNEL_DS; + ctxt->kernel_sp = idle->thread.esp0; + + ctxt->event_callback_cs = __KERNEL_CS; + ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; + ctxt->failsafe_callback_cs = __KERNEL_CS; + ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; + + per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); + ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); + + if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt)) + BUG(); + + kfree(ctxt); + return 0; +} + +int __cpuinit xen_cpu_up(unsigned int cpu) +{ + struct task_struct *idle = idle_task(cpu); + int rc; + +#if 0 + rc = cpu_up_check(cpu); + if (rc) + return rc; +#endif + + init_gdt(cpu); + per_cpu(current_task, cpu) = idle; + irq_ctx_init(cpu); + xen_setup_timer(cpu); + + /* make sure interrupts start blocked */ + per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; + + rc = cpu_initialize_context(cpu, idle); + if (rc) + return rc; + + if (num_online_cpus() == 1) + alternatives_smp_switch(1); + + rc = xen_smp_intr_init(cpu); + if (rc) + return rc; + + smp_store_cpu_info(cpu); + set_cpu_sibling_map(cpu); + /* This must be done before setting cpu_online_map */ + wmb(); + + cpu_set(cpu, cpu_online_map); + + rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); + BUG_ON(rc); + + return 0; +} + +void xen_smp_cpus_done(unsigned int max_cpus) +{ +} + +static void stop_self(void *v) +{ + int cpu = smp_processor_id(); + + /* make sure we're not pinning something down */ + load_cr3(swapper_pg_dir); + /* should set up a minimal gdt */ + + HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL); + BUG(); +} + +void xen_smp_send_stop(void) +{ + smp_call_function(stop_self, NULL, 0, 0); +} + +void xen_smp_send_reschedule(int cpu) +{ + xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); +} + + +static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) +{ + unsigned cpu; + + cpus_and(mask, mask, cpu_online_map); + + for_each_cpu_mask(cpu, mask) + xen_send_IPI_one(cpu, vector); +} + +static irqreturn_t xen_call_function_interrupt(int irq, 
void *dev_id) +{ + void (*func) (void *info) = call_data->func; + void *info = call_data->info; + int wait = call_data->wait; + + /* + * Notify initiating CPU that I've grabbed the data and am + * about to execute the function + */ + mb(); + atomic_inc(&call_data->started); + /* + * At this point the info structure may be out of scope unless wait==1 + */ + irq_enter(); + (*func)(info); + irq_exit(); + + if (wait) { + mb(); /* commit everything before setting finished */ + atomic_inc(&call_data->finished); + } + + return IRQ_HANDLED; +} + +int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), + void *info, int wait) +{ + struct call_data_struct data; + int cpus; + + /* Holding any lock stops cpus from going down. */ + spin_lock(&call_lock); + + cpu_clear(smp_processor_id(), mask); + + cpus = cpus_weight(mask); + if (!cpus) { + spin_unlock(&call_lock); + return 0; + } + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + mb(); /* write everything before IPI */ + + /* Send a message to other CPUs and wait for them to respond */ + xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); + + /* Make sure other vcpus get a chance to run. + XXX too severe? Maybe we should check the other CPU's states? */ + HYPERVISOR_sched_op(SCHEDOP_yield, 0); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus || + (wait && atomic_read(&data.finished) != cpus)) + cpu_relax(); + + spin_unlock(&call_lock); + + return 0; +} diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c new file mode 100644 index 000000000000..dfd6db69ead5 --- /dev/null +++ b/arch/i386/xen/time.c @@ -0,0 +1,593 @@ +/* + * Xen time implementation. + * + * This is implemented in terms of a clocksource driver which uses + * the hypervisor clock as a nanosecond timebase, and a clockevent + * driver which uses the hypervisor's timer mechanism. + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/clocksource.h> +#include <linux/clockchips.h> +#include <linux/kernel_stat.h> + +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> + +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/vcpu.h> + +#include "xen-ops.h" + +#define XEN_SHIFT 22 + +/* Xen may fire a timer up to this many ns early */ +#define TIMER_SLOP 100000 +#define NS_PER_TICK (1000000000LL / HZ) + +static cycle_t xen_clocksource_read(void); + +/* These are perodically updated in shared_info, and then copied here. */ +struct shadow_time_info { + u64 tsc_timestamp; /* TSC at last update of time vals. */ + u64 system_timestamp; /* Time, in nanosecs, since boot. 
*/ + u32 tsc_to_nsec_mul; + int tsc_shift; + u32 version; +}; + +static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); + +/* runstate info updated by Xen */ +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); + +/* snapshots of runstate info */ +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot); + +/* unused ns of stolen and blocked time */ +static DEFINE_PER_CPU(u64, residual_stolen); +static DEFINE_PER_CPU(u64, residual_blocked); + +/* return an consistent snapshot of 64-bit time/counter value */ +static u64 get64(const u64 *p) +{ + u64 ret; + + if (BITS_PER_LONG < 64) { + u32 *p32 = (u32 *)p; + u32 h, l; + + /* + * Read high then low, and then make sure high is + * still the same; this will only loop if low wraps + * and carries into high. + * XXX some clean way to make this endian-proof? + */ + do { + h = p32[1]; + barrier(); + l = p32[0]; + barrier(); + } while (p32[1] != h); + + ret = (((u64)h) << 32) | l; + } else + ret = *p; + + return ret; +} + +/* + * Runstate accounting + */ +static void get_runstate_snapshot(struct vcpu_runstate_info *res) +{ + u64 state_time; + struct vcpu_runstate_info *state; + + BUG_ON(preemptible()); + + state = &__get_cpu_var(runstate); + + /* + * The runstate info is always updated by the hypervisor on + * the current CPU, so there's no need to use anything + * stronger than a compiler barrier when fetching it. + */ + do { + state_time = get64(&state->state_entry_time); + barrier(); + *res = *state; + barrier(); + } while (get64(&state->state_entry_time) != state_time); +} + +static void setup_runstate_info(int cpu) +{ + struct vcpu_register_runstate_memory_area area; + + area.addr.v = &per_cpu(runstate, cpu); + + if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, + cpu, &area)) + BUG(); +} + +static void do_stolen_accounting(void) +{ + struct vcpu_runstate_info state; + struct vcpu_runstate_info *snap; + s64 blocked, runnable, offline, stolen; + cputime_t ticks; + + get_runstate_snapshot(&state); + + WARN_ON(state.state != RUNSTATE_running); + + snap = &__get_cpu_var(runstate_snapshot); + + /* work out how much time the VCPU has not been runn*ing* */ + blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]; + runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable]; + offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline]; + + *snap = state; + + /* Add the appropriate number of ticks of stolen time, + including any left-overs from last time. Passing NULL to + account_steal_time accounts the time as stolen. */ + stolen = runnable + offline + __get_cpu_var(residual_stolen); + + if (stolen < 0) + stolen = 0; + + ticks = 0; + while (stolen >= NS_PER_TICK) { + ticks++; + stolen -= NS_PER_TICK; + } + __get_cpu_var(residual_stolen) = stolen; + account_steal_time(NULL, ticks); + + /* Add the appropriate number of ticks of blocked time, + including any left-overs from last time. Passing idle to + account_steal_time accounts the time as idle/wait. */ + blocked += __get_cpu_var(residual_blocked); + + if (blocked < 0) + blocked = 0; + + ticks = 0; + while (blocked >= NS_PER_TICK) { + ticks++; + blocked -= NS_PER_TICK; + } + __get_cpu_var(residual_blocked) = blocked; + account_steal_time(idle_task(smp_processor_id()), ticks); +} + +/* + * Xen sched_clock implementation. Returns the number of unstolen + * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED + * states. 
+ */ +unsigned long long xen_sched_clock(void) +{ + struct vcpu_runstate_info state; + cycle_t now; + u64 ret; + s64 offset; + + /* + * Ideally sched_clock should be called on a per-cpu basis + * anyway, so preempt should already be disabled, but that's + * not current practice at the moment. + */ + preempt_disable(); + + now = xen_clocksource_read(); + + get_runstate_snapshot(&state); + + WARN_ON(state.state != RUNSTATE_running); + + offset = now - state.state_entry_time; + if (offset < 0) + offset = 0; + + ret = state.time[RUNSTATE_blocked] + + state.time[RUNSTATE_running] + + offset; + + preempt_enable(); + + return ret; +} + + +/* Get the CPU speed from Xen */ +unsigned long xen_cpu_khz(void) +{ + u64 cpu_khz = 1000000ULL << 32; + const struct vcpu_time_info *info = + &HYPERVISOR_shared_info->vcpu_info[0].time; + + do_div(cpu_khz, info->tsc_to_system_mul); + if (info->tsc_shift < 0) + cpu_khz <<= -info->tsc_shift; + else + cpu_khz >>= info->tsc_shift; + + return cpu_khz; +} + +/* + * Reads a consistent set of time-base values from Xen, into a shadow data + * area. + */ +static unsigned get_time_values_from_xen(void) +{ + struct vcpu_time_info *src; + struct shadow_time_info *dst; + + /* src is shared memory with the hypervisor, so we need to + make sure we get a consistent snapshot, even in the face of + being preempted. */ + src = &__get_cpu_var(xen_vcpu)->time; + dst = &__get_cpu_var(shadow_time); + + do { + dst->version = src->version; + rmb(); /* fetch version before data */ + dst->tsc_timestamp = src->tsc_timestamp; + dst->system_timestamp = src->system_time; + dst->tsc_to_nsec_mul = src->tsc_to_system_mul; + dst->tsc_shift = src->tsc_shift; + rmb(); /* test version after fetching data */ + } while ((src->version & 1) | (dst->version ^ src->version)); + + return dst->version; +} + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif __x86_64__ + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#else +#error implement me! 
+#endif + + return product; +} + +static u64 get_nsec_offset(struct shadow_time_info *shadow) +{ + u64 now, delta; + now = native_read_tsc(); + delta = now - shadow->tsc_timestamp; + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); +} + +static cycle_t xen_clocksource_read(void) +{ + struct shadow_time_info *shadow = &get_cpu_var(shadow_time); + cycle_t ret; + unsigned version; + + do { + version = get_time_values_from_xen(); + barrier(); + ret = shadow->system_timestamp + get_nsec_offset(shadow); + barrier(); + } while (version != __get_cpu_var(xen_vcpu)->time.version); + + put_cpu_var(shadow_time); + + return ret; +} + +static void xen_read_wallclock(struct timespec *ts) +{ + const struct shared_info *s = HYPERVISOR_shared_info; + u32 version; + u64 delta; + struct timespec now; + + /* get wallclock at system boot */ + do { + version = s->wc_version; + rmb(); /* fetch version before time */ + now.tv_sec = s->wc_sec; + now.tv_nsec = s->wc_nsec; + rmb(); /* fetch time before checking version */ + } while ((s->wc_version & 1) | (version ^ s->wc_version)); + + delta = xen_clocksource_read(); /* time since system boot */ + delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; + + now.tv_nsec = do_div(delta, NSEC_PER_SEC); + now.tv_sec = delta; + + set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); +} + +unsigned long xen_get_wallclock(void) +{ + struct timespec ts; + + xen_read_wallclock(&ts); + + return ts.tv_sec; +} + +int xen_set_wallclock(unsigned long now) +{ + /* do nothing for domU */ + return -1; +} + +static struct clocksource xen_clocksource __read_mostly = { + .name = "xen", + .rating = 400, + .read = xen_clocksource_read, + .mask = ~0, + .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */ + .shift = XEN_SHIFT, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +/* + Xen clockevent implementation + + Xen has two clockevent implementations: + + The old timer_op one works with all released versions of Xen prior + to version 3.0.4. This version of the hypervisor provides a + single-shot timer with nanosecond resolution. However, sharing the + same event channel is a 100Hz tick which is delivered while the + vcpu is running. We don't care about or use this tick, but it will + cause the core time code to think the timer fired too soon, and + will end up resetting it each time. It could be filtered, but + doing so has complications when the ktime clocksource is not yet + the xen clocksource (ie, at boot time). + + The new vcpu_op-based timer interface allows the tick timer period + to be changed or turned off. The tick timer is not useful as a + periodic timer because events are only delivered to running vcpus. + The one-shot timer can report when a timeout is in the past, so + set_next_event is capable of returning -ETIME when appropriate. + This interface is used when available. +*/ + + +/* + Get a hypervisor absolute time. In theory we could maintain an + offset between the kernel's time and the hypervisor's time, and + apply that to a kernel's absolute timeout. Unfortunately the + hypervisor and kernel times can drift even if the kernel is using + the Xen clocksource, because ntp can warp the kernel's clocksource. 
+*/ +static s64 get_abs_timeout(unsigned long delta) +{ + return xen_clocksource_read() + delta; +} + +static void xen_timerop_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + /* unsupported */ + WARN_ON(1); + break; + + case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_RESUME: + break; + + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + HYPERVISOR_set_timer_op(0); /* cancel timeout */ + break; + } +} + +static int xen_timerop_set_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); + + if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0) + BUG(); + + /* We may have missed the deadline, but there's no real way of + knowing for sure. If the event was in the past, then we'll + get an immediate interrupt. */ + + return 0; +} + +static const struct clock_event_device xen_timerop_clockevent = { + .name = "xen", + .features = CLOCK_EVT_FEAT_ONESHOT, + + .max_delta_ns = 0xffffffff, + .min_delta_ns = TIMER_SLOP, + + .mult = 1, + .shift = 0, + .rating = 500, + + .set_mode = xen_timerop_set_mode, + .set_next_event = xen_timerop_set_next_event, +}; + + + +static void xen_vcpuop_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + int cpu = smp_processor_id(); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + WARN_ON(1); /* unsupported */ + break; + + case CLOCK_EVT_MODE_ONESHOT: + if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) + BUG(); + break; + + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) || + HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) + BUG(); + break; + case CLOCK_EVT_MODE_RESUME: + break; + } +} + +static int xen_vcpuop_set_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + int cpu = smp_processor_id(); + struct vcpu_set_singleshot_timer single; + int ret; + + WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); + + single.timeout_abs_ns = get_abs_timeout(delta); + single.flags = VCPU_SSHOTTMR_future; + + ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single); + + BUG_ON(ret != 0 && ret != -ETIME); + + return ret; +} + +static const struct clock_event_device xen_vcpuop_clockevent = { + .name = "xen", + .features = CLOCK_EVT_FEAT_ONESHOT, + + .max_delta_ns = 0xffffffff, + .min_delta_ns = TIMER_SLOP, + + .mult = 1, + .shift = 0, + .rating = 500, + + .set_mode = xen_vcpuop_set_mode, + .set_next_event = xen_vcpuop_set_next_event, +}; + +static const struct clock_event_device *xen_clockevent = + &xen_timerop_clockevent; +static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events); + +static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) +{ + struct clock_event_device *evt = &__get_cpu_var(xen_clock_events); + irqreturn_t ret; + + ret = IRQ_NONE; + if (evt->event_handler) { + evt->event_handler(evt); + ret = IRQ_HANDLED; + } + + do_stolen_accounting(); + + return ret; +} + +void xen_setup_timer(int cpu) +{ + const char *name; + struct clock_event_device *evt; + int irq; + + printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu); + + name = kasprintf(GFP_KERNEL, "timer%d", cpu); + if (!name) + name = "<timer kasprintf failed>"; + + irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + name, NULL); + + evt = &per_cpu(xen_clock_events, cpu); + memcpy(evt, xen_clockevent, sizeof(*evt)); + + evt->cpumask 
= cpumask_of_cpu(cpu); + evt->irq = irq; + + setup_runstate_info(cpu); +} + +void xen_setup_cpu_clockevents(void) +{ + BUG_ON(preemptible()); + + clockevents_register_device(&__get_cpu_var(xen_clock_events)); +} + +__init void xen_time_init(void) +{ + int cpu = smp_processor_id(); + + get_time_values_from_xen(); + + clocksource_register(&xen_clocksource); + + if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { + /* Successfully turned off 100Hz tick, so we have the + vcpuop-based timer interface */ + printk(KERN_DEBUG "Xen: using vcpuop timer interface\n"); + xen_clockevent = &xen_vcpuop_clockevent; + } + + /* Set initial system time with full resolution */ + xen_read_wallclock(&xtime); + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + + tsc_disable = 0; + + xen_setup_timer(cpu); + xen_setup_cpu_clockevents(); +} diff --git a/arch/i386/xen/vdso.h b/arch/i386/xen/vdso.h new file mode 100644 index 000000000000..861fedfe5230 --- /dev/null +++ b/arch/i386/xen/vdso.h @@ -0,0 +1,4 @@ +/* Bit used for the pseudo-hwcap for non-negative segments. We use + bit 1 to avoid bugs in some versions of glibc when bit 0 is + used; the choice is otherwise arbitrary. */ +#define VDSO_NOTE_NONEGSEG_BIT 1 diff --git a/arch/i386/xen/xen-asm.S b/arch/i386/xen/xen-asm.S new file mode 100644 index 000000000000..1a43b60c0c62 --- /dev/null +++ b/arch/i386/xen/xen-asm.S @@ -0,0 +1,291 @@ +/* + Asm versions of Xen pv-ops, suitable for either direct use or inlining. + The inline versions are the same as the direct-use versions, with the + pre- and post-amble chopped off. + + This code is encoded for size rather than absolute efficiency, + with a view to being able to inline as much as possible. + + We only bother with direct forms (ie, vcpu in pda) of the operations + here; the indirect forms are better handled in C, since they're + generally too large to inline anyway. + */ + +#include <linux/linkage.h> + +#include <asm/asm-offsets.h> +#include <asm/thread_info.h> +#include <asm/percpu.h> +#include <asm/processor-flags.h> +#include <asm/segment.h> + +#include <xen/interface/xen.h> + +#define RELOC(x, v) .globl x##_reloc; x##_reloc=v +#define ENDPATCH(x) .globl x##_end; x##_end=. + +/* Pseudo-flag used for virtual NMI, which we don't implement yet */ +#define XEN_EFLAGS_NMI 0x80000000 + +/* + Enable events. This clears the event mask and tests the pending + event status with one and operation. If there are pending + events, then enter the hypervisor to get them handled. + */ +ENTRY(xen_irq_enable_direct) + /* Clear mask and test pending */ + andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending + /* Preempt here doesn't matter because that will deal with + any pending interrupts. The pending check may end up being + run on the wrong CPU, but that doesn't hurt. */ + jz 1f +2: call check_events +1: +ENDPATCH(xen_irq_enable_direct) + ret + ENDPROC(xen_irq_enable_direct) + RELOC(xen_irq_enable_direct, 2b+1) + + +/* + Disabling events is simply a matter of making the event mask + non-zero. + */ +ENTRY(xen_irq_disable_direct) + movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask +ENDPATCH(xen_irq_disable_direct) + ret + ENDPROC(xen_irq_disable_direct) + RELOC(xen_irq_disable_direct, 0) + +/* + (xen_)save_fl is used to get the current interrupt enable status. + Callers expect the status to be in X86_EFLAGS_IF, and other bits + may be set in the return value. 
We take advantage of this by + making sure that X86_EFLAGS_IF has the right value (and other bits + in that byte are 0), but other bits in the return value are + undefined. We need to toggle the state of the bit, because + Xen and x86 use opposite senses (mask vs enable). + */ +ENTRY(xen_save_fl_direct) + testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask + setz %ah + addb %ah,%ah +ENDPATCH(xen_save_fl_direct) + ret + ENDPROC(xen_save_fl_direct) + RELOC(xen_save_fl_direct, 0) + + +/* + In principle the caller should be passing us a value return + from xen_save_fl_direct, but for robustness sake we test only + the X86_EFLAGS_IF flag rather than the whole byte. After + setting the interrupt mask state, it checks for unmasked + pending events and enters the hypervisor to get them delivered + if so. + */ +ENTRY(xen_restore_fl_direct) + testb $X86_EFLAGS_IF>>8, %ah + setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask + /* Preempt here doesn't matter because that will deal with + any pending interrupts. The pending check may end up being + run on the wrong CPU, but that doesn't hurt. */ + + /* check for unmasked and pending */ + cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending + jz 1f +2: call check_events +1: +ENDPATCH(xen_restore_fl_direct) + ret + ENDPROC(xen_restore_fl_direct) + RELOC(xen_restore_fl_direct, 2b+1) + +/* + This is run where a normal iret would be run, with the same stack setup: + 8: eflags + 4: cs + esp-> 0: eip + + This attempts to make sure that any pending events are dealt + with on return to usermode, but there is a small window in + which an event can happen just before entering usermode. If + the nested interrupt ends up setting one of the TIF_WORK_MASK + pending work flags, they will not be tested again before + returning to usermode. This means that a process can end up + with pending work, which will be unprocessed until the process + enters and leaves the kernel again, which could be an + unbounded amount of time. This means that a pending signal or + reschedule event could be indefinitely delayed. + + The fix is to notice a nested interrupt in the critical + window, and if one occurs, then fold the nested interrupt into + the current interrupt stack frame, and re-process it + iteratively rather than recursively. This means that it will + exit via the normal path, and all pending work will be dealt + with appropriately. + + Because the nested interrupt handler needs to deal with the + current stack state in whatever form its in, we keep things + simple by only using a single register which is pushed/popped + on the stack. + + Non-direct iret could be done in the same way, but it would + require an annoying amount of code duplication. We'll assume + that direct mode will be the common case once the hypervisor + support becomes commonplace. + */ +ENTRY(xen_iret_direct) + /* test eflags for special cases */ + testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) + jnz hyper_iret + + push %eax + ESP_OFFSET=4 # bytes pushed onto stack + + /* Store vcpu_info pointer for easy access. Do it this + way to avoid having to reload %fs */ +#ifdef CONFIG_SMP + GET_THREAD_INFO(%eax) + movl TI_cpu(%eax),%eax + movl __per_cpu_offset(,%eax,4),%eax + lea per_cpu__xen_vcpu_info(%eax),%eax +#else + movl $per_cpu__xen_vcpu_info, %eax +#endif + + /* check IF state we're restoring */ + testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) + + /* Maybe enable events. Once this happens we could get a + recursive event, so the critical region starts immediately + afterwards. 
However, if that happens we don't end up + resuming the code, so we don't have to be worried about + being preempted to another CPU. */ + setz XEN_vcpu_info_mask(%eax) +xen_iret_start_crit: + + /* check for unmasked and pending */ + cmpw $0x0001, XEN_vcpu_info_pending(%eax) + + /* If there's something pending, mask events again so we + can jump back into xen_hypervisor_callback */ + sete XEN_vcpu_info_mask(%eax) + + popl %eax + + /* From this point on the registers are restored and the stack + updated, so we don't need to worry about it if we're preempted */ +iret_restore_end: + + /* Jump to hypervisor_callback after fixing up the stack. + Events are masked, so jumping out of the critical + region is OK. */ + je xen_hypervisor_callback + + iret +xen_iret_end_crit: + +hyper_iret: + /* put this out of line since its very rarely used */ + jmp hypercall_page + __HYPERVISOR_iret * 32 + + .globl xen_iret_start_crit, xen_iret_end_crit + +/* + This is called by xen_hypervisor_callback in entry.S when it sees + that the EIP at the time of interrupt was between xen_iret_start_crit + and xen_iret_end_crit. We're passed the EIP in %eax so we can do + a more refined determination of what to do. + + The stack format at this point is: + ---------------- + ss : (ss/esp may be present if we came from usermode) + esp : + eflags } outer exception info + cs } + eip } + ---------------- <- edi (copy dest) + eax : outer eax if it hasn't been restored + ---------------- + eflags } nested exception info + cs } (no ss/esp because we're nested + eip } from the same ring) + orig_eax }<- esi (copy src) + - - - - - - - - + fs } + es } + ds } SAVE_ALL state + eax } + : : + ebx } + ---------------- + return addr <- esp + ---------------- + + In order to deliver the nested exception properly, we need to shift + everything from the return addr up to the error code so it + sits just under the outer exception info. This means that when we + handle the exception, we do it in the context of the outer exception + rather than starting a new one. + + The only caveat is that if the outer eax hasn't been + restored yet (ie, it's still on stack), we need to insert + its value into the SAVE_ALL state before going on, since + it's usermode state which we eventually need to restore. + */ +ENTRY(xen_iret_crit_fixup) + /* offsets +4 for return address */ + + /* + Paranoia: Make sure we're really coming from userspace. + One could imagine a case where userspace jumps into the + critical range address, but just before the CPU delivers a GP, + it decides to deliver an interrupt instead. Unlikely? + Definitely. Easy to avoid? Yes. The Intel documents + explicitly say that the reported EIP for a bad jump is the + jump instruction itself, not the destination, but some virtual + environments get this wrong. + */ + movl PT_CS+4(%esp), %ecx + andl $SEGMENT_RPL_MASK, %ecx + cmpl $USER_RPL, %ecx + je 2f + + lea PT_ORIG_EAX+4(%esp), %esi + lea PT_EFLAGS+4(%esp), %edi + + /* If eip is before iret_restore_end then stack + hasn't been restored yet. */ + cmp $iret_restore_end, %eax + jae 1f + + movl 0+4(%edi),%eax /* copy EAX */ + movl %eax, PT_EAX+4(%esp) + + lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ + + /* set up the copy */ +1: std + mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */ + rep movsl + cld + + lea 4(%edi),%esp /* point esp to new frame */ +2: ret + + +/* + Force an event check by making a hypercall, + but preserve regs before making the call. 
+ */ +check_events: + push %eax + push %ecx + push %edx + call force_evtchn_callback + pop %edx + pop %ecx + pop %eax + ret diff --git a/arch/i386/xen/xen-head.S b/arch/i386/xen/xen-head.S new file mode 100644 index 000000000000..bc71f3bc4014 --- /dev/null +++ b/arch/i386/xen/xen-head.S @@ -0,0 +1,38 @@ +/* Xen-specific pieces of head.S, intended to be included in the right + place in head.S */ + +#ifdef CONFIG_XEN + +#include <linux/elfnote.h> +#include <asm/boot.h> +#include <xen/interface/elfnote.h> + + .section .init.text +ENTRY(startup_xen) + movl %esi,xen_start_info + cld + movl $(init_thread_union+THREAD_SIZE),%esp + jmp xen_start_kernel + +.pushsection ".bss.page_aligned" + .align PAGE_SIZE_asm +ENTRY(hypercall_page) + .skip 0x1000 +.popsection + + .section .text + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") +#ifdef CONFIG_X86_PAE + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") +#else + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no") +#endif + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") + +#endif /*CONFIG_XEN */ diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h new file mode 100644 index 000000000000..b9aaea45f07f --- /dev/null +++ b/arch/i386/xen/xen-ops.h @@ -0,0 +1,71 @@ +#ifndef XEN_OPS_H +#define XEN_OPS_H + +#include <linux/init.h> + +/* These are code, but not functions. Defined in entry.S */ +extern const char xen_hypervisor_callback[]; +extern const char xen_failsafe_callback[]; + +void xen_copy_trap_info(struct trap_info *traps); + +DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); +DECLARE_PER_CPU(unsigned long, xen_cr3); + +extern struct start_info *xen_start_info; +extern struct shared_info *HYPERVISOR_shared_info; + +char * __init xen_memory_setup(void); +void __init xen_arch_setup(void); +void __init xen_init_IRQ(void); + +void xen_setup_timer(int cpu); +void xen_setup_cpu_clockevents(void); +unsigned long xen_cpu_khz(void); +void __init xen_time_init(void); +unsigned long xen_get_wallclock(void); +int xen_set_wallclock(unsigned long time); +unsigned long long xen_sched_clock(void); + +void xen_mark_init_mm_pinned(void); + +DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); + +static inline unsigned xen_get_lazy_mode(void) +{ + return x86_read_percpu(xen_lazy_mode); +} + +void __init xen_fill_possible_map(void); + +void __init xen_setup_vcpu_info_placement(void); +void xen_smp_prepare_boot_cpu(void); +void xen_smp_prepare_cpus(unsigned int max_cpus); +int xen_cpu_up(unsigned int cpu); +void xen_smp_cpus_done(unsigned int max_cpus); + +void xen_smp_send_stop(void); +void xen_smp_send_reschedule(int cpu); +int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic, + int wait); +int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait); + +int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), + void *info, int wait); + + +/* Declare an asm function, along with symbols needed to make it + inlineable */ +#define DECL_ASM(ret, name, ...) 
\ + ret name(__VA_ARGS__); \ + extern char name##_end[]; \ + extern char name##_reloc[] \ + +DECL_ASM(void, xen_irq_enable_direct, void); +DECL_ASM(void, xen_irq_disable_direct, void); +DECL_ASM(unsigned long, xen_save_fl_direct, void); +DECL_ASM(void, xen_restore_fl_direct, unsigned long); + +void xen_iret_direct(void); +#endif /* XEN_OPS_H */
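
The multicall batching that mmu.c leans on throughout (xen_set_pmd(), xen_set_pud(), pin_page() and the pagetable pinning paths all queue work via xen_mc_entry() and xen_mc_issue()) is easiest to see stripped of the kernel plumbing. The stand-alone C sketch below mirrors that flow under illustrative names (mc_entry(), mc_issue(), mc_flush() and do_hypercall() are not part of the patch) and omits details the real code handles, such as the per-CPU buffer, the argument area and the interrupt masking in xen_mc_batch(): operations are queued into a fixed-size buffer, flushed early only when the buffer fills, and otherwise deferred until the caller is no longer in a lazy (PARAVIRT_LAZY_MMU style) section.

    #include <stdio.h>
    #include <stdint.h>

    #define MC_BATCH 32

    struct mc_op { int cmd; uint64_t arg; };

    static struct mc_op buffer[MC_BATCH];
    static unsigned mcidx;
    static int lazy_mode;   /* stand-in for xen_get_lazy_mode() & PARAVIRT_LAZY_MMU */

    /* stands in for HYPERVISOR_multicall(): one trap covering n queued ops */
    static void do_hypercall(struct mc_op *ops, unsigned n)
    {
            (void)ops;
            printf("multicall: %u ops in one trap\n", n);
    }

    /* analogue of xen_mc_flush(): issue whatever has been queued so far */
    static void mc_flush(void)
    {
            if (mcidx)
                    do_hypercall(buffer, mcidx);
            mcidx = 0;
    }

    /* analogue of __xen_mc_entry(): grab a slot, flushing first if the buffer is full */
    static struct mc_op *mc_entry(int cmd, uint64_t arg)
    {
            if (mcidx == MC_BATCH)
                    mc_flush();
            buffer[mcidx].cmd = cmd;
            buffer[mcidx].arg = arg;
            return &buffer[mcidx++];
    }

    /* analogue of xen_mc_issue(): only flush when not inside a lazy section */
    static void mc_issue(void)
    {
            if (!lazy_mode)
                    mc_flush();
    }

    int main(void)
    {
            lazy_mode = 1;                          /* e.g. a batched pagetable update */
            for (int i = 0; i < 100; i++) {
                    mc_entry(1, (uint64_t)i);       /* queue a pagetable-style update */
                    mc_issue();                     /* deferred while lazy */
            }
            lazy_mode = 0;
            mc_issue();                             /* leaving lazy mode: flush the rest */
            return 0;
    }

Run as-is, the loop queues 100 updates but performs only four simulated hypercalls (three full batches of 32 and a final batch of 4), which is the amortization described in the comment at the top of multicalls.c.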