From 265baba316ea258ca015aa79bc6f107cd9fce2b3 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:26 +0200 Subject: [PATCH] Update defconfig Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/defconfig | 109 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 78 insertions(+), 31 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index 5fb970715941..647610ecb580 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.18-rc4 -# Thu Aug 24 21:05:55 2006 +# Linux kernel version: 2.6.18-git5 +# Tue Sep 26 09:30:47 2006 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -19,6 +19,7 @@ CONFIG_GENERIC_ISA_DMA=y CONFIG_GENERIC_IOMAP=y CONFIG_ARCH_MAY_HAVE_PC_FDC=y CONFIG_DMI=y +CONFIG_AUDIT_ARCH=y CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" # @@ -38,16 +39,16 @@ CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y # CONFIG_BSD_PROCESS_ACCT is not set # CONFIG_TASKSTATS is not set -CONFIG_SYSCTL=y # CONFIG_AUDIT is not set CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y # CONFIG_CPUSETS is not set # CONFIG_RELAY is not set CONFIG_INITRAMFS_SOURCE="" -CONFIG_UID16=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y # CONFIG_EMBEDDED is not set +CONFIG_UID16=y +CONFIG_SYSCTL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y # CONFIG_KALLSYMS_EXTRA_PASS is not set @@ -56,12 +57,12 @@ CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y CONFIG_BASE_FULL=y -CONFIG_RT_MUTEXES=y CONFIG_FUTEX=y CONFIG_EPOLL=y CONFIG_SHMEM=y CONFIG_SLAB=y CONFIG_VM_EVENT_COUNTERS=y +CONFIG_RT_MUTEXES=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 # CONFIG_SLOB is not set @@ -160,6 +161,7 @@ CONFIG_X86_MCE_AMD=y # CONFIG_CRASH_DUMP is not set CONFIG_PHYSICAL_START=0x200000 CONFIG_SECCOMP=y +# CONFIG_CC_STACKPROTECTOR is not set # CONFIG_HZ_100 is not set CONFIG_HZ_250=y # CONFIG_HZ_1000 is not set @@ -307,18 +309,23 @@ CONFIG_IP_PNP_DHCP=y CONFIG_INET_DIAG=y CONFIG_INET_TCP_DIAG=y # CONFIG_TCP_CONG_ADVANCED is not set -CONFIG_TCP_CONG_BIC=y +CONFIG_TCP_CONG_CUBIC=y +CONFIG_DEFAULT_TCP_CONG="cubic" CONFIG_IPV6=y # CONFIG_IPV6_PRIVACY is not set # CONFIG_IPV6_ROUTER_PREF is not set # CONFIG_INET6_AH is not set # CONFIG_INET6_ESP is not set # CONFIG_INET6_IPCOMP is not set +# CONFIG_IPV6_MIP6 is not set # CONFIG_INET6_XFRM_TUNNEL is not set # CONFIG_INET6_TUNNEL is not set # CONFIG_INET6_XFRM_MODE_TRANSPORT is not set # CONFIG_INET6_XFRM_MODE_TUNNEL is not set +# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set # CONFIG_IPV6_TUNNEL is not set +# CONFIG_IPV6_SUBTREES is not set +# CONFIG_IPV6_MULTIPLE_TABLES is not set # CONFIG_NETWORK_SECMARK is not set # CONFIG_NETFILTER is not set @@ -345,7 +352,6 @@ CONFIG_IPV6=y # CONFIG_ATALK is not set # CONFIG_X25 is not set # CONFIG_LAPB is not set -# CONFIG_NET_DIVERT is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set @@ -487,6 +493,7 @@ CONFIG_IDEDMA_AUTO=y # # CONFIG_RAID_ATTRS is not set CONFIG_SCSI=y +CONFIG_SCSI_NETLINK=y # CONFIG_SCSI_PROC_FS is not set # @@ -508,12 +515,13 @@ CONFIG_SCSI_CONSTANTS=y # CONFIG_SCSI_LOGGING is not set # -# SCSI Transport Attributes +# SCSI Transports # CONFIG_SCSI_SPI_ATTRS=y CONFIG_SCSI_FC_ATTRS=y # CONFIG_SCSI_ISCSI_ATTRS is not set CONFIG_SCSI_SAS_ATTRS=y +# CONFIG_SCSI_SAS_LIBSAS is not set # # SCSI low-level drivers @@ -532,29 +540,14 @@ CONFIG_AIC79XX_RESET_DELAY_MS=4000 # CONFIG_AIC79XX_DEBUG_ENABLE is not set CONFIG_AIC79XX_DEBUG_MASK=0 # 
CONFIG_AIC79XX_REG_PRETTY_PRINT is not set +# CONFIG_SCSI_AIC94XX is not set +# CONFIG_SCSI_ARCMSR is not set CONFIG_MEGARAID_NEWGEN=y CONFIG_MEGARAID_MM=y CONFIG_MEGARAID_MAILBOX=y # CONFIG_MEGARAID_LEGACY is not set CONFIG_MEGARAID_SAS=y -CONFIG_SCSI_SATA=y -CONFIG_SCSI_SATA_AHCI=y -CONFIG_SCSI_SATA_SVW=y -CONFIG_SCSI_ATA_PIIX=y -# CONFIG_SCSI_SATA_MV is not set -CONFIG_SCSI_SATA_NV=y -# CONFIG_SCSI_PDC_ADMA is not set # CONFIG_SCSI_HPTIOP is not set -# CONFIG_SCSI_SATA_QSTOR is not set -# CONFIG_SCSI_SATA_PROMISE is not set -# CONFIG_SCSI_SATA_SX4 is not set -CONFIG_SCSI_SATA_SIL=y -# CONFIG_SCSI_SATA_SIL24 is not set -# CONFIG_SCSI_SATA_SIS is not set -# CONFIG_SCSI_SATA_ULI is not set -CONFIG_SCSI_SATA_VIA=y -# CONFIG_SCSI_SATA_VITESSE is not set -CONFIG_SCSI_SATA_INTEL_COMBINED=y # CONFIG_SCSI_BUSLOGIC is not set # CONFIG_SCSI_DMX3191D is not set # CONFIG_SCSI_EATA is not set @@ -563,6 +556,7 @@ CONFIG_SCSI_SATA_INTEL_COMBINED=y # CONFIG_SCSI_IPS is not set # CONFIG_SCSI_INITIO is not set # CONFIG_SCSI_INIA100 is not set +# CONFIG_SCSI_STEX is not set # CONFIG_SCSI_SYM53C8XX_2 is not set # CONFIG_SCSI_IPR is not set # CONFIG_SCSI_QLOGIC_1280 is not set @@ -572,6 +566,62 @@ CONFIG_SCSI_SATA_INTEL_COMBINED=y # CONFIG_SCSI_DC390T is not set # CONFIG_SCSI_DEBUG is not set +# +# Serial ATA (prod) and Parallel ATA (experimental) drivers +# +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +CONFIG_SATA_SVW=y +CONFIG_ATA_PIIX=y +# CONFIG_SATA_MV is not set +CONFIG_SATA_NV=y +# CONFIG_PDC_ADMA is not set +# CONFIG_SATA_QSTOR is not set +# CONFIG_SATA_PROMISE is not set +# CONFIG_SATA_SX4 is not set +CONFIG_SATA_SIL=y +# CONFIG_SATA_SIL24 is not set +# CONFIG_SATA_SIS is not set +# CONFIG_SATA_ULI is not set +CONFIG_SATA_VIA=y +# CONFIG_SATA_VITESSE is not set +CONFIG_SATA_INTEL_COMBINED=y +# CONFIG_PATA_ALI is not set +# CONFIG_PATA_AMD is not set +# CONFIG_PATA_ARTOP is not set +# CONFIG_PATA_ATIIXP is not set +# CONFIG_PATA_CMD64X is not set +# CONFIG_PATA_CS5520 is not set +# CONFIG_PATA_CS5530 is not set +# CONFIG_PATA_CYPRESS is not set +# CONFIG_PATA_EFAR is not set +# CONFIG_ATA_GENERIC is not set +# CONFIG_PATA_HPT366 is not set +# CONFIG_PATA_HPT37X is not set +# CONFIG_PATA_HPT3X2N is not set +# CONFIG_PATA_HPT3X3 is not set +# CONFIG_PATA_IT821X is not set +# CONFIG_PATA_JMICRON is not set +# CONFIG_PATA_LEGACY is not set +# CONFIG_PATA_TRIFLEX is not set +# CONFIG_PATA_MPIIX is not set +# CONFIG_PATA_OLDPIIX is not set +# CONFIG_PATA_NETCELL is not set +# CONFIG_PATA_NS87410 is not set +# CONFIG_PATA_OPTI is not set +# CONFIG_PATA_OPTIDMA is not set +# CONFIG_PATA_PDC_OLD is not set +# CONFIG_PATA_QDI is not set +# CONFIG_PATA_RADISYS is not set +# CONFIG_PATA_RZ1000 is not set +# CONFIG_PATA_SC1200 is not set +# CONFIG_PATA_SERVERWORKS is not set +# CONFIG_PATA_PDC2027X is not set +# CONFIG_PATA_SIL680 is not set +# CONFIG_PATA_SIS is not set +# CONFIG_PATA_VIA is not set +# CONFIG_PATA_WINBOND is not set + # # Multi-device support (RAID and LVM) # @@ -678,6 +728,7 @@ CONFIG_NET_PCI=y # CONFIG_ADAPTEC_STARFIRE is not set CONFIG_B44=y CONFIG_FORCEDETH=y +# CONFIG_FORCEDETH_NAPI is not set # CONFIG_DGRS is not set # CONFIG_EEPRO100 is not set CONFIG_E100=y @@ -714,6 +765,7 @@ CONFIG_E1000=y # CONFIG_VIA_VELOCITY is not set CONFIG_TIGON3=y CONFIG_BNX2=y +# CONFIG_QLA3XXX is not set # # Ethernet (10000 Mbit) @@ -1036,6 +1088,7 @@ CONFIG_SOUND=y # Open Sound System # CONFIG_SOUND_PRIME=y +CONFIG_OSS_OBSOLETE_DRIVER=y # CONFIG_SOUND_BT878 is not set # CONFIG_SOUND_EMU10K1 is not set # 
CONFIG_SOUND_FUSION is not set @@ -1046,7 +1099,6 @@ CONFIG_SOUND_ICH=y # CONFIG_SOUND_MSNDPIN is not set # CONFIG_SOUND_VIA82CXXX is not set # CONFIG_SOUND_OSS is not set -# CONFIG_SOUND_TVMIXER is not set # # USB support @@ -1203,7 +1255,6 @@ CONFIG_USB_MON=y # InfiniBand support # # CONFIG_INFINIBAND is not set -# CONFIG_IPATH_CORE is not set # # EDAC - error detection and reporting (RAS) (EXPERIMENTAL) @@ -1448,10 +1499,6 @@ CONFIG_DEBUG_STACKOVERFLOW=y # # CONFIG_CRYPTO is not set -# -# Hardware crypto devices -# - # # Library routines # -- cgit v1.2.3 From b07f8915cda3fcd73b8b68075ba1e6cd0673365d Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:26 +0200 Subject: [PATCH] x86: Temporarily revert parts of the Core 2 nmi nmi watchdog support This makes merging easier. They are readded a few patches later. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/i386/kernel/nmi.c | 65 +------------------------- arch/x86_64/kernel/nmi.c | 81 ++------------------------------- include/asm-i386/intel_arch_perfmon.h | 19 -------- include/asm-x86_64/intel_arch_perfmon.h | 19 -------- 4 files changed, 6 insertions(+), 178 deletions(-) delete mode 100644 include/asm-i386/intel_arch_perfmon.h delete mode 100644 include/asm-x86_64/intel_arch_perfmon.h (limited to 'arch/x86_64') diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index acb351478e42..1282d70ff971 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -24,7 +24,6 @@ #include <asm/smp.h> #include <asm/nmi.h> -#include <asm/intel_arch_perfmon.h> #include "mach_traps.h" @@ -96,9 +95,6 @@ int nmi_active; (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) -#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL -#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK - #ifdef CONFIG_SMP /* The performance counters used by NMI_LOCAL_APIC don't trigger when * the CPU is idle. To make sure the NMI watchdog really ticks on all @@ -211,8 +207,6 @@ static int __init setup_nmi_watchdog(char *str) __setup("nmi_watchdog=", setup_nmi_watchdog); -static void disable_intel_arch_watchdog(void); - static void disable_lapic_nmi_watchdog(void) { if (nmi_active <= 0) @@ -222,10 +216,6 @@ static void disable_lapic_nmi_watchdog(void) wrmsr(MSR_K7_EVNTSEL0, 0, 0); break; case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - disable_intel_arch_watchdog(); - break; - } switch (boot_cpu_data.x86) { case 6: if (boot_cpu_data.x86_model > 0xd) @@ -454,53 +444,6 @@ static int setup_p4_watchdog(void) return 1; } -static void disable_intel_arch_watchdog(void) -{ - unsigned ebx; - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebp indicates event present. - */ - ebx = cpuid_ebx(10); - if (!(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, 0, 0); -} - -static int setup_intel_arch_watchdog(void) -{ - unsigned int evntsel; - unsigned ebx; - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebp indicates event present. 
- */ - ebx = cpuid_ebx(10); - if ((ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - return 0; - - nmi_perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; - - clear_msr_range(MSR_ARCH_PERFMON_EVENTSEL0, 2); - clear_msr_range(MSR_ARCH_PERFMON_PERFCTR0, 2); - - evntsel = ARCH_PERFMON_EVENTSEL_INT - | ARCH_PERFMON_EVENTSEL_OS - | ARCH_PERFMON_EVENTSEL_USR - | ARCH_PERFMON_NMI_EVENT_SEL - | ARCH_PERFMON_NMI_EVENT_UMASK; - - wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); - write_watchdog_counter("INTEL_ARCH_PERFCTR0"); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); - return 1; -} - void setup_apic_nmi_watchdog (void) { switch (boot_cpu_data.x86_vendor) { @@ -510,11 +453,6 @@ void setup_apic_nmi_watchdog (void) setup_k7_watchdog(); break; case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - if (!setup_intel_arch_watchdog()) - return; - break; - } switch (boot_cpu_data.x86) { case 6: if (boot_cpu_data.x86_model > 0xd) @@ -619,8 +557,7 @@ void nmi_watchdog_tick (struct pt_regs * regs) wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); apic_write(APIC_LVTPC, APIC_DM_NMI); } - else if (nmi_perfctr_msr == MSR_P6_PERFCTR0 || - nmi_perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { + else if (nmi_perfctr_msr == MSR_P6_PERFCTR0) { /* Only P6 based Pentium M need to re-unmask * the apic vector but it doesn't hurt * other P6 variant */ diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 5baa0c726e97..42c05d6907b9 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -26,7 +26,6 @@ #include <asm/proto.h> #include <asm/kdebug.h> #include <asm/mce.h> -#include <asm/intel_arch_perfmon.h> /* * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: @@ -66,9 +65,6 @@ static unsigned int nmi_p4_cccr_val; #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 #define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING -#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL -#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK - #define MSR_P4_MISC_ENABLE 0x1A0 #define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) #define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12) @@ -100,10 +96,7 @@ static __cpuinit inline int nmi_known_cpu(void) case X86_VENDOR_AMD: return boot_cpu_data.x86 == 15; case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return 1; - else - return (boot_cpu_data.x86 == 15); + return boot_cpu_data.x86 == 15; } return 0; } @@ -209,8 +202,6 @@ int __init setup_nmi_watchdog(char *str) __setup("nmi_watchdog=", setup_nmi_watchdog); -static void disable_intel_arch_watchdog(void); - static void disable_lapic_nmi_watchdog(void) { if (nmi_active <= 0) @@ -223,8 +214,6 @@ static void disable_lapic_nmi_watchdog(void) if (boot_cpu_data.x86 == 15) { wrmsr(MSR_P4_IQ_CCCR0, 0, 0); wrmsr(MSR_P4_CRU_ESCR0, 0, 0); - } else if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - disable_intel_arch_watchdog(); } break; } @@ -377,53 +366,6 @@ static void setup_k7_watchdog(void) wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); } -static void disable_intel_arch_watchdog(void) -{ - unsigned ebx; - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebp indicates event present. 
- */ - ebx = cpuid_ebx(10); - if (!(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, 0, 0); -} - -static int setup_intel_arch_watchdog(void) -{ - unsigned int evntsel; - unsigned ebx; - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebp indicates event present. - */ - ebx = cpuid_ebx(10); - if ((ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - return 0; - - nmi_perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; - - clear_msr_range(MSR_ARCH_PERFMON_EVENTSEL0, 2); - clear_msr_range(MSR_ARCH_PERFMON_PERFCTR0, 2); - - evntsel = ARCH_PERFMON_EVENTSEL_INT - | ARCH_PERFMON_EVENTSEL_OS - | ARCH_PERFMON_EVENTSEL_USR - | ARCH_PERFMON_NMI_EVENT_SEL - | ARCH_PERFMON_NMI_EVENT_UMASK; - - wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); - wrmsrl(MSR_ARCH_PERFMON_PERFCTR0, -((u64)cpu_khz * 1000 / nmi_hz)); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); - return 1; -} - static int setup_p4_watchdog(void) { @@ -477,16 +419,10 @@ void setup_apic_nmi_watchdog(void) setup_k7_watchdog(); break; case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - if (!setup_intel_arch_watchdog()) - return; - } else if (boot_cpu_data.x86 == 15) { - if (!setup_p4_watchdog()) - return; - } else { + if (boot_cpu_data.x86 != 15) + return; + if (!setup_p4_watchdog()) return; - } - break; default: @@ -571,14 +507,7 @@ void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) */ wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); apic_write(APIC_LVTPC, APIC_DM_NMI); - } else if (nmi_perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { - /* - * For Intel based architectural perfmon - * - LVTPC is masked on interrupt and must be - * unmasked by the LVTPC handler. 
- */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - } + } wrmsrl(nmi_perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); } } diff --git a/include/asm-i386/intel_arch_perfmon.h b/include/asm-i386/intel_arch_perfmon.h deleted file mode 100644 index 134ea9cc5283..000000000000 --- a/include/asm-i386/intel_arch_perfmon.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef X86_INTEL_ARCH_PERFMON_H -#define X86_INTEL_ARCH_PERFMON_H 1 - -#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 -#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 - -#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 -#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 - -#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) -#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) -#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) -#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) - -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT (1 << 0) - -#endif /* X86_INTEL_ARCH_PERFMON_H */ diff --git a/include/asm-x86_64/intel_arch_perfmon.h b/include/asm-x86_64/intel_arch_perfmon.h deleted file mode 100644 index 59c396431569..000000000000 --- a/include/asm-x86_64/intel_arch_perfmon.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef X86_64_INTEL_ARCH_PERFMON_H -#define X86_64_INTEL_ARCH_PERFMON_H 1 - -#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 -#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 - -#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 -#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 - -#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) -#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) -#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) -#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) - -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT (1 << 0) - -#endif /* X86_64_INTEL_ARCH_PERFMON_H */ -- cgit v1.2.3 From 828f0afda123a96ff4e8078f057a302f4b4232ae Mon Sep 17 00:00:00 2001 From: Don Zickus <dzickus@redhat.com> Date: Tue, 26 Sep 2006 10:52:26 +0200 Subject: [PATCH] x86: Add performance counter reservation framework for UP kernels Adds basic infrastructure to allow subsystems to reserve performance counters on the x86 chips. Only UP kernels are supported in this patch to make reviewing easier. The SMP portion makes a lot more changes. Think of this as a locking mechanism where each bit represents a different counter. In addition, each subsystem should also reserve an appropriate event selection register that will correspond to the performance counter it will be using (this is mainly neccessary for the Pentium 4 chips as they break the 1:1 relationship to performance counters). This will help prevent subsystems like oprofile from interfering with the nmi watchdog. 
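To make the "locking mechanism" analogy above concrete (an illustrative sketch, not part of the patch): reserving a counter amounts to an atomic test-and-set of the bit its MSR maps to, and releasing it clears that bit. A minimal user-space rendering of the same idea, with one global mask standing in for the per-CPU perfctr_nmi_owner and simplified helper names (owner_mask, msr_to_bit, reserve_counter are made up for this sketch):

#include <stdio.h>
#include <stdatomic.h>

#define MSR_K7_PERFCTR0 0xc0010004u     /* base MSR, used here as the example key */

static atomic_ulong owner_mask;         /* one bit per counter, like perfctr_nmi_owner */

static unsigned msr_to_bit(unsigned msr)
{
        return msr - MSR_K7_PERFCTR0;   /* bit offset = distance from the base MSR */
}

static int reserve_counter(unsigned msr)
{
        unsigned long bit = 1UL << msr_to_bit(msr);

        /* atomic test-and-set: succeeds only if the bit was previously clear */
        return !(atomic_fetch_or(&owner_mask, bit) & bit);
}

static void release_counter(unsigned msr)
{
        atomic_fetch_and(&owner_mask, ~(1UL << msr_to_bit(msr)));
}

int main(void)
{
        printf("first reserve:  %d\n", reserve_counter(MSR_K7_PERFCTR0)); /* 1: success */
        printf("second reserve: %d\n", reserve_counter(MSR_K7_PERFCTR0)); /* 0: already owned */
        release_counter(MSR_K7_PERFCTR0);
        printf("after release:  %d\n", reserve_counter(MSR_K7_PERFCTR0)); /* 1 again */
        return 0;
}

The kernel code below does the same thing with test_and_set_bit()/clear_bit() on a per-CPU variable, so oprofile and the NMI watchdog cannot claim the same counter.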
Signed-off-by: Don Zickus <dzickus@redhat.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/i386/kernel/nmi.c | 188 +++++++++++++++++++++++++++++++++++++++-------- arch/x86_64/kernel/nmi.c | 178 ++++++++++++++++++++++++++++++++++---------- include/asm-i386/nmi.h | 7 ++ include/asm-x86_64/nmi.h | 8 +- 4 files changed, 308 insertions(+), 73 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 1282d70ff971..5d58dfeacd59 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -34,6 +34,20 @@ static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ static unsigned int nmi_p4_cccr_val; extern void show_registers(struct pt_regs *regs); +/* perfctr_nmi_owner tracks the ownership of the perfctr registers: + * evtsel_nmi_owner tracks the ownership of the event selection + * - different performance counters/ event selection may be reserved for + * different subsystems this reservation system just tries to coordinate + * things a little + */ +static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner); +static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]); + +/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's + * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) + */ +#define NMI_MAX_COUNTER_BITS 66 + /* * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: * - it may be reserved by some other driver, or not @@ -95,6 +109,105 @@ int nmi_active; (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) +/* converts an msr to an appropriate reservation bit */ +static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) +{ + /* returns the bit offset of the performance counter register */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + return (msr - MSR_K7_PERFCTR0); + case X86_VENDOR_INTEL: + switch (boot_cpu_data.x86) { + case 6: + return (msr - MSR_P6_PERFCTR0); + case 15: + return (msr - MSR_P4_BPU_PERFCTR0); + } + } + return 0; +} + +/* converts an msr to an appropriate reservation bit */ +static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) +{ + /* returns the bit offset of the event selection register */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + return (msr - MSR_K7_EVNTSEL0); + case X86_VENDOR_INTEL: + switch (boot_cpu_data.x86) { + case 6: + return (msr - MSR_P6_EVNTSEL0); + case 15: + return (msr - MSR_P4_BSU_ESCR0); + } + } + return 0; +} + +/* checks for a bit availability (hack for oprofile) */ +int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) +{ + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner))); +} + +/* checks the an msr for availability */ +int avail_to_resrv_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner))); +} + +int reserve_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner))) + return 1; + return 0; +} + +void release_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner)); +} + 
+int reserve_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0])) + return 1; + return 0; +} + +void release_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]); +} + #ifdef CONFIG_SMP /* The performance counters used by NMI_LOCAL_APIC don't trigger when * the CPU is idle. To make sure the NMI watchdog really ticks on all @@ -344,14 +457,6 @@ late_initcall(init_lapic_nmi_sysfs); * Original code written by Keith Owens. */ -static void clear_msr_range(unsigned int base, unsigned int n) -{ - unsigned int i; - - for(i = 0; i < n; ++i) - wrmsr(base+i, 0, 0); -} - static void write_watchdog_counter(const char *descr) { u64 count = (u64)cpu_khz * 1000; @@ -362,14 +467,19 @@ static void write_watchdog_counter(const char *descr) wrmsrl(nmi_perfctr_msr, 0 - count); } -static void setup_k7_watchdog(void) +static int setup_k7_watchdog(void) { unsigned int evntsel; nmi_perfctr_msr = MSR_K7_PERFCTR0; - clear_msr_range(MSR_K7_EVNTSEL0, 4); - clear_msr_range(MSR_K7_PERFCTR0, 4); + if (!reserve_perfctr_nmi(nmi_perfctr_msr)) + goto fail; + + if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0)) + goto fail1; + + wrmsrl(MSR_K7_PERFCTR0, 0UL); evntsel = K7_EVNTSEL_INT | K7_EVNTSEL_OS @@ -381,16 +491,24 @@ static void setup_k7_watchdog(void) apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= K7_EVNTSEL_ENABLE; wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); + return 1; +fail1: + release_perfctr_nmi(nmi_perfctr_msr); +fail: + return 0; } -static void setup_p6_watchdog(void) +static int setup_p6_watchdog(void) { unsigned int evntsel; nmi_perfctr_msr = MSR_P6_PERFCTR0; - clear_msr_range(MSR_P6_EVNTSEL0, 2); - clear_msr_range(MSR_P6_PERFCTR0, 2); + if (!reserve_perfctr_nmi(nmi_perfctr_msr)) + goto fail; + + if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0)) + goto fail1; evntsel = P6_EVNTSEL_INT | P6_EVNTSEL_OS @@ -402,6 +520,11 @@ static void setup_p6_watchdog(void) apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= P6_EVNTSEL0_ENABLE; wrmsr(MSR_P6_EVNTSEL0, evntsel, 0); + return 1; +fail1: + release_perfctr_nmi(nmi_perfctr_msr); +fail: + return 0; } static int setup_p4_watchdog(void) @@ -419,22 +542,11 @@ static int setup_p4_watchdog(void) nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1; #endif - if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL)) - clear_msr_range(0x3F1, 2); - /* MSR 0x3F0 seems to have a default value of 0xFC00, but current - docs doesn't fully define it, so leave it alone for now. 
*/ - if (boot_cpu_data.x86_model >= 0x3) { - /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */ - clear_msr_range(0x3A0, 26); - clear_msr_range(0x3BC, 3); - } else { - clear_msr_range(0x3A0, 31); - } - clear_msr_range(0x3C0, 6); - clear_msr_range(0x3C8, 6); - clear_msr_range(0x3E0, 2); - clear_msr_range(MSR_P4_CCCR0, 18); - clear_msr_range(MSR_P4_PERFCTR0, 18); + if (!reserve_perfctr_nmi(nmi_perfctr_msr)) + goto fail; + + if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0)) + goto fail1; wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0); wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0); @@ -442,6 +554,10 @@ static int setup_p4_watchdog(void) apic_write(APIC_LVTPC, APIC_DM_NMI); wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); return 1; +fail1: + release_perfctr_nmi(nmi_perfctr_msr); +fail: + return 0; } void setup_apic_nmi_watchdog (void) @@ -450,7 +566,8 @@ void setup_apic_nmi_watchdog (void) case X86_VENDOR_AMD: if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15) return; - setup_k7_watchdog(); + if (!setup_k7_watchdog()) + return; break; case X86_VENDOR_INTEL: switch (boot_cpu_data.x86) { @@ -458,7 +575,8 @@ void setup_apic_nmi_watchdog (void) if (boot_cpu_data.x86_model > 0xd) return; - setup_p6_watchdog(); + if(!setup_p6_watchdog()) + return; break; case 15: if (boot_cpu_data.x86_model > 0x4) @@ -612,6 +730,12 @@ int proc_unknown_nmi_panic(ctl_table *table, int write, struct file *file, EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); +EXPORT_SYMBOL(reserve_perfctr_nmi); +EXPORT_SYMBOL(release_perfctr_nmi); +EXPORT_SYMBOL(reserve_evntsel_nmi); +EXPORT_SYMBOL(release_evntsel_nmi); EXPORT_SYMBOL(reserve_lapic_nmi); EXPORT_SYMBOL(release_lapic_nmi); EXPORT_SYMBOL(disable_timer_nmi_watchdog); diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 42c05d6907b9..b7a7c9973849 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -27,6 +27,20 @@ #include <asm/kdebug.h> #include <asm/mce.h> +/* perfctr_nmi_owner tracks the ownership of the perfctr registers: + * evtsel_nmi_owner tracks the ownership of the event selection + * - different performance counters/ event selection may be reserved for + * different subsystems this reservation system just tries to coordinate + * things a little + */ +static DEFINE_PER_CPU(unsigned, perfctr_nmi_owner); +static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[2]); + +/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's + * offset from MSR_P4_BSU_ESCR0. 
It will be the max for all platforms (for now) + */ +#define NMI_MAX_COUNTER_BITS 66 + /* * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: * - it may be reserved by some other driver, or not @@ -90,6 +104,95 @@ static unsigned int nmi_p4_cccr_val; (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) +/* converts an msr to an appropriate reservation bit */ +static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) +{ + /* returns the bit offset of the performance counter register */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + return (msr - MSR_K7_PERFCTR0); + case X86_VENDOR_INTEL: + return (msr - MSR_P4_BPU_PERFCTR0); + } + return 0; +} + +/* converts an msr to an appropriate reservation bit */ +static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) +{ + /* returns the bit offset of the event selection register */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + return (msr - MSR_K7_EVNTSEL0); + case X86_VENDOR_INTEL: + return (msr - MSR_P4_BSU_ESCR0); + } + return 0; +} + +/* checks for a bit availability (hack for oprofile) */ +int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) +{ + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner))); +} + +/* checks the an msr for availability */ +int avail_to_resrv_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner))); +} + +int reserve_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner))) + return 1; + return 0; +} + +void release_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner)); +} + +int reserve_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner))) + return 1; + return 0; +} + +void release_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)); +} + static __cpuinit inline int nmi_known_cpu(void) { switch (boot_cpu_data.x86_vendor) { @@ -325,34 +428,22 @@ late_initcall(init_lapic_nmi_sysfs); #endif /* CONFIG_PM */ -/* - * Activate the NMI watchdog via the local APIC. - * Original code written by Keith Owens. 
- */ - -static void clear_msr_range(unsigned int base, unsigned int n) +static int setup_k7_watchdog(void) { - unsigned int i; - - for(i = 0; i < n; ++i) - wrmsr(base+i, 0, 0); -} - -static void setup_k7_watchdog(void) -{ - int i; unsigned int evntsel; nmi_perfctr_msr = MSR_K7_PERFCTR0; - for(i = 0; i < 4; ++i) { - /* Simulator may not support it */ - if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) { - nmi_perfctr_msr = 0; - return; - } - wrmsrl(MSR_K7_PERFCTR0+i, 0UL); - } + if (!reserve_perfctr_nmi(nmi_perfctr_msr)) + goto fail; + + if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0)) + goto fail1; + + /* Simulator may not support it */ + if (checking_wrmsrl(MSR_K7_EVNTSEL0, 0UL)) + goto fail2; + wrmsrl(MSR_K7_PERFCTR0, 0UL); evntsel = K7_EVNTSEL_INT | K7_EVNTSEL_OS @@ -364,6 +455,13 @@ static void setup_k7_watchdog(void) apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= K7_EVNTSEL_ENABLE; wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); + return 1; +fail2: + release_evntsel_nmi(MSR_K7_EVNTSEL0); +fail1: + release_perfctr_nmi(nmi_perfctr_msr); +fail: + return 0; } @@ -382,22 +480,11 @@ static int setup_p4_watchdog(void) nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1; #endif - if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL)) - clear_msr_range(0x3F1, 2); - /* MSR 0x3F0 seems to have a default value of 0xFC00, but current - docs doesn't fully define it, so leave it alone for now. */ - if (boot_cpu_data.x86_model >= 0x3) { - /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */ - clear_msr_range(0x3A0, 26); - clear_msr_range(0x3BC, 3); - } else { - clear_msr_range(0x3A0, 31); - } - clear_msr_range(0x3C0, 6); - clear_msr_range(0x3C8, 6); - clear_msr_range(0x3E0, 2); - clear_msr_range(MSR_P4_CCCR0, 18); - clear_msr_range(MSR_P4_PERFCTR0, 18); + if (!reserve_perfctr_nmi(nmi_perfctr_msr)) + goto fail; + + if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0)) + goto fail1; wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0); wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0); @@ -406,6 +493,10 @@ static int setup_p4_watchdog(void) apic_write(APIC_LVTPC, APIC_DM_NMI); wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); return 1; +fail1: + release_perfctr_nmi(nmi_perfctr_msr); +fail: + return 0; } void setup_apic_nmi_watchdog(void) @@ -416,7 +507,8 @@ void setup_apic_nmi_watchdog(void) return; if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) return; - setup_k7_watchdog(); + if (!setup_k7_watchdog()) + return; break; case X86_VENDOR_INTEL: if (boot_cpu_data.x86 != 15) @@ -588,6 +680,12 @@ int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); +EXPORT_SYMBOL(reserve_perfctr_nmi); +EXPORT_SYMBOL(release_perfctr_nmi); +EXPORT_SYMBOL(reserve_evntsel_nmi); +EXPORT_SYMBOL(release_evntsel_nmi); EXPORT_SYMBOL(reserve_lapic_nmi); EXPORT_SYMBOL(release_lapic_nmi); EXPORT_SYMBOL(disable_timer_nmi_watchdog); diff --git a/include/asm-i386/nmi.h b/include/asm-i386/nmi.h index 67d994799999..27fc9e6f630e 100644 --- a/include/asm-i386/nmi.h +++ b/include/asm-i386/nmi.h @@ -25,6 +25,13 @@ void set_nmi_callback(nmi_callback_t callback); */ void unset_nmi_callback(void); +extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); +extern int avail_to_resrv_perfctr_nmi(unsigned int); +extern int reserve_perfctr_nmi(unsigned int); +extern void release_perfctr_nmi(unsigned int); +extern int reserve_evntsel_nmi(unsigned int); +extern void release_evntsel_nmi(unsigned int); + extern void 
setup_apic_nmi_watchdog (void); extern int reserve_lapic_nmi(void); extern void release_lapic_nmi(void); diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h index efb45c894d76..62a784cb8f0c 100644 --- a/include/asm-x86_64/nmi.h +++ b/include/asm-x86_64/nmi.h @@ -56,7 +56,13 @@ extern int panic_on_timeout; extern int unknown_nmi_panic; extern int check_nmi_watchdog(void); - +extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); +extern int avail_to_resrv_perfctr_nmi(unsigned int); +extern int reserve_perfctr_nmi(unsigned int); +extern void release_perfctr_nmi(unsigned int); +extern int reserve_evntsel_nmi(unsigned int); +extern void release_evntsel_nmi(unsigned int); + extern void setup_apic_nmi_watchdog (void); extern int reserve_lapic_nmi(void); extern void release_lapic_nmi(void); -- cgit v1.2.3 From f2802e7f571c05f9a901b1f5bd144aa730ccc88e Mon Sep 17 00:00:00 2001 From: Don Zickus <dzickus@redhat.com> Date: Tue, 26 Sep 2006 10:52:26 +0200 Subject: [PATCH] Add SMP support on x86_64 to reservation framework This patch includes the changes to make the nmi watchdog on x86_64 SMP aware. A bunch of code was moved around to make it simpler to read. In addition, it is now possible to determine if a particular NMI was the result of the watchdog or not. This feature allows the kernel to filter out unknown NMIs easier. Signed-off-by: Don Zickus <dzickus@redhat.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/apic.c | 3 +- arch/x86_64/kernel/nmi.c | 426 +++++++++++++++++++++++++++++++--------------- include/asm-x86_64/nmi.h | 3 +- 3 files changed, 290 insertions(+), 142 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 2b8cef037a65..692e9b579743 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -479,8 +479,7 @@ void __cpuinit setup_local_APIC (void) } nmi_watchdog_default(); - if (nmi_watchdog == NMI_LOCAL_APIC) - setup_apic_nmi_watchdog(); + setup_apic_nmi_watchdog(NULL); apic_pm_activate(); } diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index b7a7c9973849..d42374a952d7 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -56,53 +56,29 @@ static unsigned int lapic_nmi_owner; #define LAPIC_NMI_RESERVED (1<<1) /* nmi_active: - * +1: the lapic NMI watchdog is active, but can be disabled - * 0: the lapic NMI watchdog has not been set up, and cannot + * >0: the lapic NMI watchdog is active, but can be disabled + * <0: the lapic NMI watchdog has not been set up, and cannot * be enabled - * -1: the lapic NMI watchdog is disabled, but can be enabled + * 0: the lapic NMI watchdog is disabled, but can be enabled */ -int nmi_active; /* oprofile uses this */ +atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ int panic_on_timeout; unsigned int nmi_watchdog = NMI_DEFAULT; static unsigned int nmi_hz = HZ; -static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ -static unsigned int nmi_p4_cccr_val; -/* Note that these events don't tick when the CPU idles. This means - the frequency varies with CPU load. 
*/ - -#define K7_EVNTSEL_ENABLE (1 << 22) -#define K7_EVNTSEL_INT (1 << 20) -#define K7_EVNTSEL_OS (1 << 17) -#define K7_EVNTSEL_USR (1 << 16) -#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 -#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING +struct nmi_watchdog_ctlblk { + int enabled; + u64 check_bit; + unsigned int cccr_msr; + unsigned int perfctr_msr; /* the MSR to reset in NMI handler */ + unsigned int evntsel_msr; /* the MSR to select the events to handle */ +}; +static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); -#define MSR_P4_MISC_ENABLE 0x1A0 -#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) -#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12) -#define MSR_P4_PERFCTR0 0x300 -#define MSR_P4_CCCR0 0x360 -#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) -#define P4_ESCR_OS (1<<3) -#define P4_ESCR_USR (1<<2) -#define P4_CCCR_OVF_PMI0 (1<<26) -#define P4_CCCR_OVF_PMI1 (1<<27) -#define P4_CCCR_THRESHOLD(N) ((N)<<20) -#define P4_CCCR_COMPLEMENT (1<<19) -#define P4_CCCR_COMPARE (1<<18) -#define P4_CCCR_REQUIRED (3<<16) -#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) -#define P4_CCCR_ENABLE (1<<12) -/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter - CRU_ESCR0 (with any non-null event selector) through a complemented - max threshold. [IA32-Vol3, Section 14.9.9] */ -#define MSR_P4_IQ_COUNTER0 0x30C -#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR) -#define P4_NMI_IQ_CCCR0 \ - (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ - P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) +/* local prototypes */ +static void stop_apic_nmi_watchdog(void *unused); +static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); /* converts an msr to an appropriate reservation bit */ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) @@ -241,6 +217,12 @@ int __init check_nmi_watchdog (void) int *counts; int cpu; + if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT)) + return 0; + + if (!atomic_read(&nmi_active)) + return 0; + counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); if (!counts) return -1; @@ -258,19 +240,22 @@ int __init check_nmi_watchdog (void) mdelay((10*1000)/nmi_hz); // wait 10 ticks for_each_online_cpu(cpu) { + if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled) + continue; if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) { - endflag = 1; printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", cpu, counts[cpu], cpu_pda(cpu)->__nmi_count); - nmi_active = 0; - lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; - nmi_perfctr_msr = 0; - kfree(counts); - return -1; + per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0; + atomic_dec(&nmi_active); } } + if (!atomic_read(&nmi_active)) { + kfree(counts); + atomic_set(&nmi_active, -1); + return -1; + } endflag = 1; printk("OK.\n"); @@ -297,8 +282,11 @@ int __init setup_nmi_watchdog(char *str) get_option(&str, &nmi); - if (nmi >= NMI_INVALID) + if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) return 0; + + if ((nmi == NMI_LOCAL_APIC) && (nmi_known_cpu() == 0)) + return 0; /* no lapic support */ nmi_watchdog = nmi; return 1; } @@ -307,31 +295,30 @@ __setup("nmi_watchdog=", setup_nmi_watchdog); static void disable_lapic_nmi_watchdog(void) { - if (nmi_active <= 0) + BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); + + if (atomic_read(&nmi_active) <= 0) return; - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - wrmsr(MSR_K7_EVNTSEL0, 0, 0); - break; - case X86_VENDOR_INTEL: - if (boot_cpu_data.x86 == 15) { - wrmsr(MSR_P4_IQ_CCCR0, 0, 0); - 
wrmsr(MSR_P4_CRU_ESCR0, 0, 0); - } - break; - } - nmi_active = -1; - /* tell do_nmi() and others that we're not active any more */ - nmi_watchdog = 0; + + on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); + + BUG_ON(atomic_read(&nmi_active) != 0); } static void enable_lapic_nmi_watchdog(void) { - if (nmi_active < 0) { - nmi_watchdog = NMI_LOCAL_APIC; - touch_nmi_watchdog(); - setup_apic_nmi_watchdog(); - } + BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); + + /* are we already enabled */ + if (atomic_read(&nmi_active) != 0) + return; + + /* are we lapic aware */ + if (nmi_known_cpu() <= 0) + return; + + on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); + touch_nmi_watchdog(); } int reserve_lapic_nmi(void) @@ -363,21 +350,24 @@ void release_lapic_nmi(void) void disable_timer_nmi_watchdog(void) { - if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0)) + BUG_ON(nmi_watchdog != NMI_IO_APIC); + + if (atomic_read(&nmi_active) <= 0) return; disable_irq(0); - unset_nmi_callback(); - nmi_active = -1; - nmi_watchdog = NMI_NONE; + on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); + + BUG_ON(atomic_read(&nmi_active) != 0); } void enable_timer_nmi_watchdog(void) { - if (nmi_active < 0) { - nmi_watchdog = NMI_IO_APIC; + BUG_ON(nmi_watchdog != NMI_IO_APIC); + + if (atomic_read(&nmi_active) == 0) { touch_nmi_watchdog(); - nmi_active = 1; + on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); enable_irq(0); } } @@ -388,7 +378,7 @@ static int nmi_pm_active; /* nmi_active before suspend */ static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) { - nmi_pm_active = nmi_active; + nmi_pm_active = atomic_read(&nmi_active); disable_lapic_nmi_watchdog(); return 0; } @@ -396,7 +386,7 @@ static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) static int lapic_nmi_resume(struct sys_device *dev) { if (nmi_pm_active > 0) - enable_lapic_nmi_watchdog(); + enable_lapic_nmi_watchdog(); return 0; } @@ -415,7 +405,13 @@ static int __init init_lapic_nmi_sysfs(void) { int error; - if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC) + /* should really be a BUG_ON but b/c this is an + * init call, it just doesn't work. -dcz + */ + if (nmi_watchdog != NMI_LOCAL_APIC) + return 0; + + if ( atomic_read(&nmi_active) < 0 ) return 0; error = sysdev_class_register(&nmi_sysclass); @@ -428,100 +424,232 @@ late_initcall(init_lapic_nmi_sysfs); #endif /* CONFIG_PM */ +/* + * Activate the NMI watchdog via the local APIC. + * Original code written by Keith Owens. + */ + +/* Note that these events don't tick when the CPU idles. This means + the frequency varies with CPU load. 
*/ + +#define K7_EVNTSEL_ENABLE (1 << 22) +#define K7_EVNTSEL_INT (1 << 20) +#define K7_EVNTSEL_OS (1 << 17) +#define K7_EVNTSEL_USR (1 << 16) +#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 +#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING + static int setup_k7_watchdog(void) { + unsigned int perfctr_msr, evntsel_msr; unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - nmi_perfctr_msr = MSR_K7_PERFCTR0; - - if (!reserve_perfctr_nmi(nmi_perfctr_msr)) + perfctr_msr = MSR_K7_PERFCTR0; + evntsel_msr = MSR_K7_EVNTSEL0; + if (!reserve_perfctr_nmi(perfctr_msr)) goto fail; - if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0)) + if (!reserve_evntsel_nmi(evntsel_msr)) goto fail1; /* Simulator may not support it */ - if (checking_wrmsrl(MSR_K7_EVNTSEL0, 0UL)) + if (checking_wrmsrl(evntsel_msr, 0UL)) goto fail2; - wrmsrl(MSR_K7_PERFCTR0, 0UL); + wrmsrl(perfctr_msr, 0UL); evntsel = K7_EVNTSEL_INT | K7_EVNTSEL_OS | K7_EVNTSEL_USR | K7_NMI_EVENT; - wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); - wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz * 1000 / nmi_hz)); + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= K7_EVNTSEL_ENABLE; - wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + wd->check_bit = 1ULL<<63; return 1; fail2: - release_evntsel_nmi(MSR_K7_EVNTSEL0); + release_evntsel_nmi(evntsel_msr); fail1: - release_perfctr_nmi(nmi_perfctr_msr); + release_perfctr_nmi(perfctr_msr); fail: return 0; } +static void stop_k7_watchdog(void) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + wrmsr(wd->evntsel_msr, 0, 0); + + release_evntsel_nmi(wd->evntsel_msr); + release_perfctr_nmi(wd->perfctr_msr); +} + +/* Note that these events don't tick when the CPU idles. This means + the frequency varies with CPU load. */ + +#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) +#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) +#define P4_ESCR_OS (1<<3) +#define P4_ESCR_USR (1<<2) +#define P4_CCCR_OVF_PMI0 (1<<26) +#define P4_CCCR_OVF_PMI1 (1<<27) +#define P4_CCCR_THRESHOLD(N) ((N)<<20) +#define P4_CCCR_COMPLEMENT (1<<19) +#define P4_CCCR_COMPARE (1<<18) +#define P4_CCCR_REQUIRED (3<<16) +#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) +#define P4_CCCR_ENABLE (1<<12) +#define P4_CCCR_OVF (1<<31) +/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter + CRU_ESCR0 (with any non-null event selector) through a complemented + max threshold. 
[IA32-Vol3, Section 14.9.9] */ static int setup_p4_watchdog(void) { + unsigned int perfctr_msr, evntsel_msr, cccr_msr; + unsigned int evntsel, cccr_val; unsigned int misc_enable, dummy; + unsigned int ht_num; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy); + rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy); if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) return 0; - nmi_perfctr_msr = MSR_P4_IQ_COUNTER0; - nmi_p4_cccr_val = P4_NMI_IQ_CCCR0; #ifdef CONFIG_SMP - if (smp_num_siblings == 2) - nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1; + /* detect which hyperthread we are on */ + if (smp_num_siblings == 2) { + unsigned int ebx, apicid; + + ebx = cpuid_ebx(1); + apicid = (ebx >> 24) & 0xff; + ht_num = apicid & 1; + } else #endif + ht_num = 0; + + /* performance counters are shared resources + * assign each hyperthread its own set + * (re-use the ESCR0 register, seems safe + * and keeps the cccr_val the same) + */ + if (!ht_num) { + /* logical cpu 0 */ + perfctr_msr = MSR_P4_IQ_PERFCTR0; + evntsel_msr = MSR_P4_CRU_ESCR0; + cccr_msr = MSR_P4_IQ_CCCR0; + cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); + } else { + /* logical cpu 1 */ + perfctr_msr = MSR_P4_IQ_PERFCTR1; + evntsel_msr = MSR_P4_CRU_ESCR0; + cccr_msr = MSR_P4_IQ_CCCR1; + cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); + } - if (!reserve_perfctr_nmi(nmi_perfctr_msr)) + if (!reserve_perfctr_nmi(perfctr_msr)) goto fail; - if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0)) + if (!reserve_evntsel_nmi(evntsel_msr)) goto fail1; - wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0); - wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0); - Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz * 1000UL / nmi_hz)); - wrmsrl(MSR_P4_IQ_COUNTER0, -((u64)cpu_khz * 1000 / nmi_hz)); + evntsel = P4_ESCR_EVENT_SELECT(0x3F) + | P4_ESCR_OS + | P4_ESCR_USR; + + cccr_val |= P4_CCCR_THRESHOLD(15) + | P4_CCCR_COMPLEMENT + | P4_CCCR_COMPARE + | P4_CCCR_REQUIRED; + + wrmsr(evntsel_msr, evntsel, 0); + wrmsr(cccr_msr, cccr_val, 0); + wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); apic_write(APIC_LVTPC, APIC_DM_NMI); - wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); + cccr_val |= P4_CCCR_ENABLE; + wrmsr(cccr_msr, cccr_val, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = cccr_msr; + wd->check_bit = 1ULL<<39; return 1; fail1: - release_perfctr_nmi(nmi_perfctr_msr); + release_perfctr_nmi(perfctr_msr); fail: return 0; } -void setup_apic_nmi_watchdog(void) +static void stop_p4_watchdog(void) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (boot_cpu_data.x86 != 15) - return; - if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) - return; - if (!setup_k7_watchdog()) - return; - break; - case X86_VENDOR_INTEL: - if (boot_cpu_data.x86 != 15) - return; - if (!setup_p4_watchdog()) + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + wrmsr(wd->cccr_msr, 0, 0); + wrmsr(wd->evntsel_msr, 0, 0); + + release_evntsel_nmi(wd->evntsel_msr); + release_perfctr_nmi(wd->perfctr_msr); +} + +void setup_apic_nmi_watchdog(void *unused) +{ + /* only support LOCAL and IO APICs for now */ + if ((nmi_watchdog != NMI_LOCAL_APIC) && + (nmi_watchdog != NMI_IO_APIC)) + return; + + if (nmi_watchdog == NMI_LOCAL_APIC) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) + return; + if (!setup_k7_watchdog()) + return; + break; + case X86_VENDOR_INTEL: + if 
(!setup_p4_watchdog()) + return; + break; + default: return; - break; + } + } + __get_cpu_var(nmi_watchdog_ctlblk.enabled) = 1; + atomic_inc(&nmi_active); +} - default: - return; +static void stop_apic_nmi_watchdog(void *unused) +{ + /* only support LOCAL and IO APICs for now */ + if ((nmi_watchdog != NMI_LOCAL_APIC) && + (nmi_watchdog != NMI_IO_APIC)) + return; + + if (nmi_watchdog == NMI_LOCAL_APIC) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) + return; + stop_k7_watchdog(); + break; + case X86_VENDOR_INTEL: + stop_p4_watchdog(); + break; + default: + return; + } } - lapic_nmi_owner = LAPIC_NMI_WATCHDOG; - nmi_active = 1; + __get_cpu_var(nmi_watchdog_ctlblk.enabled) = 0; + atomic_dec(&nmi_active); } /* @@ -558,50 +686,70 @@ void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { int sum; int touched = 0; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + u64 dummy; + + /* check for other users first */ + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) { + touched = 1; + } sum = read_pda(apic_timer_irqs); if (__get_cpu_var(nmi_touch)) { __get_cpu_var(nmi_touch) = 0; touched = 1; } + #ifdef CONFIG_X86_MCE /* Could check oops_in_progress here too, but it's safer not too */ if (atomic_read(&mce_entry) > 0) touched = 1; #endif + /* if the apic timer isn't firing, this cpu isn't doing much */ if (!touched && __get_cpu_var(last_irq_sum) == sum) { /* * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) { - local_set(&__get_cpu_var(alert_counter), 0); - return; - } + if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs); - } } else { __get_cpu_var(last_irq_sum) = sum; local_set(&__get_cpu_var(alert_counter), 0); } - if (nmi_perfctr_msr) { - if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) { - /* - * P4 quirks: - * - An overflown perfctr will assert its interrupt - * until the OVF flag in its CCCR is cleared. - * - LVTPC is masked on interrupt and must be - * unmasked by the LVTPC handler. - */ - wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); - apic_write(APIC_LVTPC, APIC_DM_NMI); - } - wrmsrl(nmi_perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); + + /* see if the nmi watchdog went off */ + if (wd->enabled) { + if (nmi_watchdog == NMI_LOCAL_APIC) { + rdmsrl(wd->perfctr_msr, dummy); + if (dummy & wd->check_bit){ + /* this wasn't a watchdog timer interrupt */ + goto done; + } + + /* only Intel uses the cccr msr */ + if (wd->cccr_msr != 0) { + /* + * P4 quirks: + * - An overflown perfctr will assert its interrupt + * until the OVF flag in its CCCR is cleared. + * - LVTPC is masked on interrupt and must be + * unmasked by the LVTPC handler. 
+ */ + rdmsrl(wd->cccr_msr, dummy); + dummy &= ~P4_CCCR_OVF; + wrmsrl(wd->cccr_msr, dummy); + apic_write(APIC_LVTPC, APIC_DM_NMI); + } + /* start the cycle over again */ + wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); + } } +done: + return; } static __kprobes int dummy_nmi_callback(struct pt_regs * regs, int cpu) diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h index 62a784cb8f0c..5918136fd853 100644 --- a/include/asm-x86_64/nmi.h +++ b/include/asm-x86_64/nmi.h @@ -63,7 +63,7 @@ extern void release_perfctr_nmi(unsigned int); extern int reserve_evntsel_nmi(unsigned int); extern void release_evntsel_nmi(unsigned int); -extern void setup_apic_nmi_watchdog (void); +extern void setup_apic_nmi_watchdog (void *); extern int reserve_lapic_nmi(void); extern void release_lapic_nmi(void); extern void disable_timer_nmi_watchdog(void); @@ -73,6 +73,7 @@ extern void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason); extern void nmi_watchdog_default(void); extern int setup_nmi_watchdog(char *); +extern atomic_t nmi_active; extern unsigned int nmi_watchdog; #define NMI_DEFAULT -1 #define NMI_NONE 0 -- cgit v1.2.3 From 3adbbcce9a49b900d4cc118cdccfdefa78bf1afb Mon Sep 17 00:00:00 2001 From: Don Zickus <dzickus@redhat.com> Date: Tue, 26 Sep 2006 10:52:26 +0200 Subject: [PATCH] x86: Cleanup NMI interrupt path This patch cleans up the NMI interrupt path. Instead of being gated by if the 'nmi callback' is set, the interrupt handler now calls everyone who is registered on the die_chain and additionally checks the nmi watchdog, reseting it if enabled. This allows more subsystems to hook into the NMI if they need to (without being block by set_nmi_callback). Signed-off-by: Don Zickus <dzickus@redhat.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/i386/kernel/nmi.c | 16 +++++++++++++--- arch/i386/kernel/traps.c | 24 +++++++++++------------- arch/x86_64/kernel/nmi.c | 26 +++++++++++++++++++------- arch/x86_64/kernel/traps.c | 8 ++++---- include/asm-i386/nmi.h | 2 +- include/asm-x86_64/nmi.h | 10 +++++++++- 6 files changed, 57 insertions(+), 29 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index d88004343034..bd96ea4f2942 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -781,7 +781,7 @@ EXPORT_SYMBOL(touch_nmi_watchdog); extern void die_nmi(struct pt_regs *, const char *msg); -void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) +int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) { /* @@ -794,10 +794,12 @@ void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) int cpu = smp_processor_id(); struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); u64 dummy; + int rc=0; /* check for other users first */ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) { + rc = 1; touched = 1; } @@ -850,10 +852,18 @@ void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) } /* start the cycle over again */ write_watchdog_counter(wd->perfctr_msr, NULL); - } + rc = 1; + } else if (nmi_watchdog == NMI_IO_APIC) { + /* don't know how to accurately check for this. + * just assume it was a watchdog timer interrupt + * This matches the old behaviour. 
+ */ + rc = 1; + } else + printk(KERN_WARNING "Unknown enabled NMI hardware?!\n"); } done: - return; + return rc; } #ifdef CONFIG_SYSCTL diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 3a07b2677e2a..282f0bd40dfd 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -706,6 +706,13 @@ void die_nmi (struct pt_regs *regs, const char *msg) do_exit(SIGSEGV); } +static int dummy_nmi_callback(struct pt_regs * regs, int cpu) +{ + return 0; +} + +static nmi_callback_t nmi_callback = dummy_nmi_callback; + static void default_do_nmi(struct pt_regs * regs) { unsigned char reason = 0; @@ -723,12 +730,11 @@ static void default_do_nmi(struct pt_regs * regs) * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. */ - if (nmi_watchdog) { - nmi_watchdog_tick(regs, reason); + if (nmi_watchdog_tick(regs, reason)) return; - } #endif - unknown_nmi_error(reason, regs); + if (!rcu_dereference(nmi_callback)(regs, smp_processor_id())) + unknown_nmi_error(reason, regs); return; } if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) @@ -744,13 +750,6 @@ static void default_do_nmi(struct pt_regs * regs) reassert_nmi(); } -static int dummy_nmi_callback(struct pt_regs * regs, int cpu) -{ - return 0; -} - -static nmi_callback_t nmi_callback = dummy_nmi_callback; - fastcall void do_nmi(struct pt_regs * regs, long error_code) { int cpu; @@ -761,8 +760,7 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code) ++nmi_count(cpu); - if (!rcu_dereference(nmi_callback)(regs, cpu)) - default_do_nmi(regs); + default_do_nmi(regs); nmi_exit(); } diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index d42374a952d7..f6b881b23a70 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -682,16 +682,18 @@ void touch_nmi_watchdog (void) touch_softlockup_watchdog(); } -void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) +int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { int sum; int touched = 0; struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); u64 dummy; + int rc=0; /* check for other users first */ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) { + rc = 1; touched = 1; } @@ -746,10 +748,18 @@ void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) } /* start the cycle over again */ wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); - } + rc = 1; + } else if (nmi_watchdog == NMI_IO_APIC) { + /* don't know how to accurately check for this. + * just assume it was a watchdog timer interrupt + * This matches the old behaviour. 
+ */ + rc = 1; + } else + printk(KERN_WARNING "Unknown enabled NMI hardware?!\n"); } done: - return; + return rc; } static __kprobes int dummy_nmi_callback(struct pt_regs * regs, int cpu) @@ -761,15 +771,17 @@ static nmi_callback_t nmi_callback = dummy_nmi_callback; asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) { - int cpu = safe_smp_processor_id(); - nmi_enter(); add_pda(__nmi_count,1); - if (!rcu_dereference(nmi_callback)(regs, cpu)) - default_do_nmi(regs); + default_do_nmi(regs); nmi_exit(); } +int do_nmi_callback(struct pt_regs * regs, int cpu) +{ + return rcu_dereference(nmi_callback)(regs, cpu); +} + void set_nmi_callback(nmi_callback_t callback) { vmalloc_sync_all(); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index b1249774d1e8..42bc070fdf11 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -781,12 +781,12 @@ asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs) * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. */ - if (nmi_watchdog > 0) { - nmi_watchdog_tick(regs,reason); + if (nmi_watchdog_tick(regs,reason)) return; - } + if (!do_nmi_callback(regs,cpu)) #endif - unknown_nmi_error(reason, regs); + unknown_nmi_error(reason, regs); + return; } if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) diff --git a/include/asm-i386/nmi.h b/include/asm-i386/nmi.h index 4cda6801ecb8..da0e0b4e9139 100644 --- a/include/asm-i386/nmi.h +++ b/include/asm-i386/nmi.h @@ -37,7 +37,7 @@ extern int reserve_lapic_nmi(void); extern void release_lapic_nmi(void); extern void disable_timer_nmi_watchdog(void); extern void enable_timer_nmi_watchdog(void); -extern void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason); +extern int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason); extern atomic_t nmi_active; extern unsigned int nmi_watchdog; diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h index 5918136fd853..8f02a2a416e6 100644 --- a/include/asm-x86_64/nmi.h +++ b/include/asm-x86_64/nmi.h @@ -26,6 +26,14 @@ void set_nmi_callback(nmi_callback_t callback); */ void unset_nmi_callback(void); +/** + * do_nmi_callback + * + * Check to see if a callback exists and execute it. Return 1 + * if the handler exists and was handled successfully. + */ +int do_nmi_callback(struct pt_regs *regs, int cpu); + #ifdef CONFIG_PM /** Replace the PM callback routine for NMI. */ @@ -68,7 +76,7 @@ extern int reserve_lapic_nmi(void); extern void release_lapic_nmi(void); extern void disable_timer_nmi_watchdog(void); extern void enable_timer_nmi_watchdog(void); -extern void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason); +extern int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason); extern void nmi_watchdog_default(void); extern int setup_nmi_watchdog(char *); -- cgit v1.2.3 From 1d001df19d5323e642ba8ac821c713675ebccd82 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:26 +0200 Subject: [PATCH] Add TIF_RESTORE_SIGMASK We need TIF_RESTORE_SIGMASK in order to support ppoll() and pselect() system calls. This patch originally came from Andi, and was based heavily on David Howells' implementation of same on i386. I fixed a typo which was causing do_signal() to use the wrong signal mask. 
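As an aside, the hunks below are easier to follow once the underlying TIF_RESTORE_SIGMASK idiom is seen in one piece. The following is only a condensation of the sys32_sigsuspend() change in this patch, not code the patch adds; example_sigsuspend is an illustrative name.

/* Minimal sketch of the TIF_RESTORE_SIGMASK idiom, condensed from the
 * sys32_sigsuspend() hunk below (illustrative only). */
asmlinkage long example_sigsuspend(old_sigset_t mask)
{
        mask &= _BLOCKABLE;
        spin_lock_irq(&current->sighand->siglock);
        /* stash the caller's mask where the signal code can find it */
        current->saved_sigmask = current->blocked;
        siginitset(&current->blocked, mask);
        recalc_sigpending();
        spin_unlock_irq(&current->sighand->siglock);

        current->state = TASK_INTERRUPTIBLE;
        schedule();                     /* sleep until a signal arrives */

        /* ask the delivery path to restore saved_sigmask: either the
         * signal frame carries it and sigreturn puts it back, or
         * do_signal() restores it directly when no handler runs */
        set_thread_flag(TIF_RESTORE_SIGMASK);
        return -ERESTARTNOHAND;
}

This deferred-restore flag is what later lets ppoll()/pselect() swap the signal mask atomically for the duration of a single call.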
Signed-off-by: David Woodhouse <dwmw2@infradead.org> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/ia32/ia32_signal.c | 28 ++++++-------- arch/x86_64/kernel/signal.c | 82 +++++++++++++++++----------------------- include/asm-x86_64/signal.h | 4 -- include/asm-x86_64/thread_info.h | 2 + include/asm-x86_64/unistd.h | 1 + 5 files changed, 49 insertions(+), 68 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c index 25e5ca22204c..549de439fb2d 100644 --- a/arch/x86_64/ia32/ia32_signal.c +++ b/arch/x86_64/ia32/ia32_signal.c @@ -113,25 +113,19 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) } asmlinkage long -sys32_sigsuspend(int history0, int history1, old_sigset_t mask, - struct pt_regs *regs) +sys32_sigsuspend(int history0, int history1, old_sigset_t mask) { - sigset_t saveset; - mask &= _BLOCKABLE; spin_lock_irq(¤t->sighand->siglock); - saveset = current->blocked; + current->saved_sigmask = current->blocked; siginitset(¤t->blocked, mask); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - regs->rax = -EINTR; - while (1) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - if (do_signal(regs, &saveset)) - return -EINTR; - } + current->state = TASK_INTERRUPTIBLE; + schedule(); + set_thread_flag(TIF_RESTORE_SIGMASK); + return -ERESTARTNOHAND; } asmlinkage long @@ -508,11 +502,11 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return 1; + return 0; give_sigsegv: force_sigsegv(sig, current); - return 0; + return -EFAULT; } int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, @@ -595,7 +589,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->ss = __USER32_DS; set_fs(USER_DS); - regs->eflags &= ~TF_MASK; + regs->eflags &= ~TF_MASK; if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); @@ -604,9 +598,9 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return 1; + return 0; give_sigsegv: force_sigsegv(sig, current); - return 0; + return -EFAULT; } diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 28161170fb0a..7f58bc9a056d 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -37,37 +37,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, int ia32_setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, struct pt_regs * regs); -asmlinkage long -sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, struct pt_regs *regs) -{ - sigset_t saveset, newset; - - /* XXX: Don't preclude handling different sized sigset_t's. 
*/ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&newset, unewset, sizeof(newset))) - return -EFAULT; - sigdelsetmask(&newset, ~_BLOCKABLE); - - spin_lock_irq(¤t->sighand->siglock); - saveset = current->blocked; - current->blocked = newset; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); -#ifdef DEBUG_SIG - printk("rt_sigsuspend savset(%lx) newset(%lx) regs(%p) rip(%lx)\n", - saveset, newset, regs, regs->rip); -#endif - regs->rax = -EINTR; - while (1) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - if (do_signal(regs, &saveset)) - return -EINTR; - } -} - asmlinkage long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) @@ -341,11 +310,11 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return 1; + return 0; give_sigsegv: force_sigsegv(sig, current); - return 0; + return -EFAULT; } /* @@ -408,7 +377,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, #endif ret = setup_rt_frame(sig, ka, info, oldset, regs); - if (ret) { + if (ret == 0) { spin_lock_irq(¤t->sighand->siglock); sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); if (!(ka->sa.sa_flags & SA_NODEFER)) @@ -425,11 +394,12 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ -int do_signal(struct pt_regs *regs, sigset_t *oldset) +static void do_signal(struct pt_regs *regs) { struct k_sigaction ka; siginfo_t info; int signr; + sigset_t *oldset; /* * We want the common case to go fast, which @@ -438,9 +408,11 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) * if so. */ if (!user_mode(regs)) - return 1; + return; - if (!oldset) + if (test_thread_flag(TIF_RESTORE_SIGMASK)) + oldset = ¤t->saved_sigmask; + else oldset = ¤t->blocked; signr = get_signal_to_deliver(&info, &ka, regs, NULL); @@ -454,30 +426,46 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) set_debugreg(current->thread.debugreg7, 7); /* Whee! Actually deliver the signal. */ - return handle_signal(signr, &info, &ka, oldset, regs); + if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { + /* a signal was successfully delivered; the saved + * sigmask will have been stored in the signal frame, + * and will be restored by sigreturn, so we can simply + * clear the TIF_RESTORE_SIGMASK flag */ + clear_thread_flag(TIF_RESTORE_SIGMASK); + } + return; } /* Did we come from a system call? */ if ((long)regs->orig_rax >= 0) { /* Restart the system call - no handlers present */ long res = regs->rax; - if (res == -ERESTARTNOHAND || - res == -ERESTARTSYS || - res == -ERESTARTNOINTR) { + switch (res) { + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: regs->rax = regs->orig_rax; regs->rip -= 2; - } - if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) { + break; + case -ERESTART_RESTARTBLOCK: regs->rax = test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall; regs->rip -= 2; + break; } } - return 0; + + /* if there's no signal to deliver, we just put the saved sigmask + back. 
*/ + if (test_thread_flag(TIF_RESTORE_SIGMASK)) { + clear_thread_flag(TIF_RESTORE_SIGMASK); + sigprocmask(SIG_SETMASK, ¤t->saved_sigmask, NULL); + } } -void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_flags) +void +do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { #ifdef DEBUG_SIG printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n", @@ -491,8 +479,8 @@ void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_ } /* deal with pending signal delivery */ - if (thread_info_flags & _TIF_SIGPENDING) - do_signal(regs,oldset); + if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) + do_signal(regs); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) diff --git a/include/asm-x86_64/signal.h b/include/asm-x86_64/signal.h index 3ede2a61973a..4581f978b299 100644 --- a/include/asm-x86_64/signal.h +++ b/include/asm-x86_64/signal.h @@ -24,10 +24,6 @@ typedef struct { } sigset_t; -struct pt_regs; -asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); - - #else /* Here we must cater to libcs that poke about in kernel headers. */ diff --git a/include/asm-x86_64/thread_info.h b/include/asm-x86_64/thread_info.h index 2029b00351f3..790c512a4369 100644 --- a/include/asm-x86_64/thread_info.h +++ b/include/asm-x86_64/thread_info.h @@ -114,6 +114,7 @@ static inline struct thread_info *stack_thread_info(void) #define TIF_IRET 5 /* force IRET */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ +#define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */ /* 16 free */ #define TIF_IA32 17 /* 32bit process */ #define TIF_FORK 18 /* ret_from_fork */ @@ -128,6 +129,7 @@ static inline struct thread_info *stack_thread_info(void) #define _TIF_IRET (1<<TIF_IRET) #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1<<TIF_SECCOMP) +#define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) #define _TIF_IA32 (1<<TIF_IA32) #define _TIF_FORK (1<<TIF_FORK) #define _TIF_ABI_PENDING (1<<TIF_ABI_PENDING) diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index 80fd48e84bbb..f266de294003 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -658,6 +658,7 @@ do { \ #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION +#define __ARCH_WANT_SYS_RT_SIGSUSPEND #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_COMPAT_SYS_TIME -- cgit v1.2.3 From 957dc87c1bd849440f0eef27e2ade67387001e13 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] Add ppoll/pselect syscalls Needed TIF_RESTORE_SIGMASK first Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/ia32/ia32entry.S | 4 ++-- include/asm-x86_64/unistd.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 5d4a7d125ed0..30ed0f6f4a2b 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -703,8 +703,8 @@ ia32_sys_call_table: .quad sys_readlinkat /* 305 */ .quad sys_fchmodat .quad sys_faccessat - .quad quiet_ni_syscall /* pselect6 for now */ - .quad quiet_ni_syscall /* ppoll for now */ + .quad compat_sys_pselect6 + .quad compat_sys_ppoll .quad sys_unshare /* 310 */ .quad compat_sys_set_robust_list .quad compat_sys_get_robust_list diff --git a/include/asm-x86_64/unistd.h 
b/include/asm-x86_64/unistd.h index f266de294003..eeb98c168e98 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -600,9 +600,9 @@ __SYSCALL(__NR_fchmodat, sys_fchmodat) #define __NR_faccessat 269 __SYSCALL(__NR_faccessat, sys_faccessat) #define __NR_pselect6 270 -__SYSCALL(__NR_pselect6, sys_ni_syscall) /* for now */ +__SYSCALL(__NR_pselect6, sys_pselect6) #define __NR_ppoll 271 -__SYSCALL(__NR_ppoll, sys_ni_syscall) /* for now */ +__SYSCALL(__NR_ppoll, sys_ppoll) #define __NR_unshare 272 __SYSCALL(__NR_unshare, sys_unshare) #define __NR_set_robust_list 273 -- cgit v1.2.3 From 2fbe7b25c8edaf2d10e6c1a4cc9f8afe714c4764 Mon Sep 17 00:00:00 2001 From: Don Zickus <dzickus@redhat.com> Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] i386/x86-64: Remove un/set_nmi_callback and reserve/release_lapic_nmi functions Removes the un/set_nmi_callback and reserve/release_lapic_nmi functions as they are no longer needed. The various subsystems are modified to register with the die_notifier instead. Also includes compile fixes by Andrew Morton. Signed-off-by: Don Zickus <dzickus@redhat.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/i386/kernel/crash.c | 20 +++++++- arch/i386/kernel/nmi.c | 85 ++++--------------------------- arch/i386/kernel/traps.c | 23 +-------- arch/i386/oprofile/nmi_int.c | 47 ++++++++++------- arch/i386/oprofile/nmi_timer_int.c | 33 +++++++++--- arch/x86_64/kernel/crash.c | 20 +++++++- arch/x86_64/kernel/nmi.c | 102 +++---------------------------------- include/asm-i386/nmi.h | 21 ++------ include/asm-x86_64/nmi.h | 21 -------- kernel/sysctl.c | 4 +- 10 files changed, 116 insertions(+), 260 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c index 5b96f038367f..736c76d6b31d 100644 --- a/arch/i386/kernel/crash.c +++ b/arch/i386/kernel/crash.c @@ -22,6 +22,8 @@ #include <asm/nmi.h> #include <asm/hw_irq.h> #include <asm/apic.h> +#include <asm/kdebug.h> + #include <mach_ipi.h> @@ -93,9 +95,18 @@ static void crash_save_self(struct pt_regs *regs) #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static atomic_t waiting_for_crash_ipi; -static int crash_nmi_callback(struct pt_regs *regs, int cpu) +static int crash_nmi_callback(struct notifier_block *self, + unsigned long val, void *data) { + struct pt_regs *regs; struct pt_regs fixed_regs; + int cpu; + + if (val != DIE_NMI) + return NOTIFY_OK; + + regs = ((struct die_args *)data)->regs; + cpu = raw_smp_processor_id(); /* Don't do anything if this handler is invoked on crashing cpu. * Otherwise, system will completely hang. Crashing cpu can get @@ -125,13 +136,18 @@ static void smp_send_nmi_allbutself(void) send_IPI_allbutself(NMI_VECTOR); } +static struct notifier_block crash_nmi_nb = { + .notifier_call = crash_nmi_callback, +}; + static void nmi_shootdown_cpus(void) { unsigned long msecs; atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); /* Would it be better to replace the trap vector here? */ - set_nmi_callback(crash_nmi_callback); + if (register_die_notifier(&crash_nmi_nb)) + return; /* return what? 
*/ /* Ensure the new callback function is set before sending * out the NMI */ diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index bd96ea4f2942..acd3fdea2a21 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -42,20 +42,6 @@ static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]); */ #define NMI_MAX_COUNTER_BITS 66 -/* - * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: - * - it may be reserved by some other driver, or not - * - when not reserved by some other driver, it may be used for - * the NMI watchdog, or not - * - * This is maintained separately from nmi_active because the NMI - * watchdog may also be driven from the I/O APIC timer. - */ -static DEFINE_SPINLOCK(lapic_nmi_owner_lock); -static unsigned int lapic_nmi_owner; -#define LAPIC_NMI_WATCHDOG (1<<0) -#define LAPIC_NMI_RESERVED (1<<1) - /* nmi_active: * >0: the lapic NMI watchdog is active, but can be disabled * <0: the lapic NMI watchdog has not been set up, and cannot @@ -325,33 +311,6 @@ static void enable_lapic_nmi_watchdog(void) touch_nmi_watchdog(); } -int reserve_lapic_nmi(void) -{ - unsigned int old_owner; - - spin_lock(&lapic_nmi_owner_lock); - old_owner = lapic_nmi_owner; - lapic_nmi_owner |= LAPIC_NMI_RESERVED; - spin_unlock(&lapic_nmi_owner_lock); - if (old_owner & LAPIC_NMI_RESERVED) - return -EBUSY; - if (old_owner & LAPIC_NMI_WATCHDOG) - disable_lapic_nmi_watchdog(); - return 0; -} - -void release_lapic_nmi(void) -{ - unsigned int new_owner; - - spin_lock(&lapic_nmi_owner_lock); - new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED; - lapic_nmi_owner = new_owner; - spin_unlock(&lapic_nmi_owner_lock); - if (new_owner & LAPIC_NMI_WATCHDOG) - enable_lapic_nmi_watchdog(); -} - void disable_timer_nmi_watchdog(void) { BUG_ON(nmi_watchdog != NMI_IO_APIC); @@ -866,6 +825,15 @@ done: return rc; } +int do_nmi_callback(struct pt_regs * regs, int cpu) +{ +#ifdef CONFIG_SYSCTL + if (unknown_nmi_panic) + return unknown_nmi_panic_callback(regs, cpu); +#endif + return 0; +} + #ifdef CONFIG_SYSCTL static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) @@ -873,37 +841,8 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) unsigned char reason = get_nmi_reason(); char buf[64]; - if (!(reason & 0xc0)) { - sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(regs, buf); - } - return 0; -} - -/* - * proc handler for /proc/sys/kernel/unknown_nmi_panic - */ -int proc_unknown_nmi_panic(ctl_table *table, int write, struct file *file, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int old_state; - - old_state = unknown_nmi_panic; - proc_dointvec(table, write, file, buffer, length, ppos); - if (!!old_state == !!unknown_nmi_panic) - return 0; - - if (unknown_nmi_panic) { - if (reserve_lapic_nmi() < 0) { - unknown_nmi_panic = 0; - return -EBUSY; - } else { - set_nmi_callback(unknown_nmi_panic_callback); - } - } else { - release_lapic_nmi(); - unset_nmi_callback(); - } + sprintf(buf, "NMI received for unknown reason %02x\n", reason); + die_nmi(regs, buf); return 0; } @@ -917,7 +856,5 @@ EXPORT_SYMBOL(reserve_perfctr_nmi); EXPORT_SYMBOL(release_perfctr_nmi); EXPORT_SYMBOL(reserve_evntsel_nmi); EXPORT_SYMBOL(release_evntsel_nmi); -EXPORT_SYMBOL(reserve_lapic_nmi); -EXPORT_SYMBOL(release_lapic_nmi); EXPORT_SYMBOL(disable_timer_nmi_watchdog); EXPORT_SYMBOL(enable_timer_nmi_watchdog); diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 282f0bd40dfd..7db664d0b25c 100644 --- a/arch/i386/kernel/traps.c +++ 
b/arch/i386/kernel/traps.c @@ -706,13 +706,6 @@ void die_nmi (struct pt_regs *regs, const char *msg) do_exit(SIGSEGV); } -static int dummy_nmi_callback(struct pt_regs * regs, int cpu) -{ - return 0; -} - -static nmi_callback_t nmi_callback = dummy_nmi_callback; - static void default_do_nmi(struct pt_regs * regs) { unsigned char reason = 0; @@ -732,9 +725,10 @@ static void default_do_nmi(struct pt_regs * regs) */ if (nmi_watchdog_tick(regs, reason)) return; + if (!do_nmi_callback(regs, smp_processor_id())) #endif - if (!rcu_dereference(nmi_callback)(regs, smp_processor_id())) unknown_nmi_error(reason, regs); + return; } if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) @@ -765,19 +759,6 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code) nmi_exit(); } -void set_nmi_callback(nmi_callback_t callback) -{ - vmalloc_sync_all(); - rcu_assign_pointer(nmi_callback, callback); -} -EXPORT_SYMBOL_GPL(set_nmi_callback); - -void unset_nmi_callback(void) -{ - nmi_callback = dummy_nmi_callback; -} -EXPORT_SYMBOL_GPL(unset_nmi_callback); - #ifdef CONFIG_KPROBES fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code) { diff --git a/arch/i386/oprofile/nmi_int.c b/arch/i386/oprofile/nmi_int.c index 8710ca081b1e..b3610188bcf0 100644 --- a/arch/i386/oprofile/nmi_int.c +++ b/arch/i386/oprofile/nmi_int.c @@ -17,14 +17,15 @@ #include <asm/nmi.h> #include <asm/msr.h> #include <asm/apic.h> +#include <asm/kdebug.h> #include "op_counter.h" #include "op_x86_model.h" - + static struct op_x86_model_spec const * model; static struct op_msrs cpu_msrs[NR_CPUS]; static unsigned long saved_lvtpc[NR_CPUS]; - + static int nmi_start(void); static void nmi_stop(void); @@ -82,13 +83,24 @@ static void exit_driverfs(void) #define exit_driverfs() do { } while (0) #endif /* CONFIG_PM */ - -static int nmi_callback(struct pt_regs * regs, int cpu) +int profile_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) { - return model->check_ctrs(regs, &cpu_msrs[cpu]); + struct die_args *args = (struct die_args *)data; + int ret = NOTIFY_DONE; + int cpu = smp_processor_id(); + + switch(val) { + case DIE_NMI: + if (model->check_ctrs(args->regs, &cpu_msrs[cpu])) + ret = NOTIFY_STOP; + break; + default: + break; + } + return ret; } - - + static void nmi_cpu_save_registers(struct op_msrs * msrs) { unsigned int const nr_ctrs = model->num_counters; @@ -174,27 +186,29 @@ static void nmi_cpu_setup(void * dummy) apic_write(APIC_LVTPC, APIC_DM_NMI); } +static struct notifier_block profile_exceptions_nb = { + .notifier_call = profile_exceptions_notify, + .next = NULL, + .priority = 0 +}; static int nmi_setup(void) { + int err=0; + if (!allocate_msrs()) return -ENOMEM; - /* We walk a thin line between law and rape here. - * We need to be careful to install our NMI handler - * without actually triggering any NMIs as this will - * break the core code horrifically. 
- */ - if (reserve_lapic_nmi() < 0) { + if ((err = register_die_notifier(&profile_exceptions_nb))){ free_msrs(); - return -EBUSY; + return err; } + /* We need to serialize save and setup for HT because the subset * of msrs are distinct for save and setup operations */ on_each_cpu(nmi_save_registers, NULL, 0, 1); on_each_cpu(nmi_cpu_setup, NULL, 0, 1); - set_nmi_callback(nmi_callback); nmi_enabled = 1; return 0; } @@ -250,8 +264,7 @@ static void nmi_shutdown(void) { nmi_enabled = 0; on_each_cpu(nmi_cpu_shutdown, NULL, 0, 1); - unset_nmi_callback(); - release_lapic_nmi(); + unregister_die_notifier(&profile_exceptions_nb); free_msrs(); } diff --git a/arch/i386/oprofile/nmi_timer_int.c b/arch/i386/oprofile/nmi_timer_int.c index a33a73bb502d..93ca48f804ac 100644 --- a/arch/i386/oprofile/nmi_timer_int.c +++ b/arch/i386/oprofile/nmi_timer_int.c @@ -17,32 +17,49 @@ #include <asm/nmi.h> #include <asm/apic.h> #include <asm/ptrace.h> +#include <asm/kdebug.h> -static int nmi_timer_callback(struct pt_regs * regs, int cpu) +int profile_timer_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) { - oprofile_add_sample(regs, 0); - return 1; + struct die_args *args = (struct die_args *)data; + int ret = NOTIFY_DONE; + + switch(val) { + case DIE_NMI: + oprofile_add_sample(args->regs, 0); + ret = NOTIFY_STOP; + break; + default: + break; + } + return ret; } +static struct notifier_block profile_timer_exceptions_nb = { + .notifier_call = profile_timer_exceptions_notify, + .next = NULL, + .priority = 0 +}; + static int timer_start(void) { - disable_timer_nmi_watchdog(); - set_nmi_callback(nmi_timer_callback); + if (register_die_notifier(&profile_timer_exceptions_nb)) + return 1; return 0; } static void timer_stop(void) { - enable_timer_nmi_watchdog(); - unset_nmi_callback(); + unregister_die_notifier(&profile_timer_exceptions_nb); synchronize_sched(); /* Allow already-started NMIs to complete. */ } int __init op_nmi_timer_init(struct oprofile_operations * ops) { - if (atomic_read(&nmi_active) <= 0) + if ((nmi_watchdog != NMI_IO_APIC) || (atomic_read(&nmi_active) <= 0)) return -ENODEV; ops->start = timer_start; diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c index d8d5750d6106..44c8af65325e 100644 --- a/arch/x86_64/kernel/crash.c +++ b/arch/x86_64/kernel/crash.c @@ -23,6 +23,7 @@ #include <asm/nmi.h> #include <asm/hw_irq.h> #include <asm/mach_apic.h> +#include <asm/kdebug.h> /* This keeps a track of which one is crashing cpu. */ static int crashing_cpu; @@ -95,8 +96,18 @@ static void crash_save_self(struct pt_regs *regs) #ifdef CONFIG_SMP static atomic_t waiting_for_crash_ipi; -static int crash_nmi_callback(struct pt_regs *regs, int cpu) +static int crash_nmi_callback(struct notifier_block *self, + unsigned long val, void *data) { + struct pt_regs *regs; + int cpu; + + if (val != DIE_NMI) + return NOTIFY_OK; + + regs = ((struct die_args *)data)->regs; + cpu = raw_smp_processor_id(); + /* * Don't do anything if this handler is invoked on crashing cpu. * Otherwise, system will completely hang. Crashing cpu can get @@ -127,12 +138,17 @@ static void smp_send_nmi_allbutself(void) * cpu hotplug shouldn't matter. */ +static struct notifier_block crash_nmi_nb = { + .notifier_call = crash_nmi_callback, +}; + static void nmi_shootdown_cpus(void) { unsigned long msecs; atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); - set_nmi_callback(crash_nmi_callback); + if (register_die_notifier(&crash_nmi_nb)) + return; /* return what? 
*/ /* * Ensure the new callback function is set before sending diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index f6b881b23a70..9d175dcf3a2d 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -41,20 +41,6 @@ static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[2]); */ #define NMI_MAX_COUNTER_BITS 66 -/* - * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: - * - it may be reserved by some other driver, or not - * - when not reserved by some other driver, it may be used for - * the NMI watchdog, or not - * - * This is maintained separately from nmi_active because the NMI - * watchdog may also be driven from the I/O APIC timer. - */ -static DEFINE_SPINLOCK(lapic_nmi_owner_lock); -static unsigned int lapic_nmi_owner; -#define LAPIC_NMI_WATCHDOG (1<<0) -#define LAPIC_NMI_RESERVED (1<<1) - /* nmi_active: * >0: the lapic NMI watchdog is active, but can be disabled * <0: the lapic NMI watchdog has not been set up, and cannot @@ -321,33 +307,6 @@ static void enable_lapic_nmi_watchdog(void) touch_nmi_watchdog(); } -int reserve_lapic_nmi(void) -{ - unsigned int old_owner; - - spin_lock(&lapic_nmi_owner_lock); - old_owner = lapic_nmi_owner; - lapic_nmi_owner |= LAPIC_NMI_RESERVED; - spin_unlock(&lapic_nmi_owner_lock); - if (old_owner & LAPIC_NMI_RESERVED) - return -EBUSY; - if (old_owner & LAPIC_NMI_WATCHDOG) - disable_lapic_nmi_watchdog(); - return 0; -} - -void release_lapic_nmi(void) -{ - unsigned int new_owner; - - spin_lock(&lapic_nmi_owner_lock); - new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED; - lapic_nmi_owner = new_owner; - spin_unlock(&lapic_nmi_owner_lock); - if (new_owner & LAPIC_NMI_WATCHDOG) - enable_lapic_nmi_watchdog(); -} - void disable_timer_nmi_watchdog(void) { BUG_ON(nmi_watchdog != NMI_IO_APIC); @@ -762,13 +721,6 @@ done: return rc; } -static __kprobes int dummy_nmi_callback(struct pt_regs * regs, int cpu) -{ - return 0; -} - -static nmi_callback_t nmi_callback = dummy_nmi_callback; - asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) { nmi_enter(); @@ -779,21 +731,12 @@ asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) int do_nmi_callback(struct pt_regs * regs, int cpu) { - return rcu_dereference(nmi_callback)(regs, cpu); -} - -void set_nmi_callback(nmi_callback_t callback) -{ - vmalloc_sync_all(); - rcu_assign_pointer(nmi_callback, callback); -} -EXPORT_SYMBOL_GPL(set_nmi_callback); - -void unset_nmi_callback(void) -{ - nmi_callback = dummy_nmi_callback; +#ifdef CONFIG_SYSCTL + if (unknown_nmi_panic) + return unknown_nmi_panic_callback(regs, cpu); +#endif + return 0; } -EXPORT_SYMBOL_GPL(unset_nmi_callback); #ifdef CONFIG_SYSCTL @@ -802,37 +745,8 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) unsigned char reason = get_nmi_reason(); char buf[64]; - if (!(reason & 0xc0)) { - sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(buf,regs); - } - return 0; -} - -/* - * proc handler for /proc/sys/kernel/unknown_nmi_panic - */ -int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int old_state; - - old_state = unknown_nmi_panic; - proc_dointvec(table, write, file, buffer, length, ppos); - if (!!old_state == !!unknown_nmi_panic) - return 0; - - if (unknown_nmi_panic) { - if (reserve_lapic_nmi() < 0) { - unknown_nmi_panic = 0; - return -EBUSY; - } else { - set_nmi_callback(unknown_nmi_panic_callback); - } - } else { - release_lapic_nmi(); - 
unset_nmi_callback(); - } + sprintf(buf, "NMI received for unknown reason %02x\n", reason); + die_nmi(buf,regs); return 0; } @@ -846,8 +760,6 @@ EXPORT_SYMBOL(reserve_perfctr_nmi); EXPORT_SYMBOL(release_perfctr_nmi); EXPORT_SYMBOL(reserve_evntsel_nmi); EXPORT_SYMBOL(release_evntsel_nmi); -EXPORT_SYMBOL(reserve_lapic_nmi); -EXPORT_SYMBOL(release_lapic_nmi); EXPORT_SYMBOL(disable_timer_nmi_watchdog); EXPORT_SYMBOL(enable_timer_nmi_watchdog); EXPORT_SYMBOL(touch_nmi_watchdog); diff --git a/include/asm-i386/nmi.h b/include/asm-i386/nmi.h index da0e0b4e9139..34d6bf063b6e 100644 --- a/include/asm-i386/nmi.h +++ b/include/asm-i386/nmi.h @@ -6,24 +6,13 @@ #include <linux/pm.h> -struct pt_regs; - -typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu); - -/** - * set_nmi_callback - * - * Set a handler for an NMI. Only one handler may be - * set. Return 1 if the NMI was handled. - */ -void set_nmi_callback(nmi_callback_t callback); - /** - * unset_nmi_callback + * do_nmi_callback * - * Remove the handler previously set. + * Check to see if a callback exists and execute it. Return 1 + * if the handler exists and was handled successfully. */ -void unset_nmi_callback(void); +int do_nmi_callback(struct pt_regs *regs, int cpu); extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); extern int avail_to_resrv_perfctr_nmi(unsigned int); @@ -33,8 +22,6 @@ extern int reserve_evntsel_nmi(unsigned int); extern void release_evntsel_nmi(unsigned int); extern void setup_apic_nmi_watchdog (void *); -extern int reserve_lapic_nmi(void); -extern void release_lapic_nmi(void); extern void disable_timer_nmi_watchdog(void); extern void enable_timer_nmi_watchdog(void); extern int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason); diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h index 8f02a2a416e6..8818c39d34e0 100644 --- a/include/asm-x86_64/nmi.h +++ b/include/asm-x86_64/nmi.h @@ -7,25 +7,6 @@ #include <linux/pm.h> #include <asm/io.h> -struct pt_regs; - -typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu); - -/** - * set_nmi_callback - * - * Set a handler for an NMI. Only one handler may be - * set. Return 1 if the NMI was handled. - */ -void set_nmi_callback(nmi_callback_t callback); - -/** - * unset_nmi_callback - * - * Remove the handler previously set. 
- */ -void unset_nmi_callback(void); - /** * do_nmi_callback * @@ -72,8 +53,6 @@ extern int reserve_evntsel_nmi(unsigned int); extern void release_evntsel_nmi(unsigned int); extern void setup_apic_nmi_watchdog (void *); -extern int reserve_lapic_nmi(void); -extern void release_lapic_nmi(void); extern void disable_timer_nmi_watchdog(void); extern void enable_timer_nmi_watchdog(void); extern int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 362a0cc37138..83f168361624 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -76,8 +76,6 @@ extern int compat_log; #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) int unknown_nmi_panic; -extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *, - void __user *, size_t *, loff_t *); #endif /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ @@ -628,7 +626,7 @@ static ctl_table kern_table[] = { .data = &unknown_nmi_panic, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_unknown_nmi_panic, + .proc_handler = &proc_dointvec, }, #endif #if defined(CONFIG_X86) -- cgit v1.2.3 From 407984f1af259b31957c7c05075a454a751bb801 Mon Sep 17 00:00:00 2001 From: Don Zickus <dzickus@redhat.com> Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] x86: Add ability to enable/disable nmi watchdog with sysctl Adds a new /proc/sys/kernel/nmi_watchdog entry that will enable/disable the nmi watchdog. Signed-off-by: Don Zickus <dzickus@redhat.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/i386/kernel/nmi.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86_64/kernel/nmi.c | 48 ++++++++++++++++++++++++++++++++++++++++ include/asm-i386/nmi.h | 1 + include/asm-x86_64/nmi.h | 1 + include/linux/sysctl.h | 1 + kernel/sysctl.c | 11 ++++++++++ 6 files changed, 114 insertions(+) (limited to 'arch/x86_64') diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index acd3fdea2a21..28065d0b71a9 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -846,6 +846,58 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) return 0; } +/* + * proc handler for /proc/sys/kernel/nmi_watchdog + */ +int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int old_state; + + nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ?
1 : 0; + old_state = nmi_watchdog_enabled; + proc_dointvec(table, write, file, buffer, length, ppos); + if (!!old_state == !!nmi_watchdog_enabled) + return 0; + + if (atomic_read(&nmi_active) < 0) { + printk(KERN_WARNING "NMI watchdog is permanently disabled\n"); + return -EINVAL; + } + + if (nmi_watchdog == NMI_DEFAULT) { + if (nmi_known_cpu() > 0) + nmi_watchdog = NMI_LOCAL_APIC; + else + nmi_watchdog = NMI_IO_APIC; + } + + if (nmi_watchdog == NMI_LOCAL_APIC) + { + if (nmi_watchdog_enabled) + enable_lapic_nmi_watchdog(); + else + disable_lapic_nmi_watchdog(); + } else if (nmi_watchdog == NMI_IO_APIC) { + /* FIXME + * for some reason these functions don't work + */ + printk("Can not enable/disable NMI on IO APIC\n"); + return -EINVAL; +#if 0 + if (nmi_watchdog_enabled) + enable_timer_nmi_watchdog(); + else + disable_timer_nmi_watchdog(); +#endif + } else { + printk( KERN_WARNING + "NMI watchdog doesn't know what hardware to touch\n"); + return -EIO; + } + return 0; +} + #endif EXPORT_SYMBOL(nmi_active); diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 9d175dcf3a2d..3a17411a9a19 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -750,6 +750,54 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) return 0; } +/* + * proc handler for /proc/sys/kernel/nmi + */ +int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int old_state; + + nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; + old_state = nmi_watchdog_enabled; + proc_dointvec(table, write, file, buffer, length, ppos); + if (!!old_state == !!nmi_watchdog_enabled) + return 0; + + if (atomic_read(&nmi_active) < 0) { + printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); + return -EINVAL; + } + + /* if nmi_watchdog is not set yet, then set it */ + nmi_watchdog_default(); + + if (nmi_watchdog == NMI_LOCAL_APIC) + { + if (nmi_watchdog_enabled) + enable_lapic_nmi_watchdog(); + else + disable_lapic_nmi_watchdog(); + } else if (nmi_watchdog == NMI_IO_APIC) { + /* FIXME + * for some reason these functions don't work + */ + printk("Can not enable/disable NMI on IO APIC\n"); + return -EIO; +#if 0 + if (nmi_watchdog_enabled) + enable_timer_nmi_watchdog(); + else + disable_timer_nmi_watchdog(); +#endif + } else { + printk(KERN_WARNING + "NMI watchdog doesn't know what hardware to touch\n"); + return -EIO; + } + return 0; +} + #endif EXPORT_SYMBOL(nmi_active); diff --git a/include/asm-i386/nmi.h b/include/asm-i386/nmi.h index 34d6bf063b6e..13b5d8311bf7 100644 --- a/include/asm-i386/nmi.h +++ b/include/asm-i386/nmi.h @@ -14,6 +14,7 @@ */ int do_nmi_callback(struct pt_regs *regs, int cpu); +extern int nmi_watchdog_enabled; extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); extern int avail_to_resrv_perfctr_nmi(unsigned int); extern int reserve_perfctr_nmi(unsigned int); diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h index 8818c39d34e0..2c23b0df87d2 100644 --- a/include/asm-x86_64/nmi.h +++ b/include/asm-x86_64/nmi.h @@ -43,6 +43,7 @@ extern void die_nmi(char *str, struct pt_regs *regs); extern int panic_on_timeout; extern int unknown_nmi_panic; +extern int nmi_watchdog_enabled; extern int check_nmi_watchdog(void); extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 736ed917a4f8..ecb79ba52ae1 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -150,6 +150,7 @@ enum 
KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */ KERN_COMPAT_LOG=73, /* int: print compat layer messages */ KERN_MAX_LOCK_DEPTH=74, + KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */ }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 83f168361624..040de6bd74dd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -76,6 +76,9 @@ extern int compat_log; #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) int unknown_nmi_panic; +int nmi_watchdog_enabled; +extern int proc_nmi_enabled(struct ctl_table *, int , struct file *, + void __user *, size_t *, loff_t *); #endif /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ @@ -628,6 +631,14 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = KERN_NMI_WATCHDOG, + .procname = "nmi_watchdog", + .data = &nmi_watchdog_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_nmi_enabled, + }, #endif #if defined(CONFIG_X86) { -- cgit v1.2.3 From e33e89ab1a8d295de0500b697f4f31c3ceee9aa2 Mon Sep 17 00:00:00 2001 From: Don Zickus <dzickus@redhat.com> Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] x86: Add ability to enable/disable nmi watchdog from procfs (update) Adds a new /proc/sys/kernel/nmi_watchdog entry that will enable/disable the nmi watchdog. By entering a non-zero value here, a user can enable the nmi watchdog to monitor the online cpus in the system. By entering a zero value here, a user can disable the nmi watchdog and free up a performance counter which could then be utilized by the oprofile subsystem, otherwise oprofile may be short a counter when in use. Signed-off-by: Don Zickus <dzickus@redhat.com> Signed-off-by: Andi Kleen <ak@suse.de> Cc: Andi Kleen <ak@muc.de> Signed-off-by: Andrew Morton <akpm@osdl.org> --- Documentation/filesystems/proc.txt | 14 +++++++++----- arch/i386/kernel/nmi.c | 21 ++++----------------- arch/x86_64/kernel/nmi.c | 21 ++++----------------- 3 files changed, 17 insertions(+), 39 deletions(-) (limited to 'arch/x86_64') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 99902ae6804e..7db71d6fba82 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1124,11 +1124,15 @@ debugging information is displayed on console. NMI switch that most IA32 servers have fires unknown NMI up, for example. If a system hangs up, try pressing the NMI switch. -[NOTE] - This function and oprofile share a NMI callback. Therefore this function - cannot be enabled when oprofile is activated. - And NMI watchdog will be disabled when the value in this file is set to - non-zero. +nmi_watchdog +------------ + +Enables/Disables the NMI watchdog on x86 systems. When the value is non-zero +the NMI watchdog is enabled and will continuously test all online cpus to +determine whether or not they are still functioning properly. + +Because the NMI watchdog shares registers with oprofile, by disabling the NMI +watchdog, oprofile may have more registers to utilize.
2.4 /proc/sys/vm - The virtual memory subsystem diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 28065d0b71a9..6241e4448cab 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -847,7 +847,7 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) } /* - * proc handler for /proc/sys/kernel/nmi_watchdog + * proc handler for /proc/sys/kernel/nmi */ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) @@ -861,8 +861,8 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, return 0; if (atomic_read(&nmi_active) < 0) { - printk(KERN_WARNING "NMI watchdog is permanently disabled\n"); - return -EINVAL; + printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); + return -EIO; } if (nmi_watchdog == NMI_DEFAULT) { @@ -872,24 +872,11 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, nmi_watchdog = NMI_IO_APIC; } - if (nmi_watchdog == NMI_LOCAL_APIC) - { + if (nmi_watchdog == NMI_LOCAL_APIC) { if (nmi_watchdog_enabled) enable_lapic_nmi_watchdog(); else disable_lapic_nmi_watchdog(); - } else if (nmi_watchdog == NMI_IO_APIC) { - /* FIXME - * for some reason these functions don't work - */ - printk("Can not enable/disable NMI on IO APIC\n"); - return -EINVAL; -#if 0 - if (nmi_watchdog_enabled) - enable_timer_nmi_watchdog(); - else - disable_timer_nmi_watchdog(); -#endif } else { printk( KERN_WARNING "NMI watchdog doesn't know what hardware to touch\n"); diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 3a17411a9a19..dd57410dad51 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -167,7 +167,7 @@ static __cpuinit inline int nmi_known_cpu(void) } /* Run after command line and cpu_init init, but before all other checks */ -void __cpuinit nmi_watchdog_default(void) +void nmi_watchdog_default(void) { if (nmi_watchdog != NMI_DEFAULT) return; @@ -766,32 +766,19 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, if (atomic_read(&nmi_active) < 0) { printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); - return -EINVAL; + return -EIO; } /* if nmi_watchdog is not set yet, then set it */ nmi_watchdog_default(); - if (nmi_watchdog == NMI_LOCAL_APIC) - { + if (nmi_watchdog == NMI_LOCAL_APIC) { if (nmi_watchdog_enabled) enable_lapic_nmi_watchdog(); else disable_lapic_nmi_watchdog(); - } else if (nmi_watchdog == NMI_IO_APIC) { - /* FIXME - * for some reason these functions don't work - */ - printk("Can not enable/disable NMI on IO APIC\n"); - return -EIO; -#if 0 - if (nmi_watchdog_enabled) - enable_timer_nmi_watchdog(); - else - disable_timer_nmi_watchdog(); -#endif } else { - printk(KERN_WARNING + printk( KERN_WARNING "NMI watchdog doesn't know what hardware to touch\n"); return -EIO; } -- cgit v1.2.3 From 8da5adda91df3d2fcc5300e68da491694c9af019 Mon Sep 17 00:00:00 2001 From: Don Zickus <dzickus@redhat.com> Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] x86: Allow users to force a panic on NMI To quote Alan Cox: The default Linux behaviour on an NMI of either memory or unknown is to continue operation. For many environments such as scientific computing it is preferable that the box is taken out and the error dealt with than an uncorrected parity/ECC error get propagated. A small number of systems do generate NMI's for bizarre random reasons such as power management so the default is unchanged.
In other respects the new proc/sys entry works like the existing panic controls already in that directory. This is separate to the edac support - EDAC allows supported chipsets to handle ECC errors well, this change allows unsupported cases to at least panic rather than cause problems further down the line. Signed-off-by: Don Zickus <dzickus@redhat.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/i386/kernel/traps.c | 6 ++++++ arch/x86_64/kernel/traps.c | 6 ++++++ include/linux/kernel.h | 1 + include/linux/sysctl.h | 1 + kernel/panic.c | 1 + kernel/sysctl.c | 8 ++++++++ 6 files changed, 23 insertions(+) (limited to 'arch/x86_64') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 7db664d0b25c..2f6cb8276480 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -635,6 +635,8 @@ static void mem_parity_error(unsigned char reason, struct pt_regs * regs) "to continue\n"); printk(KERN_EMERG "You probably have a hardware problem with your RAM " "chips\n"); + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); /* Clear and disable the memory parity error line. */ clear_mem_error(reason); @@ -670,6 +672,10 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) reason, smp_processor_id()); printk("Dazed and confused, but trying to continue\n"); printk("Do you have a strange power saving mode enabled?\n"); + + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + } static DEFINE_SPINLOCK(nmi_print_lock); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 42bc070fdf11..b18829db2a6a 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -732,6 +732,8 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs) { printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); printk("You probably have a hardware problem with your RAM chips\n"); + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); /* Clear and disable the memory parity error line. */ reason = (reason & 0xf) | 4; @@ -757,6 +759,10 @@ unknown_nmi_error(unsigned char reason, struct pt_regs * regs) { printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); printk("Dazed and confused, but trying to continue\n"); printk("Do you have a strange power saving mode enabled?\n"); + + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + } /* Runs on IST stack. This code must keep interrupts off all the time. 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2b2ae4fdce8b..1ff9609300b4 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -186,6 +186,7 @@ extern void bust_spinlocks(int yes); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_timeout; extern int panic_on_oops; +extern int panic_on_unrecovered_nmi; extern int tainted; extern const char *print_tainted(void); extern void add_taint(unsigned); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index ecb79ba52ae1..432778446ad2 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -151,6 +151,7 @@ enum KERN_COMPAT_LOG=73, /* int: print compat layer messages */ KERN_MAX_LOCK_DEPTH=74, KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */ + KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */ }; diff --git a/kernel/panic.c b/kernel/panic.c index 8010b9b17aca..d2db3e2209e0 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -21,6 +21,7 @@ #include <linux/debug_locks.h> int panic_on_oops; +int panic_on_unrecovered_nmi; int tainted; static int pause_on_oops; static int pause_on_oops_flag; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 040de6bd74dd..220e20564124 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -641,6 +641,14 @@ static ctl_table kern_table[] = { }, #endif #if defined(CONFIG_X86) + { + .ctl_name = KERN_PANIC_ON_NMI, + .procname = "panic_on_unrecovered_nmi", + .data = &panic_on_unrecovered_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = KERN_BOOTLOADER_TYPE, .procname = "bootloader_type", -- cgit v1.2.3 From c41c5cd3b20a2d81c30498f13b1527847a8fdf69 Mon Sep 17 00:00:00 2001 From: Don Zickus <dzickus@redhat.com> Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] x86: Clean up NMI panic messages Clean up some of the output messages on the nmi error paths to make more sense when they are displayed. This is mainly a cosmetic fix and shouldn't impact any normal code path. Signed-off-by: Don Zickus <dzickus@redhat.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/i386/kernel/traps.c | 15 ++++++++------- arch/x86_64/kernel/traps.c | 21 ++++++++++++++------- 2 files changed, 22 insertions(+), 14 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 2f6cb8276480..3c85c89f68d8 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -631,13 +631,15 @@ gp_in_kernel: static void mem_parity_error(unsigned char reason, struct pt_regs * regs) { - printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying " - "to continue\n"); + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " + "CPU %d.\n", reason, smp_processor_id()); printk(KERN_EMERG "You probably have a hardware problem with your RAM " "chips\n"); if (panic_on_unrecovered_nmi) panic("NMI: Not continuing"); + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); + /* Clear and disable the memory parity error line. */ clear_mem_error(reason); } @@ -668,14 +670,13 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) return; } #endif - printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); - printk("Dazed and confused, but trying to continue\n"); - printk("Do you have a strange power saving mode enabled?\n"); - + printk(KERN_EMERG "Uhhuh.
NMI received for unknown reason %02x on " + "CPU %d.\n", reason, smp_processor_id()); + printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); if (panic_on_unrecovered_nmi) panic("NMI: Not continuing"); + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } static DEFINE_SPINLOCK(nmi_print_lock); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index b18829db2a6a..dae10df60926 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -730,10 +730,15 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, static __kprobes void mem_parity_error(unsigned char reason, struct pt_regs * regs) { - printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); - printk("You probably have a hardware problem with your RAM chips\n"); + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", + reason); + printk(KERN_EMERG "You probably have a hardware problem with your " + "RAM chips\n"); + if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); + panic("NMI: Not continuing"); + + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); /* Clear and disable the memory parity error line. */ reason = (reason & 0xf) | 4; @@ -756,13 +761,15 @@ io_check_error(unsigned char reason, struct pt_regs * regs) static __kprobes void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) -{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); - printk("Dazed and confused, but trying to continue\n"); - printk("Do you have a strange power saving mode enabled?\n"); +{ + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", + reason); + printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); + panic("NMI: Not continuing"); + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } /* Runs on IST stack. This code must keep interrupts off all the time. -- cgit v1.2.3 From 4038f901cf102a40715b900984ed7540a9fa637f Mon Sep 17 00:00:00 2001 From: Shaohua Li <shaohua.li@intel.com> Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] i386/x86-64: Fix NMI watchdog suspend/resume Making NMI suspend/resume work with SMP. We use CPU hotplug to offline APs in SMP suspend/resume. Only BSP executes sysdev's .suspend/.resume method. APs should follow CPU hotplug code path. And: +From: Don Zickus <dzickus@redhat.com> Makes the start/stop paths of nmi watchdog more robust to handle the suspend/resume cases more gracefully. 
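In short, the robustness comes from making each per-CPU start path (and, symmetrically, the stop path) idempotent. A condensed sketch of the shape setup_apic_nmi_watchdog() takes in the hunks below, with the hardware programming elided:

void setup_apic_nmi_watchdog(void *unused)
{
        struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);

        if (wd->enabled == 1)   /* already running: a repeat call is a no-op */
                return;
        /* cheap suspend/resume hack: if cpu0 is not active, neither
         * should the other cpus be */
        if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
                return;
        /* ... program the LAPIC/perfctr hardware here ... */
        wd->enabled = 1;
        atomic_inc(&nmi_active);        /* stays balanced across suspend/resume */
}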
AK: I merged the two patches together Signed-off-by: Shaohua Li <shaohua.li@intel.com> Signed-off-by: Andi Kleen <ak@suse.de> Cc: Don Zickus <dzickus@redhat.com> Cc: Andi Kleen <ak@muc.de> Signed-off-by: Andrew Morton <akpm@osdl.org> --- arch/i386/kernel/nmi.c | 33 ++++++++++++++++++++++++++------- arch/i386/kernel/smpboot.c | 3 ++- arch/x86_64/kernel/nmi.c | 33 ++++++++++++++++++++++++++------- arch/x86_64/kernel/smpboot.c | 2 ++ include/asm-i386/nmi.h | 1 + include/asm-x86_64/nmi.h | 1 + 6 files changed, 58 insertions(+), 15 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 6241e4448cab..8e4ed930ce6b 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -63,7 +63,6 @@ struct nmi_watchdog_ctlblk { static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); /* local prototypes */ -static void stop_apic_nmi_watchdog(void *unused); static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); extern void show_registers(struct pt_regs *regs); @@ -341,15 +340,20 @@ static int nmi_pm_active; /* nmi_active before suspend */ static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) { + /* only CPU0 goes here, other CPUs should be offline */ nmi_pm_active = atomic_read(&nmi_active); - disable_lapic_nmi_watchdog(); + stop_apic_nmi_watchdog(NULL); + BUG_ON(atomic_read(&nmi_active) != 0); return 0; } static int lapic_nmi_resume(struct sys_device *dev) { - if (nmi_pm_active > 0) - enable_lapic_nmi_watchdog(); + /* only CPU0 goes here, other CPUs should be offline */ + if (nmi_pm_active > 0) { + setup_apic_nmi_watchdog(NULL); + touch_nmi_watchdog(); + } return 0; } @@ -626,11 +630,21 @@ static void stop_p4_watchdog(void) void setup_apic_nmi_watchdog (void *unused) { + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + /* only support LOCAL and IO APICs for now */ if ((nmi_watchdog != NMI_LOCAL_APIC) && (nmi_watchdog != NMI_IO_APIC)) return; + if (wd->enabled == 1) + return; + + /* cheap hack to support suspend/resume */ + /* if cpu0 is not active neither should the other cpus */ + if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) + return; + if (nmi_watchdog == NMI_LOCAL_APIC) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: @@ -663,17 +677,22 @@ void setup_apic_nmi_watchdog (void *unused) return; } } - __get_cpu_var(nmi_watchdog_ctlblk.enabled) = 1; + wd->enabled = 1; atomic_inc(&nmi_active); } -static void stop_apic_nmi_watchdog(void *unused) +void stop_apic_nmi_watchdog(void *unused) { + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + /* only support LOCAL and IO APICs for now */ if ((nmi_watchdog != NMI_LOCAL_APIC) && (nmi_watchdog != NMI_IO_APIC)) return; + if (wd->enabled == 0) + return; + if (nmi_watchdog == NMI_LOCAL_APIC) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: @@ -697,7 +716,7 @@ static void stop_apic_nmi_watchdog(void *unused) return; } } - __get_cpu_var(nmi_watchdog_ctlblk.enabled) = 0; + wd->enabled = 0; atomic_dec(&nmi_active); } diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index efe07990e7fc..9367af76ce37 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -1376,7 +1376,8 @@ int __cpu_disable(void) */ if (cpu == 0) return -EBUSY; - + if (nmi_watchdog == NMI_LOCAL_APIC) + stop_apic_nmi_watchdog(NULL); clear_local_APIC(); /* Allow any queued timer interrupts to get serviced */ local_irq_enable(); diff --git a/arch/x86_64/kernel/nmi.c 
b/arch/x86_64/kernel/nmi.c index dd57410dad51..5a35975e5763 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -63,7 +63,6 @@ struct nmi_watchdog_ctlblk { static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); /* local prototypes */ -static void stop_apic_nmi_watchdog(void *unused); static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); /* converts an msr to an appropriate reservation bit */ @@ -337,15 +336,20 @@ static int nmi_pm_active; /* nmi_active before suspend */ static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) { + /* only CPU0 goes here, other CPUs should be offline */ nmi_pm_active = atomic_read(&nmi_active); - disable_lapic_nmi_watchdog(); + stop_apic_nmi_watchdog(NULL); + BUG_ON(atomic_read(&nmi_active) != 0); return 0; } static int lapic_nmi_resume(struct sys_device *dev) { - if (nmi_pm_active > 0) - enable_lapic_nmi_watchdog(); + /* only CPU0 goes here, other CPUs should be offline */ + if (nmi_pm_active > 0) { + setup_apic_nmi_watchdog(NULL); + touch_nmi_watchdog(); + } return 0; } @@ -561,11 +565,21 @@ static void stop_p4_watchdog(void) void setup_apic_nmi_watchdog(void *unused) { + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + /* only support LOCAL and IO APICs for now */ if ((nmi_watchdog != NMI_LOCAL_APIC) && (nmi_watchdog != NMI_IO_APIC)) return; + if (wd->enabled == 1) + return; + + /* cheap hack to support suspend/resume */ + /* if cpu0 is not active neither should the other cpus */ + if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) + return; + if (nmi_watchdog == NMI_LOCAL_APIC) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: @@ -582,17 +596,22 @@ void setup_apic_nmi_watchdog(void *unused) return; } } - __get_cpu_var(nmi_watchdog_ctlblk.enabled) = 1; + wd->enabled = 1; atomic_inc(&nmi_active); } -static void stop_apic_nmi_watchdog(void *unused) +void stop_apic_nmi_watchdog(void *unused) { + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + /* only support LOCAL and IO APICs for now */ if ((nmi_watchdog != NMI_LOCAL_APIC) && (nmi_watchdog != NMI_IO_APIC)) return; + if (wd->enabled == 0) + return; + if (nmi_watchdog == NMI_LOCAL_APIC) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: @@ -607,7 +626,7 @@ static void stop_apic_nmi_watchdog(void *unused) return; } } - __get_cpu_var(nmi_watchdog_ctlblk.enabled) = 0; + wd->enabled = 0; atomic_dec(&nmi_active); } diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 975380207b46..15555879ce95 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -1233,6 +1233,8 @@ int __cpu_disable(void) if (cpu == 0) return -EBUSY; + if (nmi_watchdog == NMI_LOCAL_APIC) + stop_apic_nmi_watchdog(NULL); clear_local_APIC(); /* diff --git a/include/asm-i386/nmi.h b/include/asm-i386/nmi.h index 13b5d8311bf7..303bcd4592bb 100644 --- a/include/asm-i386/nmi.h +++ b/include/asm-i386/nmi.h @@ -23,6 +23,7 @@ extern int reserve_evntsel_nmi(unsigned int); extern void release_evntsel_nmi(unsigned int); extern void setup_apic_nmi_watchdog (void *); +extern void stop_apic_nmi_watchdog (void *); extern void disable_timer_nmi_watchdog(void); extern void enable_timer_nmi_watchdog(void); extern int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason); diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h index 2c23b0df87d2..578596494275 100644 --- a/include/asm-x86_64/nmi.h +++ b/include/asm-x86_64/nmi.h @@ -54,6 +54,7 @@ extern int 
reserve_evntsel_nmi(unsigned int); extern void release_evntsel_nmi(unsigned int); extern void setup_apic_nmi_watchdog (void *); +extern void stop_apic_nmi_watchdog (void *); extern void disable_timer_nmi_watchdog(void); extern void enable_timer_nmi_watchdog(void); extern int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason); -- cgit v1.2.3 From fac58550e80c307bf17cfa0dd544fca4eff120a5 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] Fix up panic messages for different NMI panics When an unknown NMI happened, the panic would claim an NMI watchdog timeout. It would also check the variable set by nmi_watchdog=panic and panic then. Fix up the panic message to be generic. Unconditionally panic on an unknown NMI when panic on unknown NMIs is enabled. Noticed by Jan Beulich Cc: jbeulich@novell.com Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/nmi.c | 5 +++-- arch/x86_64/kernel/traps.c | 7 +++---- include/asm-x86_64/nmi.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 5a35975e5763..1b76d1574529 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -695,7 +695,8 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) */ local_inc(&__get_cpu_var(alert_counter)); if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) - die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs); + die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs, + panic_on_timeout); } else { __get_cpu_var(last_irq_sum) = sum; local_set(&__get_cpu_var(alert_counter), 0); @@ -765,7 +766,7 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) char buf[64]; sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(buf,regs); + die_nmi(buf, regs, 1); /* Always panic here */ return 0; } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index dae10df60926..96f62a033242 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -570,7 +570,7 @@ void die(const char * str, struct pt_regs * regs, long err) do_exit(SIGSEGV); } -void __kprobes die_nmi(char *str, struct pt_regs *regs) +void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) { unsigned long flags = oops_begin(); @@ -582,9 +582,8 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs) show_registers(regs); if (kexec_should_crash(current)) crash_kexec(regs); - if (panic_on_timeout || panic_on_oops) - panic("nmi watchdog"); - printk("console shuts up ...\n"); + if (do_panic || panic_on_oops) + panic("Non maskable interrupt"); oops_end(flags); nmi_exit(); local_irq_enable(); diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h index 578596494275..cbf2669bca71 100644 --- a/include/asm-x86_64/nmi.h +++ b/include/asm-x86_64/nmi.h @@ -37,7 +37,7 @@ static inline void unset_nmi_pm_callback(struct pm_dev * dev) #endif /* CONFIG_PM */ extern void default_do_nmi(struct pt_regs *); -extern void die_nmi(char *str, struct pt_regs *regs); +extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); #define get_nmi_reason() inb(0x61) -- cgit v1.2.3 From 3f22c5789eb76fd9aabe4be37ba609c793f046f9 Mon Sep 17 00:00:00 2001 From: Vivek Goyal <vgoyal@in.ibm.com> Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] kdump x86_64 nmi event notification fix After a crash we should wait for the NMI IPI event and not for an external NMI or an NMI watchdog tick.
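[Illustrative aside, not part of any patch here: the fix just described hinges on the die-notifier chain delivering distinct DIE_* events. A minimal sketch of that callback pattern, assuming the 2.6.18-era API from <linux/notifier.h> and <asm/kdebug.h>; all names below are illustrative:]

static int example_nmi_callback(struct notifier_block *self,
				unsigned long val, void *data)
{
	/* React only to the crash NMI IPI; watchdog ticks and external
	 * NMIs arrive as other DIE_* events and are passed down the
	 * chain untouched. */
	if (val != DIE_NMI_IPI)
		return NOTIFY_OK;
	/* ... per-CPU crash handling would go here ... */
	return NOTIFY_STOP;	/* event consumed */
}

static struct notifier_block example_nmi_nb = {
	.notifier_call = example_nmi_callback,
};

/* registered once at init time: register_die_notifier(&example_nmi_nb); */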
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com> Signed-off-by: Andi Kleen <ak@suse.de> Cc: Don Zickus <dzickus@redhat.com> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@osdl.org> --- arch/x86_64/kernel/crash.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c index 44c8af65325e..fc57a139123e 100644 --- a/arch/x86_64/kernel/crash.c +++ b/arch/x86_64/kernel/crash.c @@ -102,7 +102,7 @@ static int crash_nmi_callback(struct notifier_block *self, struct pt_regs *regs; int cpu; - if (val != DIE_NMI) + if (val != DIE_NMI_IPI) return NOTIFY_OK; regs = ((struct die_args *)data)->regs; @@ -114,7 +114,7 @@ static int crash_nmi_callback(struct notifier_block *self, * an NMI if system was initially booted with nmi_watchdog parameter. */ if (cpu == crashing_cpu) - return 1; + return NOTIFY_STOP; local_irq_disable(); crash_save_this_cpu(regs, cpu); -- cgit v1.2.3 From 248dcb2ffffe8f3e4a369556a68988788c208111 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Date: Tue, 26 Sep 2006 10:52:27 +0200 Subject: [PATCH] x86: i386/x86-64 Add nmi watchdog support for new Intel CPUs AK: This redoes the changes I temporarily reverted. Intel now has support for Architectural Performance Monitoring Counters (refer to the IA-32 Intel Architecture Software Developer's Manual, http://www.intel.com/design/pentium4/manuals/253669.htm). This feature is present starting with the Intel Core Duo and Intel Core Solo processors. What this means is that the performance monitoring counters and some performance monitoring events are now defined in an architectural way (using cpuid), and there is no need to check for family/model etc. for these architectural events. Below is the patch to use these performance counters in the NMI watchdog driver. The patch handles both i386 and x86-64 kernels.
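[Illustrative aside, not part of the patch: the cpuid-based detection described above reads leaf 0xA. A user-space sketch under the assumption that the CPU reports that leaf (a real probe would check the maximum supported leaf first); the cpuid() helper is illustrative, not a kernel API:]

#include <stdio.h>

static void cpuid(unsigned op, unsigned *a, unsigned *b,
		  unsigned *c, unsigned *d)
{
	asm volatile("cpuid"
		     : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
		     : "0" (op));
}

int main(void)
{
	unsigned eax, ebx, ecx, edx;
	unsigned version, mask_length;

	cpuid(0xA, &eax, &ebx, &ecx, &edx);	/* architectural perfmon leaf */
	version = eax & 0xff;			/* cpuid10_eax.split.version_id */
	mask_length = (eax >> 24) & 0xff;	/* cpuid10_eax.split.mask_length */
	/* NOTE: a 0 bit in EBX means the event IS present. */
	if (mask_length > 0 && !(ebx & 1))
		printf("arch perfmon v%u: unhalted core cycles event present\n",
		       version);
	return 0;
}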
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/i386/kernel/nmi.c | 126 +++++++++++++++++++++++++++++-- arch/x86_64/kernel/nmi.c | 130 ++++++++++++++++++++++++++++++-- include/asm-i386/intel_arch_perfmon.h | 31 ++++++++ include/asm-x86_64/intel_arch_perfmon.h | 31 ++++++++ 4 files changed, 308 insertions(+), 10 deletions(-) create mode 100644 include/asm-i386/intel_arch_perfmon.h create mode 100644 include/asm-x86_64/intel_arch_perfmon.h (limited to 'arch/x86_64') diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 6e5085d5d2f6..7b9a053effa3 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -26,6 +26,7 @@ #include <asm/smp.h> #include <asm/nmi.h> #include <asm/kdebug.h> +#include <asm/intel_arch_perfmon.h> #include "mach_traps.h" @@ -77,6 +78,9 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) case X86_VENDOR_AMD: return (msr - MSR_K7_PERFCTR0); case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return (msr - MSR_ARCH_PERFMON_PERFCTR0); + switch (boot_cpu_data.x86) { case 6: return (msr - MSR_P6_PERFCTR0); @@ -95,6 +99,9 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) case X86_VENDOR_AMD: return (msr - MSR_K7_EVNTSEL0); case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return (msr - MSR_ARCH_PERFMON_EVENTSEL0); + switch (boot_cpu_data.x86) { case 6: return (msr - MSR_P6_EVNTSEL0); @@ -174,7 +181,10 @@ static __cpuinit inline int nmi_known_cpu(void) case X86_VENDOR_AMD: return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); case X86_VENDOR_INTEL: - return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return 1; + else + return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); } return 0; } @@ -261,8 +271,24 @@ static int __init check_nmi_watchdog(void) /* now that we know it works we can reduce NMI frequency to something more reasonable; makes a difference in some configs */ - if (nmi_watchdog == NMI_LOCAL_APIC) + if (nmi_watchdog == NMI_LOCAL_APIC) { + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + nmi_hz = 1; + /* + * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter + * are writable, with higher bits sign extending from bit 31. + * So, we can only program the counter with 31 bit values and + * 32nd bit should be 1, for 33.. to be 1. + * Find the appropriate nmi_hz + */ + if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 && + ((u64)cpu_khz * 1000) > 0x7fffffffULL) { + u64 count = (u64)cpu_khz * 1000; + do_div(count, 0x7fffffffUL); + nmi_hz = count + 1; + } + } kfree(prev_nmi_count); return 0; @@ -637,6 +663,85 @@ static void stop_p4_watchdog(void) release_perfctr_nmi(wd->perfctr_msr); } +#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL +#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK + +static int setup_intel_arch_watchdog(void) +{ + unsigned int ebx; + union cpuid10_eax eax; + unsigned int unused; + unsigned int perfctr_msr, evntsel_msr; + unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebx indicates event present. 
+ */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + goto fail; + + perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; + evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0; + + if (!reserve_perfctr_nmi(perfctr_msr)) + goto fail; + + if (!reserve_evntsel_nmi(evntsel_msr)) + goto fail1; + + wrmsrl(perfctr_msr, 0UL); + + evntsel = ARCH_PERFMON_EVENTSEL_INT + | ARCH_PERFMON_EVENTSEL_OS + | ARCH_PERFMON_EVENTSEL_USR + | ARCH_PERFMON_NMI_EVENT_SEL + | ARCH_PERFMON_NMI_EVENT_UMASK; + + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + write_watchdog_counter(perfctr_msr, "INTEL_ARCH_PERFCTR0"); + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + wd->check_bit = 1ULL << (eax.split.bit_width - 1); + return 1; +fail1: + release_perfctr_nmi(perfctr_msr); +fail: + return 0; +} + +static void stop_intel_arch_watchdog(void) +{ + unsigned int ebx; + union cpuid10_eax eax; + unsigned int unused; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebx indicates event present. + */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + return; + + wrmsr(wd->evntsel_msr, 0, 0); + release_evntsel_nmi(wd->evntsel_msr); + release_perfctr_nmi(wd->perfctr_msr); +} + void setup_apic_nmi_watchdog (void *unused) { struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); @@ -663,6 +768,11 @@ void setup_apic_nmi_watchdog (void *unused) return; break; case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + if (!setup_intel_arch_watchdog()) + return; + break; + } switch (boot_cpu_data.x86) { case 6: if (boot_cpu_data.x86_model > 0xd) @@ -708,6 +818,10 @@ void stop_apic_nmi_watchdog(void *unused) stop_k7_watchdog(); break; case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + stop_intel_arch_watchdog(); + break; + } switch (boot_cpu_data.x86) { case 6: if (boot_cpu_data.x86_model > 0xd) @@ -831,10 +945,12 @@ int nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) wrmsrl(wd->cccr_msr, dummy); apic_write(APIC_LVTPC, APIC_DM_NMI); } - else if (wd->perfctr_msr == MSR_P6_PERFCTR0) { - /* Only P6 based Pentium M need to re-unmask + else if (wd->perfctr_msr == MSR_P6_PERFCTR0 || + wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { + /* P6 based Pentium M need to re-unmask * the apic vector but it doesn't hurt - * other P6 variant */ + * other P6 variant. 
+ * ArchPerfom/Core Duo also needs this */ apic_write(APIC_LVTPC, APIC_DM_NMI); } /* start the cycle over again */ diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 1b76d1574529..4d6fb047952e 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -26,6 +26,7 @@ #include <asm/proto.h> #include <asm/kdebug.h> #include <asm/mce.h> +#include <asm/intel_arch_perfmon.h> /* perfctr_nmi_owner tracks the ownership of the perfctr registers: * evtsel_nmi_owner tracks the ownership of the event selection @@ -73,7 +74,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) case X86_VENDOR_AMD: return (msr - MSR_K7_PERFCTR0); case X86_VENDOR_INTEL: - return (msr - MSR_P4_BPU_PERFCTR0); + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return (msr - MSR_ARCH_PERFMON_PERFCTR0); + else + return (msr - MSR_P4_BPU_PERFCTR0); } return 0; } @@ -86,7 +90,10 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) case X86_VENDOR_AMD: return (msr - MSR_K7_EVNTSEL0); case X86_VENDOR_INTEL: - return (msr - MSR_P4_BSU_ESCR0); + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return (msr - MSR_ARCH_PERFMON_EVENTSEL0); + else + return (msr - MSR_P4_BSU_ESCR0); } return 0; } @@ -160,7 +167,10 @@ static __cpuinit inline int nmi_known_cpu(void) case X86_VENDOR_AMD: return boot_cpu_data.x86 == 15; case X86_VENDOR_INTEL: - return boot_cpu_data.x86 == 15; + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return 1; + else + return (boot_cpu_data.x86 == 15); } return 0; } @@ -246,8 +256,22 @@ int __init check_nmi_watchdog (void) /* now that we know it works we can reduce NMI frequency to something more reasonable; makes a difference in some configs */ - if (nmi_watchdog == NMI_LOCAL_APIC) + if (nmi_watchdog == NMI_LOCAL_APIC) { + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + nmi_hz = 1; + /* + * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter + * are writable, with higher bits sign extending from bit 31. + * So, we can only program the counter with 31 bit values and + * 32nd bit should be 1, for 33.. to be 1. + * Find the appropriate nmi_hz + */ + if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 && + ((u64)cpu_khz * 1000) > 0x7fffffffULL) { + nmi_hz = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1; + } + } kfree(counts); return 0; @@ -563,6 +587,87 @@ static void stop_p4_watchdog(void) release_perfctr_nmi(wd->perfctr_msr); } +#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL +#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK + +static int setup_intel_arch_watchdog(void) +{ + unsigned int ebx; + union cpuid10_eax eax; + unsigned int unused; + unsigned int perfctr_msr, evntsel_msr; + unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebx indicates event present. 
+ */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + goto fail; + + perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; + evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0; + + if (!reserve_perfctr_nmi(perfctr_msr)) + goto fail; + + if (!reserve_evntsel_nmi(evntsel_msr)) + goto fail1; + + wrmsrl(perfctr_msr, 0UL); + + evntsel = ARCH_PERFMON_EVENTSEL_INT + | ARCH_PERFMON_EVENTSEL_OS + | ARCH_PERFMON_EVENTSEL_USR + | ARCH_PERFMON_NMI_EVENT_SEL + | ARCH_PERFMON_NMI_EVENT_UMASK; + + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); + + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + wd->check_bit = 1ULL << (eax.split.bit_width - 1); + return 1; +fail1: + release_perfctr_nmi(perfctr_msr); +fail: + return 0; +} + +static void stop_intel_arch_watchdog(void) +{ + unsigned int ebx; + union cpuid10_eax eax; + unsigned int unused; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebx indicates event present. + */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + return; + + wrmsr(wd->evntsel_msr, 0, 0); + + release_evntsel_nmi(wd->evntsel_msr); + release_perfctr_nmi(wd->perfctr_msr); +} + void setup_apic_nmi_watchdog(void *unused) { struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); @@ -589,6 +694,11 @@ void setup_apic_nmi_watchdog(void *unused) return; break; case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + if (!setup_intel_arch_watchdog()) + return; + break; + } if (!setup_p4_watchdog()) return; break; @@ -620,6 +730,10 @@ void stop_apic_nmi_watchdog(void *unused) stop_k7_watchdog(); break; case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + stop_intel_arch_watchdog(); + break; + } stop_p4_watchdog(); break; default: @@ -724,7 +838,13 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) dummy &= ~P4_CCCR_OVF; wrmsrl(wd->cccr_msr, dummy); apic_write(APIC_LVTPC, APIC_DM_NMI); - } + } else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { + /* + * ArchPerfom/Core Duo needs to re-unmask + * the apic vector + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + } /* start the cycle over again */ wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); rc = 1; diff --git a/include/asm-i386/intel_arch_perfmon.h b/include/asm-i386/intel_arch_perfmon.h new file mode 100644 index 000000000000..b52cd60a075b --- /dev/null +++ b/include/asm-i386/intel_arch_perfmon.h @@ -0,0 +1,31 @@ +#ifndef X86_INTEL_ARCH_PERFMON_H +#define X86_INTEL_ARCH_PERFMON_H 1 + +#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 +#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 + +#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 +#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 + +#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) +#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) +#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) +#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) + +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c) +#define 
ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ + (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) + +union cpuid10_eax { + struct { + unsigned int version_id:8; + unsigned int num_counters:8; + unsigned int bit_width:8; + unsigned int mask_length:8; + } split; + unsigned int full; +}; + +#endif /* X86_INTEL_ARCH_PERFMON_H */ diff --git a/include/asm-x86_64/intel_arch_perfmon.h b/include/asm-x86_64/intel_arch_perfmon.h new file mode 100644 index 000000000000..8633331420ec --- /dev/null +++ b/include/asm-x86_64/intel_arch_perfmon.h @@ -0,0 +1,31 @@ +#ifndef X86_64_INTEL_ARCH_PERFMON_H +#define X86_64_INTEL_ARCH_PERFMON_H 1 + +#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 +#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 + +#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 +#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 + +#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) +#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) +#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) +#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) + +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ + (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) + +union cpuid10_eax { + struct { + unsigned int version_id:8; + unsigned int num_counters:8; + unsigned int bit_width:8; + unsigned int mask_length:8; + } split; + unsigned int full; +}; + +#endif /* X86_64_INTEL_ARCH_PERFMON_H */ -- cgit v1.2.3 From a670fad0adb1cc6202a607d250f10bd380593905 Mon Sep 17 00:00:00 2001 From: Vojtech Pavlik <vojtech@suse.cz> Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] Add initialization of the RDTSCP auxiliary values This patch adds initialization of the RDTSCP auxiliary values to CPU numbers in time.c. If RDTSCP is available, the MSRs are written with the respective values. They can later be used to initialize per-CPU timekeeping variables. AK: Some cleanups. Move externs into headers and fix CPU hotplug.
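[Illustrative aside, not part of the patch: once TSC_AUX is initialized as above, user space can read the value back with RDTSCP. A sketch using the raw opcode bytes (0f 01 f9), since assemblers of the time often lacked the rdtscp mnemonic; the decoding follows the cpu | (node << 12) layout chosen by this patch:]

static inline unsigned long long rdtscp(unsigned *aux)
{
	unsigned lo, hi;

	asm volatile(".byte 0x0f,0x01,0xf9"	/* rdtscp */
		     : "=a" (lo), "=d" (hi), "=c" (*aux));
	return ((unsigned long long)hi << 32) | lo;
}

/* usage:
 *	unsigned aux, cpu, node;
 *	rdtscp(&aux);
 *	cpu  = aux & 0xfff;
 *	node = aux >> 12;
 */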
Signed-off-by: Vojtech Pavlik <vojtech@suse.cz> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/smpboot.c | 2 ++ arch/x86_64/kernel/time.c | 47 +++++++++++++++++++++++++++++++++----------- include/asm-x86_64/proto.h | 1 + 3 files changed, 38 insertions(+), 12 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 15555879ce95..582896f7d420 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -1181,6 +1181,8 @@ void __init smp_cpus_done(unsigned int max_cpus) #endif check_nmi_watchdog(); + + time_init_gtod(); } #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 7a9b18224182..97b9e46d1992 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -24,6 +24,8 @@ #include <linux/device.h> #include <linux/sysdev.h> #include <linux/bcd.h> +#include <linux/notifier.h> +#include <linux/cpu.h> #include <linux/kallsyms.h> #include <linux/acpi.h> #ifdef CONFIG_ACPI @@ -49,7 +51,7 @@ static void cpufreq_delayed_get(void); extern void i8254_timer_resume(void); extern int using_apic_timer; -static char *time_init_gtod(void); +static char *timename = NULL; DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); @@ -893,11 +895,21 @@ static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL }; -void __init time_init(void) +static int __cpuinit +time_cpu_notifier(struct notifier_block *nb, unsigned long action, void *hcpu) { - char *timename; - char *gtod; + unsigned cpu = (unsigned long) hcpu; + if (action == CPU_ONLINE && + cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) { + unsigned p; + p = smp_processor_id() | (cpu_to_node(smp_processor_id())<<12); + write_rdtscp_aux(p); + } + return NOTIFY_DONE; +} +void __init time_init(void) +{ if (nohpet) vxtime.hpet_address = 0; @@ -931,18 +943,19 @@ void __init time_init(void) } vxtime.mode = VXTIME_TSC; - gtod = time_init_gtod(); - - printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n", - vxtime_hz / 1000000, vxtime_hz % 1000000, timename, gtod); - printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; vxtime.last_tsc = get_cycles_sync(); setup_irq(0, &irq0); set_cyc2ns_scale(cpu_khz); + + hotcpu_notifier(time_cpu_notifier, 0); + time_cpu_notifier(NULL, CPU_ONLINE, (void *)(long)smp_processor_id()); + +#ifndef CONFIG_SMP + time_init_gtod(); +#endif } /* @@ -973,12 +986,13 @@ __cpuinit int unsynchronized_tsc(void) /* * Decide what mode gettimeofday should use. */ -__init static char *time_init_gtod(void) +void time_init_gtod(void) { char *timetype; if (unsynchronized_tsc()) notsc = 1; + if (vxtime.hpet_address && notsc) { timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; if (hpet_use_timer) @@ -1001,7 +1015,16 @@ __init static char *time_init_gtod(void) timetype = hpet_use_timer ? 
"HPET/TSC" : "PIT/TSC"; vxtime.mode = VXTIME_TSC; } - return timetype; + + printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n", + vxtime_hz / 1000000, vxtime_hz % 1000000, timename, timetype); + printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); + vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; + vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; + vxtime.last_tsc = get_cycles_sync(); + + set_cyc2ns_scale(cpu_khz); } __setup("report_lost_ticks", time_setup); diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index 038fe1f47e6f..3b1c60247902 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -51,6 +51,7 @@ extern unsigned long long monotonic_base; extern int sysctl_vsyscall; extern int nohpet; extern unsigned long vxtime_hz; +extern void time_init_gtod(void); extern int numa_setup(char *opt); -- cgit v1.2.3 From c08c820508233b424deab3302bc404bbecc6493a Mon Sep 17 00:00:00 2001 From: Vojtech Pavlik <vojtech@suse.cz> Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] Add the vgetcpu vsyscall This patch adds a vgetcpu vsyscall, which depending on the CPU RDTSCP capability uses either the RDTSCP or CPUID to obtain a CPU and node numbers and pass them to the program. AK: Lots of changes over Vojtech's original code: Better prototype for vgetcpu() It's better to pass the cpu / node numbers as separate arguments to avoid mistakes when going from SMP to NUMA. Also add a fast time stamp based cache using a user supplied argument to speed things more up. Use fast method from Chuck Ebbert to retrieve node/cpu from GDT limit instead of CPUID Made sure RDTSCP init is always executed after node is known. Drop printk Signed-off-by: Vojtech Pavlik <vojtech@suse.cz> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/head.S | 2 +- arch/x86_64/kernel/time.c | 13 ++++--- arch/x86_64/kernel/vmlinux.lds.S | 3 ++ arch/x86_64/kernel/vsyscall.c | 84 +++++++++++++++++++++++++++++++++++++++- include/asm-x86_64/segment.h | 5 ++- include/asm-x86_64/smp.h | 12 ++++-- include/asm-x86_64/vsyscall.h | 9 +++++ include/linux/getcpu.h | 16 ++++++++ 8 files changed, 130 insertions(+), 14 deletions(-) create mode 100644 include/linux/getcpu.h (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index c9739ca81d06..505ec4a57506 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -371,7 +371,7 @@ ENTRY(cpu_gdt_table) .quad 0,0 /* TSS */ .quad 0,0 /* LDT */ .quad 0,0,0 /* three TLS descriptors */ - .quad 0 /* unused */ + .quad 0x0000f40000000000 /* node/CPU stored in limit */ gdt_end: /* asm/segment.h:GDT_ENTRIES must match this */ /* This should be a multiple of the cache line size */ diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 97b9e46d1992..560ed944dc0e 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -899,12 +899,8 @@ static int __cpuinit time_cpu_notifier(struct notifier_block *nb, unsigned long action, void *hcpu) { unsigned cpu = (unsigned long) hcpu; - if (action == CPU_ONLINE && - cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) { - unsigned p; - p = smp_processor_id() | (cpu_to_node(smp_processor_id())<<12); - write_rdtscp_aux(p); - } + if (action == CPU_ONLINE) + vsyscall_set_cpu(cpu); return NOTIFY_DONE; } @@ -993,6 +989,11 @@ void time_init_gtod(void) if (unsynchronized_tsc()) notsc = 1; + if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) + vgetcpu_mode = VGETCPU_RDTSCP; + 
else + vgetcpu_mode = VGETCPU_LSL; + if (vxtime.hpet_address && notsc) { timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; if (hpet_use_timer) diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 7c4de31471d4..8d5a5149bb3a 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -99,6 +99,9 @@ SECTIONS .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) } vxtime = VVIRT(.vxtime); + .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } + vgetcpu_mode = VVIRT(.vgetcpu_mode); + .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) } wall_jiffies = VVIRT(.wall_jiffies); diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index f603037df162..902783bc4d53 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -26,6 +26,7 @@ #include <linux/seqlock.h> #include <linux/jiffies.h> #include <linux/sysctl.h> +#include <linux/getcpu.h> #include <asm/vsyscall.h> #include <asm/pgtable.h> @@ -33,11 +34,15 @@ #include <asm/fixmap.h> #include <asm/errno.h> #include <asm/io.h> +#include <asm/segment.h> +#include <asm/desc.h> +#include <asm/topology.h> #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) int __sysctl_vsyscall __section_sysctl_vsyscall = 1; seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; +int __vgetcpu_mode __section_vgetcpu_mode; #include <asm/unistd.h> @@ -127,9 +132,46 @@ time_t __vsyscall(1) vtime(time_t *t) return __xtime.tv_sec; } -long __vsyscall(2) venosys_0(void) +/* Fast way to get current CPU and node. + This helps to do per node and per CPU caches in user space. + The result is not guaranteed without CPU affinity, but usually + works out because the scheduler tries to keep a thread on the same + CPU. + + tcache must point to a two element sized long array. + All arguments can be NULL. */ +long __vsyscall(2) +vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) { - return -ENOSYS; + unsigned int dummy, p; + unsigned long j = 0; + + /* Fast cache - only recompute value once per jiffies and avoid + relatively costly rdtscp/cpuid otherwise. + This works because the scheduler usually keeps the process + on the same CPU and this syscall doesn't guarantee its + results anyways. + We do this here because otherwise user space would do it on + its own in a likely inferior way (no access to jiffies). + If you don't like it pass NULL. 
*/ + if (tcache && tcache->t0 == (j = __jiffies)) { + p = tcache->t1; + } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { + /* Load per CPU data from RDTSCP */ + rdtscp(dummy, dummy, p); + } else { + /* Load per CPU data from GDT */ + asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + } + if (tcache) { + tcache->t0 = j; + tcache->t1 = p; + } + if (cpu) + *cpu = p & 0xfff; + if (node) + *node = p >> 12; + return 0; } long __vsyscall(3) venosys_1(void) @@ -200,6 +242,43 @@ static ctl_table kernel_root_table2[] = { #endif +static void __cpuinit write_rdtscp_cb(void *info) +{ + write_rdtscp_aux((unsigned long)info); +} + +void __cpuinit vsyscall_set_cpu(int cpu) +{ + unsigned long *d; + unsigned long node = 0; +#ifdef CONFIG_NUMA + node = cpu_to_node[cpu]; +#endif + if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) { + void *info = (void *)((node << 12) | cpu); + /* Can happen on preemptive kernel */ + if (get_cpu() == cpu) + write_rdtscp_cb(info); +#ifdef CONFIG_SMP + else { + /* the notifier is unfortunately not executed on the + target CPU */ + smp_call_function_single(cpu,write_rdtscp_cb,info,0,1); + } +#endif + put_cpu(); + } + + /* Store cpu number in limit so that it can be loaded quickly + in user space in vgetcpu. + 12 bits for the CPU and 8 bits for the node. */ + d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU); + *d = 0x0f40000000000ULL; + *d |= cpu; + *d |= (node & 0xf) << 12; + *d |= (node >> 4) << 48; +} + static void __init map_vsyscall(void) { extern char __vsyscall_0; @@ -214,6 +293,7 @@ static int __init vsyscall_init(void) VSYSCALL_ADDR(__NR_vgettimeofday))); BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); + BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); map_vsyscall(); #ifdef CONFIG_SYSCTL register_sysctl_table(kernel_root_table2, 0); diff --git a/include/asm-x86_64/segment.h b/include/asm-x86_64/segment.h index d4bed33fb32c..334ddcdd8f92 100644 --- a/include/asm-x86_64/segment.h +++ b/include/asm-x86_64/segment.h @@ -20,15 +20,16 @@ #define __USER_CS 0x33 /* 6*8+3 */ #define __USER32_DS __USER_DS -#define GDT_ENTRY_TLS 1 #define GDT_ENTRY_TSS 8 /* needs two entries */ #define GDT_ENTRY_LDT 10 /* needs two entries */ #define GDT_ENTRY_TLS_MIN 12 #define GDT_ENTRY_TLS_MAX 14 -/* 15 free */ #define GDT_ENTRY_TLS_ENTRIES 3 +#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */ +#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3) + /* TLS indexes for 64bit - hardcoded in arch_prctl */ #define FS_TLS 0 #define GS_TLS 1 diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h index 6805e1feb300..d61547fd833b 100644 --- a/include/asm-x86_64/smp.h +++ b/include/asm-x86_64/smp.h @@ -133,13 +133,19 @@ static __inline int logical_smp_processor_id(void) /* we don't want to mark this access volatile - bad code generation */ return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); } -#endif #ifdef CONFIG_SMP #define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu] #else #define cpu_physical_id(cpu) boot_cpu_id -#endif - +static inline int smp_call_function_single(int cpuid, void (*func) (void *info), + void *info, int retry, int wait) +{ + /* Disable interrupts here? 
*/ func(info); return 0; } #endif /* !CONFIG_SMP */ #endif /* !__ASSEMBLY */ #endif diff --git a/include/asm-x86_64/vsyscall.h b/include/asm-x86_64/vsyscall.h index 146b24402a5f..2281e9399b96 100644 --- a/include/asm-x86_64/vsyscall.h +++ b/include/asm-x86_64/vsyscall.h @@ -4,6 +4,7 @@ enum vsyscall_num { __NR_vgettimeofday, __NR_vtime, + __NR_vgetcpu, }; #define VSYSCALL_START (-10UL << 20) @@ -15,6 +16,7 @@ enum vsyscall_num { #include <linux/seqlock.h> #define __section_vxtime __attribute__ ((unused, __section__ (".vxtime"), aligned(16))) +#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16))) #define __section_wall_jiffies __attribute__ ((unused, __section__ (".wall_jiffies"), aligned(16))) #define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16))) #define __section_sys_tz __attribute__ ((unused, __section__ (".sys_tz"), aligned(16))) @@ -26,6 +28,9 @@ enum vsyscall_num { #define VXTIME_HPET 2 #define VXTIME_PMTMR 3 +#define VGETCPU_RDTSCP 1 +#define VGETCPU_LSL 2 + struct vxtime_data { long hpet_address; /* HPET base address */ int last; @@ -40,6 +45,7 @@ struct vxtime_data { /* vsyscall space (readonly) */ extern struct vxtime_data __vxtime; +extern int __vgetcpu_mode; extern struct timespec __xtime; extern volatile unsigned long __jiffies; extern unsigned long __wall_jiffies; @@ -48,6 +54,7 @@ extern seqlock_t __xtime_lock; /* kernel space (writeable) */ extern struct vxtime_data vxtime; +extern int vgetcpu_mode; extern unsigned long wall_jiffies; extern struct timezone sys_tz; extern int sysctl_vsyscall; @@ -55,6 +62,8 @@ extern seqlock_t xtime_lock; extern int sysctl_vsyscall; +extern void vsyscall_set_cpu(int cpu); + #define ARCH_HAVE_XTIME_LOCK 1 #endif /* __KERNEL__ */ diff --git a/include/linux/getcpu.h b/include/linux/getcpu.h new file mode 100644 index 000000000000..031ed3780e45 --- /dev/null +++ b/include/linux/getcpu.h @@ -0,0 +1,16 @@ +#ifndef _LINUX_GETCPU_H +#define _LINUX_GETCPU_H 1 + +/* Cache for getcpu() to speed it up. Results might be upto a jiffie + out of date, but will be faster. + User programs should not refer to the contents of this structure. + It is only a cache for vgetcpu(). It might change in future kernels. + The user program must store this information per thread (__thread) + If you want 100% accurate information pass NULL instead. */ +struct getcpu_cache { + unsigned long t0; + unsigned long t1; + unsigned long res[4]; +}; + +#endif -- cgit v1.2.3 From 3cfc348bf90ffaa777c188652aa297f04eb94de8 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] x86: Add portable getcpu call For NUMA optimization and some other algorithms it is useful to have a fast way to get the current CPU and node numbers in user space. x86-64 added a fast way to do this in a vsyscall. This adds a generic syscall for other architectures to make it a generic portable facility. I expect some of them will also implement it as a faster vsyscall. The cache is an optimization for the x86-64 vsyscall implementation. Since what the syscall returns is an approximation anyway, and user space often wants very fast results, it can be cached for some time. The normal methods to get this information in user space are relatively slow. The vsyscall is in a better position to manage the cache because it has direct access to a fast time stamp (jiffies).
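[Illustrative aside, not part of the patch: a sketch of the intended user-space call pattern. glibc had no wrapper at this point, so the vsyscall address is spelled out by hand; 0xffffffffff600800 assumes the layout from asm-x86_64/vsyscall.h above (entry 2, 1024 bytes apart from VSYSCALL_START) and is illustrative only:]

#include <stdio.h>

struct getcpu_cache { unsigned long t0, t1, res[4]; };	/* mirrors linux/getcpu.h */

typedef long (*vgetcpu_t)(unsigned *cpu, unsigned *node,
			  struct getcpu_cache *tcache);

int main(void)
{
	vgetcpu_t vgetcpu = (vgetcpu_t)0xffffffffff600800UL;
	static __thread struct getcpu_cache cache;	/* per thread, as the header asks */
	unsigned cpu, node;

	vgetcpu(&cpu, &node, &cache);	/* pass NULL instead of &cache to skip the jiffies cache */
	printf("cpu %u, node %u\n", cpu, node);
	return 0;
}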
For the generic syscall the optimization doesn't help much, but we enforce a valid argument to keep programs portable. I only added an i386 syscall entry for now. Other architectures can follow as needed. AK: Also added some cleanups from Andrew Morton. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/i386/kernel/syscall_table.S | 1 + arch/x86_64/ia32/ia32entry.S | 1 + include/asm-i386/unistd.h | 3 ++- include/linux/syscalls.h | 2 ++ kernel/sys.c | 31 +++++++++++++++++++++++++++++++ 5 files changed, 37 insertions(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index dd63d4775398..7e639f78b0b9 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -317,3 +317,4 @@ ENTRY(sys_call_table) .long sys_tee /* 315 */ .long sys_vmsplice .long sys_move_pages + .long sys_getcpu diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 30ed0f6f4a2b..32fd32bea07c 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -713,4 +713,5 @@ ia32_sys_call_table: .quad sys_tee .quad compat_sys_vmsplice .quad compat_sys_move_pages + .quad sys_getcpu ia32_syscall_end: diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index fc1c8ddae149..565d0897b205 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -323,10 +323,11 @@ #define __NR_tee 315 #define __NR_vmsplice 316 #define __NR_move_pages 317 +#define __NR_getcpu 318 #ifdef __KERNEL__ -#define NR_syscalls 318 +#define NR_syscalls 319 /* * user-visible error numbers are in the range -1 - -128: see diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 008f04c56737..3f0f716225ec 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -53,6 +53,7 @@ struct mq_attr; struct compat_stat; struct compat_timeval; struct robust_list_head; +struct getcpu_cache; #include <linux/types.h> #include <linux/aio_abi.h> @@ -596,5 +597,6 @@ asmlinkage long sys_get_robust_list(int pid, size_t __user *len_ptr); asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, size_t len); +asmlinkage long sys_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *cache); #endif diff --git a/kernel/sys.c b/kernel/sys.c index e236f98f7ec5..3f894775488d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -28,6 +28,7 @@ #include <linux/tty.h> #include <linux/signal.h> #include <linux/cn_proc.h> +#include <linux/getcpu.h> #include <linux/compat.h> #include <linux/syscalls.h> @@ -2062,3 +2063,33 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, } return error; } + +asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, + struct getcpu_cache __user *cache) +{ + int err = 0; + int cpu = raw_smp_processor_id(); + if (cpup) + err |= put_user(cpu, cpup); + if (nodep) + err |= put_user(cpu_to_node(cpu), nodep); + if (cache) { + /* + * The cache is not needed for this implementation, + * but make sure user programs pass something + * valid. vsyscall implementations can instead make + * good use of the cache. Only use t0 and t1 because + * these are available in both 32bit and 64bit ABI (no + * need for a compat_getcpu). 32bit has enough + * padding + */ + unsigned long t0, t1; + get_user(t0, &cache->t0); + get_user(t1, &cache->t1); + t0++; + t1++; + put_user(t0, &cache->t0); + put_user(t1, &cache->t1); + } + return err ?
-EFAULT : 0; +} -- cgit v1.2.3 From 2f766d16062d0147edff91be15de4a950667ca42 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] Clean up asm/smp.h includes No need to include it from entry.S Drop all the #ifdef __ASSEMBLY__ Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/entry.S | 2 -- include/asm-x86_64/smp.h | 12 ------------ 2 files changed, 14 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index aa8d8939abc1..7eb1678e098d 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -27,10 +27,8 @@ * - schedule it carefully for the final hardware. */ -#define ASSEMBLY 1 #include <linux/linkage.h> #include <asm/segment.h> -#include <asm/smp.h> #include <asm/cache.h> #include <asm/errno.h> #include <asm/dwarf2.h> diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h index d61547fd833b..612d208961a2 100644 --- a/include/asm-x86_64/smp.h +++ b/include/asm-x86_64/smp.h @@ -4,15 +4,12 @@ /* * We need the APIC definitions automatically as part of 'smp.h' */ -#ifndef __ASSEMBLY__ #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/bitops.h> extern int disable_apic; -#endif #ifdef CONFIG_X86_LOCAL_APIC -#ifndef __ASSEMBLY__ #include <asm/fixmap.h> #include <asm/mpspec.h> #ifdef CONFIG_X86_IO_APIC @@ -21,10 +18,8 @@ extern int disable_apic; #include <asm/apic.h> #include <asm/thread_info.h> #endif -#endif #ifdef CONFIG_SMP -#ifndef ASSEMBLY #include <asm/pda.h> @@ -83,13 +78,10 @@ extern void prefill_possible_map(void); extern unsigned num_processors; extern unsigned disabled_cpus; -#endif /* !ASSEMBLY */ - #define NO_PROC_ID 0xFF /* No processor magic marker */ #endif -#ifndef ASSEMBLY /* * Some lowlevel functions might want to know about * the real APIC ID <-> CPU # mapping. @@ -111,8 +103,6 @@ static inline int cpu_present_to_apicid(int mps_cpu) return BAD_APICID; } -#endif /* !ASSEMBLY */ - #ifndef CONFIG_SMP #define stack_smp_processor_id() 0 #define safe_smp_processor_id() 0 @@ -127,7 +117,6 @@ static inline int cpu_present_to_apicid(int mps_cpu) }) #endif -#ifndef __ASSEMBLY__ static __inline int logical_smp_processor_id(void) { /* we don't want to mark this access volatile - bad code generation */ @@ -146,6 +135,5 @@ static inline int smp_call_function_single(int cpuid, void (*func) (void *info), return 0; } #endif /* !CONFIG_SMP */ -#endif /* !__ASSEMBLY */ #endif -- cgit v1.2.3 From d3a4f48d4866b8623ca9adde8ce4e5fde979c132 Mon Sep 17 00:00:00 2001 From: Stephane Eranian <eranian@hpl.hp.com> Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] x86-64 TIF flags for debug regs and io bitmap in ctxsw Hello, Following my discussion with Andi. Here is a patch that introduces two new TIF flags to simplify the context switch code in __switch_to(). The idea is to minimize the number of cache lines accessed in the common case, i.e., when neither the debug registers nor the I/O bitmap are used. This patch covers the x86-64 modifications. A patch for i386 follows. 
Changelog: - add TIF_DEBUG to track when debug registers are active - add TIF_IO_BITMAP to track when I/O bitmap is used - modify __switch_to() to use the new TIF flags <signed-off-by>: eranian@hpl.hp.com Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/ia32/ptrace32.c | 4 +++ arch/x86_64/kernel/ioport.c | 1 + arch/x86_64/kernel/process.c | 73 +++++++++++++++++++++++----------------- arch/x86_64/kernel/ptrace.c | 8 +++-- include/asm-x86_64/thread_info.h | 7 ++++ 5 files changed, 60 insertions(+), 33 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c index 659c0722f6b8..72bf92a9d375 100644 --- a/arch/x86_64/ia32/ptrace32.c +++ b/arch/x86_64/ia32/ptrace32.c @@ -117,6 +117,10 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 val) if ((0x5454 >> ((val >> (16 + 4*i)) & 0xf)) & 1) return -EIO; child->thread.debugreg7 = val; + if (val) + set_tsk_thread_flag(child, TIF_DEBUG); + else + clear_tsk_thread_flag(child, TIF_DEBUG); break; default: diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c index b81614970ecc..fe063d3cfe42 100644 --- a/arch/x86_64/kernel/ioport.c +++ b/arch/x86_64/kernel/ioport.c @@ -56,6 +56,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) memset(bitmap, 0xff, IO_BITMAP_BYTES); t->io_bitmap_ptr = bitmap; + set_thread_flag(TIF_IO_BITMAP); } /* diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index bb6745d13b8f..6e0527635b4c 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -350,6 +350,7 @@ void exit_thread(void) kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; + clear_thread_flag(TIF_IO_BITMAP); /* * Careful, clear this in the TSS too: */ @@ -369,6 +370,7 @@ void flush_thread(void) if (t->flags & _TIF_IA32) current_thread_info()->status |= TS_COMPAT; } + t->flags &= ~_TIF_DEBUG; tsk->thread.debugreg0 = 0; tsk->thread.debugreg1 = 0; @@ -461,7 +463,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, asm("mov %%es,%0" : "=m" (p->thread.es)); asm("mov %%ds,%0" : "=m" (p->thread.ds)); - if (unlikely(me->thread.io_bitmap_ptr != NULL)) { + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { p->thread.io_bitmap_max = 0; @@ -469,6 +471,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, } memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES); + set_tsk_thread_flag(p, TIF_IO_BITMAP); } /* @@ -498,6 +501,40 @@ out: */ #define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) +static inline void __switch_to_xtra(struct task_struct *prev_p, + struct task_struct *next_p, + struct tss_struct *tss) +{ + struct thread_struct *prev, *next; + + prev = &prev_p->thread, + next = &next_p->thread; + + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { + loaddebug(next, 0); + loaddebug(next, 1); + loaddebug(next, 2); + loaddebug(next, 3); + /* no 4 and 5 */ + loaddebug(next, 6); + loaddebug(next, 7); + } + + if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { + /* + * Copy the relevant range of the IO bitmap. 
+ * Normally this is 128 bytes or less: + */ + memcpy(tss->io_bitmap, next->io_bitmap_ptr, + max(prev->io_bitmap_max, next->io_bitmap_max)); + } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { + /* + * Clear any possible leftover bits: + */ + memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); + } +} + /* * switch_to(x,y) should switch tasks from x to y. * @@ -586,37 +623,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); /* - * Now maybe reload the debug registers + * Now maybe reload the debug registers and handle I/O bitmaps */ - if (unlikely(next->debugreg7)) { - loaddebug(next, 0); - loaddebug(next, 1); - loaddebug(next, 2); - loaddebug(next, 3); - /* no 4 and 5 */ - loaddebug(next, 6); - loaddebug(next, 7); - } - - - /* - * Handle the IO bitmap - */ - if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { - if (next->io_bitmap_ptr) - /* - * Copy the relevant range of the IO bitmap. - * Normally this is 128 bytes or less: - */ - memcpy(tss->io_bitmap, next->io_bitmap_ptr, - max(prev->io_bitmap_max, next->io_bitmap_max)); - else { - /* - * Clear any possible leftover bits: - */ - memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); - } - } + if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) + || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) + __switch_to_xtra(prev_p, next_p, tss); return prev_p; } diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index 2d50024c9f30..d35ec1bc696a 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -420,9 +420,13 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1) break; if (i == 4) { - child->thread.debugreg7 = data; + child->thread.debugreg7 = data; + if (data) + set_tsk_thread_flag(child, TIF_DEBUG); + else + clear_tsk_thread_flag(child, TIF_DEBUG); ret = 0; - } + } break; } break; diff --git a/include/asm-x86_64/thread_info.h b/include/asm-x86_64/thread_info.h index 790c512a4369..787a08114b48 100644 --- a/include/asm-x86_64/thread_info.h +++ b/include/asm-x86_64/thread_info.h @@ -120,6 +120,8 @@ static inline struct thread_info *stack_thread_info(void) #define TIF_FORK 18 /* ret_from_fork */ #define TIF_ABI_PENDING 19 #define TIF_MEMDIE 20 +#define TIF_DEBUG 21 /* uses debug registers */ +#define TIF_IO_BITMAP 22 /* uses I/O bitmap */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) @@ -133,6 +135,8 @@ static inline struct thread_info *stack_thread_info(void) #define _TIF_IA32 (1<<TIF_IA32) #define _TIF_FORK (1<<TIF_FORK) #define _TIF_ABI_PENDING (1<<TIF_ABI_PENDING) +#define _TIF_DEBUG (1<<TIF_DEBUG) +#define _TIF_IO_BITMAP (1<<TIF_IO_BITMAP) /* work to do on interrupt/exception return */ #define _TIF_WORK_MASK \ @@ -140,6 +144,9 @@ static inline struct thread_info *stack_thread_info(void) /* work to do on any return to user space */ #define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP) +/* flags to check in __switch_to() */ +#define _TIF_WORK_CTXSW (_TIF_DEBUG|_TIF_IO_BITMAP) + #define PREEMPT_ACTIVE 0x10000000 /* -- cgit v1.2.3 From d5d9ca6d882f7c8d47ef91a701fc042cbebbc334 Mon Sep 17 00:00:00 2001 From: Adam Henley <adamazing@gmail.com> Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] A few trivial spelling and grammar fixes A few trivial spelling and grammar mistakes picked up in "arch/x86_64/aperture.c", "arch/x86_64/crash.c" and "arch/x86_64/apic.c". 
I think all are correct fixes but am ever aware of my fallibility :o) This is my first patch submission so all feedback is appreciated, esp. WRT CCing to Linus, Andi and trivial@kernel.org, is this correct? And which is the most appropriate kernel version to diff against? If any. Should apply cleanly to 2.6.18-rc1 Signed-off-by: Adam Henley <adamazing@gmail.com> Signed-off-by: Andi Kleen <ak@suse.de> - adam --- arch/x86_64/kernel/aperture.c | 2 +- arch/x86_64/kernel/apic.c | 4 ++-- arch/x86_64/kernel/crash.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index 58af8e73738b..04dbf1662523 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -48,7 +48,7 @@ static u32 __init allocate_aperture(void) /* * Aperture has to be naturally aligned. This means an 2GB aperture won't - * have much chances to find a place in the lower 4GB of memory. + * have much chance of finding a place in the lower 4GB of memory. * Unfortunately we cannot move it up because that would make the * IOMMU useless. */ diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 692e9b579743..63d9b037afc6 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -400,7 +400,7 @@ void __cpuinit setup_local_APIC (void) value |= APIC_SPIV_APIC_ENABLED; /* - * Some unknown Intel IO/APIC (or APIC) errata is biting us with + * Some unknown Intel IO/APIC (or APIC) errata are biting us with * certain networking cards. If high frequency interrupts are * happening on a particular IOAPIC pin, plus the IOAPIC routing * entry is masked/unmasked at a high rate as well then sooner or @@ -950,7 +950,7 @@ void smp_local_timer_interrupt(struct pt_regs *regs) * We take the 'long' return path, and there every subsystem * grabs the appropriate locks (kernel lock/ irq lock). * - * we might want to decouple profiling from the 'long path', + * We might want to decouple profiling from the 'long path', * and do the profiling totally in assembly. * * Currently this isn't too much of an issue (performance wise), diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c index fc57a139123e..7d7a9e75f70c 100644 --- a/arch/x86_64/kernel/crash.c +++ b/arch/x86_64/kernel/crash.c @@ -69,7 +69,7 @@ static void crash_save_this_cpu(struct pt_regs *regs, int cpu) * for the data I pass, and I need tags * on the data to indicate what information I have * squirrelled away. ELF notes happen to provide - * all of that that no need to invent something new. + * all of that, no need to invent something new. */ buf = (u32*)per_cpu_ptr(crash_notes, cpu); -- cgit v1.2.3 From c16b63e09d9d03158e0a92e961234e94c4862620 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] i386/x86-64: Don't randomize stack top when no randomization personality is set Based on patch from Frank van Maarseveen <frankvm@frankvm.com>, but extended. 
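[Illustrative aside, not part of the patch: the ADDR_NO_RANDOMIZE personality bit tested in the hunks below is set from user space with personality(2). A small sketch, assuming <sys/personality.h> exposes the flag (kernel value 0x0040000):]

#include <sys/personality.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	unsigned long cur = personality(0xffffffff);	/* query current personality */

	if (!(cur & ADDR_NO_RANDOMIZE)) {
		personality(cur | ADDR_NO_RANDOMIZE);
		execv("/proc/self/exe", argv);	/* re-exec with stack randomization off */
	}
	printf("stack top is no longer randomized\n");
	return 0;
}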
Signed-off-by: Andi Kleen <ak@suse.de> --- arch/i386/kernel/process.c | 3 ++- arch/x86_64/kernel/process.c | 2 +- fs/binfmt_elf.c | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 8657c739656a..b741c3e1a5eb 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -37,6 +37,7 @@ #include <linux/kallsyms.h> #include <linux/ptrace.h> #include <linux/random.h> +#include <linux/personality.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -905,7 +906,7 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) unsigned long arch_align_stack(unsigned long sp) { - if (randomize_va_space) + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) sp -= get_random_int() % 8192; return sp & ~0xf; } diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 6e0527635b4c..6fbd19564e4e 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -845,7 +845,7 @@ int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) unsigned long arch_align_stack(unsigned long sp) { - if (randomize_va_space) + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) sp -= get_random_int() % 8192; return sp & ~0xf; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 672a3b90bc55..5109dbff93bf 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -515,7 +515,8 @@ static unsigned long randomize_stack_top(unsigned long stack_top) { unsigned int random_variable = 0; - if (current->flags & PF_RANDOMIZE) { + if ((current->flags & PF_RANDOMIZE) && + !(current->personality & ADDR_NO_RANDOMIZE)) { random_variable = get_random_int() & STACK_RND_MASK; random_variable <<= PAGE_SHIFT; } -- cgit v1.2.3 From 31679f38d88696ed032d59e457f1605c97e7d719 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] Simplify profile_pc on x86-64 Use knowledge about the EFLAGS layout (bits 22:63 are always 0) to distinguish the EFLAGS word from a kernel address in the spin lock stack frame. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/time.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 560ed944dc0e..ea00915d393a 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -189,20 +189,15 @@ unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); - /* Assume the lock function has either no stack frame or only a single word. This checks if the address on the stack looks like a kernel text address. There is a small window for false hits, but in that case the tick is just accounted to the spinlock function. Better would be to write these functions in assembler again and check exactly. */ + /* Assume the lock function has either no stack frame or a copy of eflags from PUSHF + Eflags always has bits 22 and up cleared unlike kernel addresses.
*/ if (!user_mode(regs) && in_lock_functions(pc)) { - char *v = *(char **)regs->rsp; - if ((v >= _stext && v <= _etext) || - (v >= _sinittext && v <= _einittext) || - (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END)) - return (unsigned long)v; - return ((unsigned long *)regs->rsp)[1]; + unsigned long *sp = (unsigned long *)regs->rsp; + if (sp[0] >> 22) + return sp[0]; + if (sp[1] >> 22) + return sp[1]; } return pc; } -- cgit v1.2.3 From 9a0b26e6bc4ae1979d9bcc6194e57a71b2b5cac6 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:28 +0200 Subject: [PATCH] Clean up read write lock assembly - Move the slow path fallbacks to their own assembly files This makes them much easier to read and is needed for the next change. - Add CFI annotations for unwinding (XXX need review) - Remove constant case which can never happen with out of line spinlocks - Use patchable LOCK prefixes - Don't use lock sections anymore for inline code because they can't be expressed by the unwinder (this adds one taken jump to the lock fast path) Cc: jbeulich@novell.com Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/lib/Makefile | 2 +- arch/x86_64/lib/rwlock.S | 38 ++++++++++++++++++++++++ arch/x86_64/lib/thunk.S | 30 ------------------- include/asm-x86_64/rwlock.h | 68 +++++++------------------------------------ include/asm-x86_64/spinlock.h | 11 ++----- 5 files changed, 51 insertions(+), 98 deletions(-) create mode 100644 arch/x86_64/lib/rwlock.S (limited to 'arch/x86_64') diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile index ccef6ae747a3..b78d4170fce2 100644 --- a/arch/x86_64/lib/Makefile +++ b/arch/x86_64/lib/Makefile @@ -9,4 +9,4 @@ obj-y := io.o iomap_copy.o lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \ usercopy.o getuser.o putuser.o \ thunk.o clear_page.o copy_page.o bitstr.o bitops.o -lib-y += memcpy.o memmove.o memset.o copy_user.o +lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o diff --git a/arch/x86_64/lib/rwlock.S b/arch/x86_64/lib/rwlock.S new file mode 100644 index 000000000000..0cde1f807314 --- /dev/null +++ b/arch/x86_64/lib/rwlock.S @@ -0,0 +1,38 @@ +/* Slow paths of read/write spinlocks. */ + +#include <linux/linkage.h> +#include <asm/rwlock.h> +#include <asm/alternative-asm.i> +#include <asm/dwarf2.h> + +/* rdi: pointer to rwlock_t */ +ENTRY(__write_lock_failed) + CFI_STARTPROC + LOCK_PREFIX + addl $RW_LOCK_BIAS,(%rdi) +1: rep + nop + cmpl $RW_LOCK_BIAS,(%rdi) + jne 1b + LOCK_PREFIX + subl $RW_LOCK_BIAS,(%rdi) + jnz __write_lock_failed + ret + CFI_ENDPROC +END(__write_lock_failed) + +/* rdi: pointer to rwlock_t */ +ENTRY(__read_lock_failed) + CFI_STARTPROC + LOCK_PREFIX + incl (%rdi) +1: rep + nop + cmpl $1,(%rdi) + js 1b + LOCK_PREFIX + decl (%rdi) + js __read_lock_failed + ret + CFI_ENDPROC +END(__read_lock_failed) diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S index 332ea5dff916..6cff27c775ae 100644 --- a/arch/x86_64/lib/thunk.S +++ b/arch/x86_64/lib/thunk.S @@ -67,33 +67,3 @@ restore_norax: RESTORE_ARGS 1 ret CFI_ENDPROC - -#ifdef CONFIG_SMP -/* Support for read/write spinlocks. 
*/ - .text -/* rax: pointer to rwlock_t */ -ENTRY(__write_lock_failed) - lock - addl $RW_LOCK_BIAS,(%rax) -1: rep - nop - cmpl $RW_LOCK_BIAS,(%rax) - jne 1b - lock - subl $RW_LOCK_BIAS,(%rax) - jnz __write_lock_failed - ret - -/* rax: pointer to rwlock_t */ -ENTRY(__read_lock_failed) - lock - incl (%rax) -1: rep - nop - cmpl $1,(%rax) - js 1b - lock - decl (%rax) - js __read_lock_failed - ret -#endif diff --git a/include/asm-x86_64/rwlock.h b/include/asm-x86_64/rwlock.h index dea0e9459264..28a080d23119 100644 --- a/include/asm-x86_64/rwlock.h +++ b/include/asm-x86_64/rwlock.h @@ -18,69 +18,21 @@ #ifndef _ASM_X86_64_RWLOCK_H #define _ASM_X86_64_RWLOCK_H -#include <linux/stringify.h> - #define RW_LOCK_BIAS 0x01000000 -#define RW_LOCK_BIAS_STR "0x01000000" +#define RW_LOCK_BIAS_STR "0x01000000" -#define __build_read_lock_ptr(rw, helper) \ +#define __build_read_lock(rw) \ asm volatile(LOCK_PREFIX "subl $1,(%0)\n\t" \ - "js 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tcall " helper "\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - ::"a" (rw) : "memory") - -#define __build_read_lock_const(rw, helper) \ - asm volatile(LOCK_PREFIX "subl $1,%0\n\t" \ - "js 2f\n" \ + "jns 1f\n" \ + "call __read_lock_failed\n" \ "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tpushq %%rax\n\t" \ - "leaq %0,%%rax\n\t" \ - "call " helper "\n\t" \ - "popq %%rax\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - :"=m" (*((volatile int *)rw))::"memory") - -#define __build_read_lock(rw, helper) do { \ - if (__builtin_constant_p(rw)) \ - __build_read_lock_const(rw, helper); \ - else \ - __build_read_lock_ptr(rw, helper); \ - } while (0) + ::"D" (rw), "i" (RW_LOCK_BIAS) : "memory") -#define __build_write_lock_ptr(rw, helper) \ - asm volatile(LOCK_PREFIX "subl $" RW_LOCK_BIAS_STR ",(%0)\n\t" \ - "jnz 2f\n" \ +#define __build_write_lock(rw) \ + asm volatile(LOCK_PREFIX "subl %1,(%0)\n\t" \ + "jz 1f\n" \ + "\tcall __write_lock_failed\n\t" \ "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tcall " helper "\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - ::"a" (rw) : "memory") - -#define __build_write_lock_const(rw, helper) \ - asm volatile(LOCK_PREFIX "subl $" RW_LOCK_BIAS_STR ",%0\n\t" \ - "jnz 2f\n" \ - "1:\n" \ - LOCK_SECTION_START("") \ - "2:\tpushq %%rax\n\t" \ - "leaq %0,%%rax\n\t" \ - "call " helper "\n\t" \ - "popq %%rax\n\t" \ - "jmp 1b\n" \ - LOCK_SECTION_END \ - :"=m" (*((volatile long *)rw))::"memory") - -#define __build_write_lock(rw, helper) do { \ - if (__builtin_constant_p(rw)) \ - __build_write_lock_const(rw, helper); \ - else \ - __build_write_lock_ptr(rw, helper); \ - } while (0) + ::"D" (rw), "i" (RW_LOCK_BIAS) : "memory") #endif diff --git a/include/asm-x86_64/spinlock.h b/include/asm-x86_64/spinlock.h index 248a79f0eaff..a8e3d89f591f 100644 --- a/include/asm-x86_64/spinlock.h +++ b/include/asm-x86_64/spinlock.h @@ -79,13 +79,6 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock) * * On x86, we implement read-write locks as a 32-bit counter * with the high bit (sign) being the "contended" bit. - * - * The inline assembly is non-obvious. Think about it. - * - * Changed to use the same technique as rw semaphores. See - * semaphore.h for details. 
-ben - * - * the helpers are in arch/i386/kernel/semaphore.c */ #define __raw_read_can_lock(x) ((int)(x)->lock > 0) @@ -93,12 +86,12 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock) static inline void __raw_read_lock(raw_rwlock_t *rw) { - __build_read_lock(rw, "__read_lock_failed"); + __build_read_lock(rw); } static inline void __raw_write_lock(raw_rwlock_t *rw) { - __build_write_lock(rw, "__write_lock_failed"); + __build_write_lock(rw); } static inline int __raw_read_trylock(raw_rwlock_t *lock) -- cgit v1.2.3 From b06babac45e1546dfb504f1f25eb0495632bfc41 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Add proper alignment to ENTRY Previously it didn't align. Use the same one as the C compiler in blended mode, which is good for K8 and Core2 and doesn't hurt on P4. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/entry.S | 3 +-- include/asm-x86_64/linkage.h | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 7eb1678e098d..2dc5c01f754d 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -617,8 +617,7 @@ retint_signal: #ifdef CONFIG_PREEMPT /* Returning to kernel space. Check if we need preemption */ /* rcx: threadinfo. interrupts off. */ - .p2align -retint_kernel: +ENTRY(retint_kernel) cmpl $0,threadinfo_preempt_count(%rcx) jnz retint_restore_args bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) diff --git a/include/asm-x86_64/linkage.h b/include/asm-x86_64/linkage.h index 291c2d01c44f..b5f39d0189ce 100644 --- a/include/asm-x86_64/linkage.h +++ b/include/asm-x86_64/linkage.h @@ -1,6 +1,6 @@ #ifndef __ASM_LINKAGE_H #define __ASM_LINKAGE_H -/* Nothing to see here... */ +#define __ALIGN .p2align 4,,15 #endif -- cgit v1.2.3 From cc1e684a9f1cc069b2ac249aa11486613369dd69 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Remove leftover CVS Id in thunk.S And move the comment to a proper place. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/lib/thunk.S | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S index 6cff27c775ae..0025535cac8d 100644 --- a/arch/x86_64/lib/thunk.S +++ b/arch/x86_64/lib/thunk.S @@ -1,10 +1,9 @@ - /* - * Save registers before calling assembly functions. This avoids - * disturbance of register allocation in some inline assembly constructs. - * Copyright 2001,2002 by Andi Kleen, SuSE Labs. - * Subject to the GNU public license, v.2. No warranty of any kind. - * $Id: thunk.S,v 1.2 2002/03/13 20:06:58 ak Exp $ - */ +/* + * Save registers before calling assembly functions. This avoids + * disturbance of register allocation in some inline assembly constructs. + * Copyright 2001,2002 by Andi Kleen, SuSE Labs. + * Subject to the GNU public license, v.2. No warranty of any kind. 
+ */ #include <linux/config.h> #include <linux/linkage.h> -- cgit v1.2.3 From 5ba5891d44a6acade44887a0f3195489d46c12de Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Add some comments about what tce.c actually does Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/tce.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c index 5530dda3f27a..0905ce6b9b32 100644 --- a/arch/x86_64/kernel/tce.c +++ b/arch/x86_64/kernel/tce.c @@ -1,4 +1,6 @@ /* + * This file manages the translation entries for the IBM Calgary IOMMU. + * * Derived from arch/powerpc/platforms/pseries/iommu.c * * Copyright (C) IBM Corporation, 2006 -- cgit v1.2.3 From 7f11d8a5efd625ffa41cde1d8472f85e885478ec Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Remove all ifdefs for local/io apic The IO-APIC or local APIC can only be disabled at runtime anyway, and Kconfig has forced these options on for a long time now. The Kconfig options are now kept only for the benefit of the shared acpi boot.c code. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/Makefile | 4 ++-- arch/x86_64/kernel/apic.c | 6 +----- arch/x86_64/kernel/crash.c | 2 -- arch/x86_64/kernel/entry.S | 2 -- arch/x86_64/kernel/head64.c | 2 -- arch/x86_64/kernel/i8259.c | 9 --------- arch/x86_64/kernel/irq.c | 6 ------ arch/x86_64/kernel/mpparse.c | 9 --------- arch/x86_64/kernel/setup.c | 6 ------ arch/x86_64/kernel/smpboot.c | 5 ----- arch/x86_64/kernel/time.c | 8 -------- arch/x86_64/kernel/traps.c | 2 -- include/asm-x86_64/apic.h | 4 ---- include/asm-x86_64/fixmap.h | 4 ---- include/asm-x86_64/io_apic.h | 6 ------ include/asm-x86_64/irq.h | 2 -- include/asm-x86_64/mpspec.h | 2 -- include/asm-x86_64/smp.h | 4 ---- 18 files changed, 3 insertions(+), 80 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index b5aaeafc1cd3..0ef39553c0f6 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -20,8 +20,8 @@ obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o -obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ +obj-y += apic.o nmi.o +obj-y += io_apic.o mpparse.o \ genapic.o genapic_cluster.o genapic_flat.o obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 63d9b037afc6..c6ca100a7dea 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -661,7 +661,6 @@ void __init init_apic_mappings(void) */ boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); -#ifdef CONFIG_X86_IO_APIC { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; int i; @@ -679,7 +678,6 @@ void __init init_apic_mappings(void) idx++; } } -#endif } /* @@ -1129,12 +1127,10 @@ int __init APIC_init_uniprocessor (void) setup_local_APIC(); -#ifdef CONFIG_X86_IO_APIC if (smp_found_config && !skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); + setup_IO_APIC(); else nr_ioapics = 0; -#endif setup_boot_APIC_clock(); check_nmi_watchdog(); return 0; } diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c index 7d7a9e75f70c..3525f884af82 100644 --- a/arch/x86_64/kernel/crash.c +++ b/arch/x86_64/kernel/crash.c @@ -194,9 +194,7 @@ void
machine_crash_shutdown(struct pt_regs *regs) if(cpu_has_apic) disable_local_APIC(); -#if defined(CONFIG_X86_IO_APIC) disable_IO_APIC(); -#endif crash_save_self(regs); } diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 2dc5c01f754d..a0cf36ba7c56 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -676,7 +676,6 @@ ENTRY(call_function_interrupt) END(call_function_interrupt) #endif -#ifdef CONFIG_X86_LOCAL_APIC ENTRY(apic_timer_interrupt) apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt END(apic_timer_interrupt) @@ -688,7 +687,6 @@ END(error_interrupt) ENTRY(spurious_interrupt) apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt END(spurious_interrupt) -#endif /* * Exception entry points. diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index 36647ce6aecb..bacbd75c63ad 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -111,10 +111,8 @@ void __init x86_64_start_kernel(char * real_mode_data) if (s != NULL) numa_setup(s+5); #endif -#ifdef CONFIG_X86_IO_APIC if (strstr(saved_command_line, "disableapic")) disable_apic = 1; -#endif /* You need early console to see that */ if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE) panic("Kernel too big for kernel mapping\n"); diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index 0434b1f8e3dd..d2f5acde5749 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -55,7 +55,6 @@ */ BUILD_16_IRQS(0x0) -#ifdef CONFIG_X86_LOCAL_APIC /* * The IO-APIC gives us many more interrupt sources. Most of these * are unused but an SMP system is supposed to have enough memory ... @@ -75,8 +74,6 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_15_IRQS(0xe) #endif -#endif - #undef BUILD_16_IRQS #undef BUILD_15_IRQS #undef BI @@ -100,7 +97,6 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) void (*interrupt[NR_IRQS])(void) = { IRQLIST_16(0x0), -#ifdef CONFIG_X86_IO_APIC IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3), IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), @@ -110,7 +106,6 @@ void (*interrupt[NR_IRQS])(void) = { , IRQLIST_15(0xe) #endif -#endif }; #undef IRQ @@ -453,9 +448,7 @@ void __init init_ISA_irqs (void) { int i; -#ifdef CONFIG_X86_LOCAL_APIC init_bsp_APIC(); -#endif init_8259A(0); for (i = 0; i < NR_IRQS; i++) { @@ -581,14 +574,12 @@ void __init init_IRQ(void) set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); -#ifdef CONFIG_X86_LOCAL_APIC /* self generated IPI for local APIC timer */ set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); /* IPI vectors for APIC spurious and error interrupts */ set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); -#endif /* * Set the clock to HZ Hz, we already have a valid diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index 5221a53e90c1..bfbcf92cfa4a 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -20,11 +20,9 @@ #include <asm/idle.h> atomic_t irq_err_count; -#ifdef CONFIG_X86_IO_APIC #ifdef APIC_MISMATCH_DEBUG atomic_t irq_mis_count; #endif -#endif #ifdef CONFIG_DEBUG_STACKOVERFLOW /* @@ -92,17 +90,13 @@ skip: for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); seq_putc(p, '\n'); -#ifdef CONFIG_X86_LOCAL_APIC seq_printf(p, "LOC: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); seq_putc(p, '\n'); 
-#endif seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); -#ifdef CONFIG_X86_IO_APIC #ifdef APIC_MISMATCH_DEBUG seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); -#endif #endif } return 0; diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index a1ab4197f8a1..c73dd1f41e65 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -74,14 +74,10 @@ physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; /* ACPI MADT entry parsing functions */ #ifdef CONFIG_ACPI extern struct acpi_boot_flags acpi_boot; -#ifdef CONFIG_X86_LOCAL_APIC extern int acpi_parse_lapic (acpi_table_entry_header *header); extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header); extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header); -#endif /*CONFIG_X86_LOCAL_APIC*/ -#ifdef CONFIG_X86_IO_APIC extern int acpi_parse_ioapic (acpi_table_entry_header *header); -#endif /*CONFIG_X86_IO_APIC*/ #endif /*CONFIG_ACPI*/ u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; @@ -661,9 +657,7 @@ void __init find_intel_smp (void) */ void __init find_smp_config (void) { -#ifdef CONFIG_X86_LOCAL_APIC find_intel_smp(); -#endif } @@ -717,8 +711,6 @@ void __cpuinit mp_register_lapic ( MP_processor_info(&processor); } -#ifdef CONFIG_X86_IO_APIC - #define MP_ISA_BUS 0 #define MP_MAX_IOAPIC_PIN 127 @@ -997,5 +989,4 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) return gsi; } -#endif /*CONFIG_X86_IO_APIC*/ #endif /*CONFIG_ACPI*/ diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 34afad704824..fbe9f7faa2db 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -340,10 +340,8 @@ static __init void parse_cmdline_early (char ** cmdline_p) else if (fullarg(from, "acpi=strict")) { acpi_strict = 1; } -#ifdef CONFIG_X86_IO_APIC else if (fullarg(from, "acpi_skip_timer_override")) acpi_skip_timer_override = 1; -#endif #endif if (fullarg(from, "disable_timer_pin_1")) @@ -625,12 +623,10 @@ void __init setup_arch(char **cmdline_p) */ acpi_reserve_bootmem(); #endif -#ifdef CONFIG_X86_LOCAL_APIC /* * Find and reserve possible boot-time SMP configuration: */ find_smp_config(); -#endif #ifdef CONFIG_BLK_DEV_INITRD if (LOADER_TYPE && INITRD_START) { if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { @@ -674,14 +670,12 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); -#ifdef CONFIG_X86_LOCAL_APIC /* * get boot-time SMP configuration: */ if (smp_found_config) get_smp_config(); init_apic_mappings(); -#endif /* * Request address space for all standard RAM and ROM resources diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 582896f7d420..d29571e24953 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -1175,13 +1175,8 @@ int __cpuinit __cpu_up(unsigned int cpu) void __init smp_cpus_done(unsigned int max_cpus) { smp_cleanup_boot(); - -#ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); -#endif - check_nmi_watchdog(); - time_init_gtod(); } diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index ea00915d393a..d66c7f750e75 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -41,9 +41,7 @@ #include <asm/sections.h> #include <linux/cpufreq.h> #include <linux/hpet.h> -#ifdef CONFIG_X86_LOCAL_APIC #include <asm/apic.h> -#endif #ifdef CONFIG_CPU_FREQ static void cpufreq_delayed_get(void); @@ -438,12 +436,8 @@ void main_timer_handler(struct pt_regs *regs) * have to call the local interrupt handler. 
*/ -#ifndef CONFIG_X86_LOCAL_APIC - profile_tick(CPU_PROFILING, regs); -#else if (!using_apic_timer) smp_local_timer_interrupt(regs); -#endif /* * If we have an externally synchronized Linux clock, then update CMOS clock @@ -467,10 +461,8 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) if (apic_runs_main_timer > 1) return IRQ_HANDLED; main_timer_handler(regs); -#ifdef CONFIG_X86_LOCAL_APIC if (using_apic_timer) smp_send_timer_broadcast_ipi(); -#endif return IRQ_HANDLED; } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 96f62a033242..56d7ff0c894c 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -788,7 +788,6 @@ asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs) if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) == NOTIFY_STOP) return; -#ifdef CONFIG_X86_LOCAL_APIC /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. @@ -796,7 +795,6 @@ asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs) if (nmi_watchdog_tick(regs,reason)) return; if (!do_nmi_callback(regs,cpu)) -#endif unknown_nmi_error(reason, regs); return; diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h index 9c96a0a8d1bd..8ed0f4d67b8d 100644 --- a/include/asm-x86_64/apic.h +++ b/include/asm-x86_64/apic.h @@ -29,8 +29,6 @@ extern int apic_runs_main_timer; printk(s, ##a); \ } while (0) -#ifdef CONFIG_X86_LOCAL_APIC - struct pt_regs; /* @@ -104,8 +102,6 @@ void switch_ipi_to_APIC_timer(void *cpumask); #define ARCH_APICTIMER_STOPS_ON_C3 1 -#endif /* CONFIG_X86_LOCAL_APIC */ - extern unsigned boot_cpu_id; #endif /* __ASM_APIC_H */ diff --git a/include/asm-x86_64/fixmap.h b/include/asm-x86_64/fixmap.h index 0b4ffbd1a125..1b620db5b9e3 100644 --- a/include/asm-x86_64/fixmap.h +++ b/include/asm-x86_64/fixmap.h @@ -37,13 +37,9 @@ enum fixed_addresses { VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, VSYSCALL_HPET, FIX_HPET_BASE, -#ifdef CONFIG_X86_LOCAL_APIC FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ -#endif -#ifdef CONFIG_X86_IO_APIC FIX_IO_APIC_BASE_0, FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1, -#endif __end_of_fixed_addresses }; diff --git a/include/asm-x86_64/io_apic.h b/include/asm-x86_64/io_apic.h index fb7a0909a174..5d1b5c68e36e 100644 --- a/include/asm-x86_64/io_apic.h +++ b/include/asm-x86_64/io_apic.h @@ -10,8 +10,6 @@ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar */ -#ifdef CONFIG_X86_IO_APIC - #ifdef CONFIG_PCI_MSI static inline int use_pci_vector(void) {return 1;} static inline void disable_edge_ioapic_vector(unsigned int vector) { } @@ -209,10 +207,6 @@ extern int timer_uses_ioapic_pin_0; extern int sis_apic_bug; /* dummy */ -#else /* !CONFIG_X86_IO_APIC */ -#define io_apic_assign_pci_irqs 0 -#endif - extern int assign_irq_vector(int irq); void enable_NMI_through_LVT0 (void * dummy); diff --git a/include/asm-x86_64/irq.h b/include/asm-x86_64/irq.h index 9db5a1b4f7b1..43469d8ab71a 100644 --- a/include/asm-x86_64/irq.h +++ b/include/asm-x86_64/irq.h @@ -44,9 +44,7 @@ static __inline__ int irq_canonicalize(int irq) return ((irq == 2) ? 
9 : irq); } -#ifdef CONFIG_X86_LOCAL_APIC #define ARCH_HAS_NMI_WATCHDOG /* See include/linux/nmi.h */ -#endif #ifdef CONFIG_HOTPLUG_CPU #include <linux/cpumask.h> diff --git a/include/asm-x86_64/mpspec.h b/include/asm-x86_64/mpspec.h index 14fc3ddd9031..3c7aae9fd157 100644 --- a/include/asm-x86_64/mpspec.h +++ b/include/asm-x86_64/mpspec.h @@ -184,12 +184,10 @@ extern int pic_mode; extern void mp_register_lapic (u8 id, u8 enabled); extern void mp_register_lapic_address (u64 address); -#ifdef CONFIG_X86_IO_APIC extern void mp_register_ioapic (u8 id, u32 address, u32 gsi_base); extern void mp_override_legacy_irq (u8 bus_irq, u8 polarity, u8 trigger, u32 gsi); extern void mp_config_acpi_legacy_irqs (void); extern int mp_register_gsi (u32 gsi, int triggering, int polarity); -#endif /*CONFIG_X86_IO_APIC*/ #endif extern int using_apic_timer; diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h index 612d208961a2..6353fa41bebe 100644 --- a/include/asm-x86_64/smp.h +++ b/include/asm-x86_64/smp.h @@ -9,15 +9,11 @@ #include <linux/bitops.h> extern int disable_apic; -#ifdef CONFIG_X86_LOCAL_APIC #include <asm/fixmap.h> #include <asm/mpspec.h> -#ifdef CONFIG_X86_IO_APIC #include <asm/io_apic.h> -#endif #include <asm/apic.h> #include <asm/thread_info.h> -#endif #ifdef CONFIG_SMP -- cgit v1.2.3 From 6c96a29f20762e8ce40b674f906055d009c302ee Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Remove apic mismatch counter Nobody has been setting the mismatch counter and the ifdef was never set so remove it. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/irq.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index bfbcf92cfa4a..b3677e6ccc6e 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -20,9 +20,6 @@ #include <asm/idle.h> atomic_t irq_err_count; -#ifdef APIC_MISMATCH_DEBUG -atomic_t irq_mis_count; -#endif #ifdef CONFIG_DEBUG_STACKOVERFLOW /* @@ -95,9 +92,6 @@ skip: seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); seq_putc(p, '\n'); seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); -#ifdef APIC_MISMATCH_DEBUG - seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); -#endif } return 0; } -- cgit v1.2.3 From 3f14c746a61ec932c204aca820c02c293118c5df Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Remove old "focus disabled" chipset errata workaround The new systems already use focus disabled and the comment was completely outdated. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/apic.c | 28 ++-------------------------- 1 file changed, 2 insertions(+), 26 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index c6ca100a7dea..78c43b1a3064 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -399,32 +399,8 @@ void __cpuinit setup_local_APIC (void) */ value |= APIC_SPIV_APIC_ENABLED; - /* - * Some unknown Intel IO/APIC (or APIC) errata are biting us with - * certain networking cards. If high frequency interrupts are - * happening on a particular IOAPIC pin, plus the IOAPIC routing - * entry is masked/unmasked at a high rate as well then sooner or - * later IOAPIC line gets 'stuck', no more interrupts are received - * from the device. 
If focus CPU is disabled then the hang goes - * away, oh well :-( - * - * [ This bug can be reproduced easily with a level-triggered - * PCI Ne2000 networking cards and PII/PIII processors, dual - * BX chipset. ] - */ - /* - * Actually disabling the focus CPU check just makes the hang less - * frequent as it makes the interrupt distributon model be more - * like LRU than MRU (the short-term load is more even across CPUs). - * See also the comment in end_level_ioapic_irq(). --macro - */ -#if 1 - /* Enable focus processor (bit==0) */ - value &= ~APIC_SPIV_FOCUS_DISABLED; -#else - /* Disable focus processor (bit==1) */ - value |= APIC_SPIV_FOCUS_DISABLED; -#endif + /* We always use processor focus */ + /* * Set spurious IRQ vector */ -- cgit v1.2.3 From 2e91a17b35116885373e04af142b1d08cf1b47bf Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Add some comments to entry.S And remove some old obsolete ones. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/entry.S | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index a0cf36ba7c56..a04fc4108ff2 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -22,9 +22,21 @@ * at the top of the kernel process stack. * - partial stack frame: partially saved registers upto R11. * - full stack frame: Like partial stack frame, but all register saved. - * - * TODO: - * - schedule it carefully for the final hardware. + * + * Some macro usage: + * - CFI macros are used to generate dwarf2 unwind information for better + * backtraces. They don't change any code. + * - SAVE_ALL/RESTORE_ALL - Save/restore all registers + * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify. + * There are unfortunately lots of special cases where some registers + * not touched. The macro is a big mess that should be cleaned up. + * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS. + * Gives a full stack frame. + * - ENTRY/END Define functions in the symbol table. + * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack + * frame that is otherwise undefined after a SYSCALL + * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. + * - errorentry/paranoidentry/zeroentry - Define exception entry points. */ #include <linux/linkage.h> -- cgit v1.2.3 From 5cb6b99928a8f1a1bbdac7a16b88aef8cb64d432 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Remove pirq overwrite support This was an old workaround for broken MP-BIOS. The user could specify overwrites on the command line. I've never seen it being used for anything on 64bit. So get rid of it for now. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/io_apic.c | 55 -------------------------------------------- 1 file changed, 55 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 924a4a332954..86cc07caa0e5 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -225,14 +225,6 @@ static void clear_IO_APIC (void) clear_IO_APIC_pin(apic, pin); } -/* - * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to - * specific CPU-side IRQs. 
*/ - -#define MAX_PIRQS 8 -static int pirq_entries [MAX_PIRQS]; -static int pirqs_enabled; int skip_ioapic_setup; int ioapic_force; @@ -370,34 +362,6 @@ void __init check_ioapic(void) } } -static int __init ioapic_pirq_setup(char *str) -{ - int i, max; - int ints[MAX_PIRQS+1]; - - get_options(str, ARRAY_SIZE(ints), ints); - - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; - - pirqs_enabled = 1; - apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n"); - max = MAX_PIRQS; - if (ints[0] < MAX_PIRQS) - max = ints[0]; - - for (i = 0; i < max; i++) { - apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); - /* - * PIRQs are mapped upside down, usually. - */ - pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; - } - return 1; -} - -__setup("pirq=", ioapic_pirq_setup); - /* * Find the IRQ entry number of a certain pin. */ @@ -793,22 +757,6 @@ static int pin_2_irq(int idx, int apic, int pin) } } BUG_ON(irq >= NR_IRQS); - - /* - * PCI IRQ command line redirection. Yes, limits are hardcoded. - */ - if ((pin >= 16) && (pin <= 23)) { - if (pirq_entries[pin-16] != -1) { - if (!pirq_entries[pin-16]) { - apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16); - } else { - irq = pirq_entries[pin-16]; - apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n", - pin-16, irq); - } - } - } - BUG_ON(irq >= NR_IRQS); return irq; } @@ -1281,9 +1229,6 @@ static void __init enable_IO_APIC(void) irq_2_pin[i].pin = -1; irq_2_pin[i].next = 0; } - if (!pirqs_enabled) - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; /* * The number of IO-APIC IRQ registers (== #pins): -- cgit v1.2.3 From e50991343488edf25d58820be1684322808c763c Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:29 +0200 Subject: [PATCH] Remove leftover MCA/EISA support No 64bit EISA or Microchannel systems ever existed.
Remove the leftover code in the IO-APIC driver and the mptable parser. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/io_apic.c | 57 ++------------------------------------------ arch/x86_64/kernel/mpparse.c | 16 +------------ 2 files changed, 3 insertions(+), 70 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 86cc07caa0e5..97e90f6abcc4 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -389,9 +389,7 @@ static int __init find_isa_irq_pin(int irq, int type) for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].mpc_srcbus; - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || - mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA) && + if (mp_bus_id_to_type[lbus] == MP_BUS_ISA && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq) @@ -407,9 +405,7 @@ static int __init find_isa_irq_apic(int irq, int type) for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].mpc_srcbus; - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || - mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA) && + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq) break; @@ -472,27 +468,6 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) return best_guess; } -/* - * EISA Edge/Level control register, ELCR - */ -static int EISA_ELCR(unsigned int irq) -{ - if (irq < 16) { - unsigned int port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; - } - apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq); - return 0; -} - -/* EISA interrupts are always polarity zero and can be edge or level - * trigger depending on the ELCR value. If an interrupt is listed as - * EISA conforming in the MP table, that means its trigger type must - * be read in from the ELCR */ - -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) -#define default_EISA_polarity(idx) (0) - /* ISA interrupts are always polarity zero edge triggered, * when listed as conforming in the MP table. */ @@ -505,12 +480,6 @@ static int EISA_ELCR(unsigned int irq) #define default_ISA_trigger(idx) (0) #define default_ISA_polarity(idx) (0) /* PCI interrupts are always polarity one level triggered, * when listed as conforming in the MP table. */ #define default_PCI_trigger(idx) (1) #define default_PCI_polarity(idx) (1) -/* MCA interrupts are always polarity zero level triggered, - * when listed as conforming in the MP table.
*/ - -#define default_MCA_trigger(idx) (1) -#define default_MCA_polarity(idx) (0) - static int __init MPBIOS_polarity(int idx) { int bus = mp_irqs[idx].mpc_srcbus; @@ -530,21 +499,11 @@ static int __init MPBIOS_polarity(int idx) polarity = default_ISA_polarity(idx); break; } - case MP_BUS_EISA: /* EISA pin */ - { - polarity = default_EISA_polarity(idx); - break; - } case MP_BUS_PCI: /* PCI pin */ { polarity = default_PCI_polarity(idx); break; } - case MP_BUS_MCA: /* MCA pin */ - { - polarity = default_MCA_polarity(idx); - break; - } default: { printk(KERN_WARNING "broken BIOS!!\n"); @@ -599,21 +558,11 @@ static int MPBIOS_trigger(int idx) trigger = default_ISA_trigger(idx); break; } - case MP_BUS_EISA: /* EISA pin */ - { - trigger = default_EISA_trigger(idx); - break; - } case MP_BUS_PCI: /* PCI pin */ { trigger = default_PCI_trigger(idx); break; } - case MP_BUS_MCA: /* MCA pin */ - { - trigger = default_MCA_trigger(idx); - break; - } default: { printk(KERN_WARNING "broken BIOS!!\n"); @@ -731,8 +680,6 @@ static int pin_2_irq(int idx, int apic, int pin) switch (mp_bus_id_to_type[bus]) { case MP_BUS_ISA: /* ISA pin */ - case MP_BUS_EISA: - case MP_BUS_MCA: { irq = mp_irqs[idx].mpc_srcbusirq; break; diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index c73dd1f41e65..90e99cf27aa5 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -175,14 +175,10 @@ static void __init MP_bus_info (struct mpc_config_bus *m) if (strncmp(str, "ISA", 3) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; - } else if (strncmp(str, "EISA", 4) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; } else if (strncmp(str, "PCI", 3) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; mp_current_pci_id++; - } else if (strncmp(str, "MCA", 3) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; } else { printk(KERN_ERR "Unknown bustype %s\n", str); } @@ -465,14 +461,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) case 5: memcpy(bus.mpc_bustype, "ISA ", 6); break; - case 2: - case 6: - case 3: - memcpy(bus.mpc_bustype, "EISA ", 6); - break; - case 4: - case 7: - memcpy(bus.mpc_bustype, "MCA ", 6); } MP_bus_info(&bus); if (mpc_default_type > 4) { @@ -629,9 +617,7 @@ void __init find_intel_smp (void) smp_scan_config(0xF0000,0x10000)) return; /* - * If it is an SMP machine we should know now, unless the - * configuration is in an EISA/MCA bus machine with an - * extended bios data area. + * If it is an SMP machine we should know now. * * there is a real-mode segmented pointer pointing to the * 4K EBDA area at 0x40E, calculate and scan it here. -- cgit v1.2.3 From a8fcf1a24a16e1c735c795d99773b5dcefb71518 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Remove obsolete PIC mode PIC mode is an outdated way to drive the APICs that was used on some early MP boards. It is not supported in the ACPI model. It is unlikely to be ever configured by any x86-64 system Remove it thus. 
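For reference, PIC mode was selected through the IMCR (Interrupt Mode Configuration Register); a minimal sketch of the handshake the removed connect_bsp_APIC()/disconnect_bsp_APIC() paths performed, using the kernel's value-first outb() convention:

	outb(0x70, 0x22);	/* select the IMCR through port 0x22 */
	outb(0x01, 0x23);	/* 0x01: route INT and NMI through the local APIC */
	/* writing 0x00 instead hands the lines back to the 8259 PIC */
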
Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/apic.c | 92 ++++++++++++++------------------------------ arch/x86_64/kernel/mpparse.c | 8 ---- arch/x86_64/kernel/smpboot.c | 1 - include/asm-x86_64/mpspec.h | 1 - include/asm-x86_64/smp.h | 1 - 5 files changed, 29 insertions(+), 74 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 78c43b1a3064..20a76f209629 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -136,72 +136,40 @@ void clear_local_APIC(void) apic_read(APIC_ESR); } -void __init connect_bsp_APIC(void) -{ - if (pic_mode) { - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - /* - * PIC mode, enable APIC mode in the IMCR, i.e. - * connect BSP's local APIC to INT and NMI lines. - */ - apic_printk(APIC_VERBOSE, "leaving PIC mode, enabling APIC mode.\n"); - outb(0x70, 0x22); - outb(0x01, 0x23); - } -} - void disconnect_bsp_APIC(int virt_wire_setup) { - if (pic_mode) { - /* - * Put the board back into PIC mode (has an effect - * only on certain older boards). Note that APIC - * interrupts, including IPIs, won't work beyond - * this point! The only exception are INIT IPIs. - */ - apic_printk(APIC_QUIET, "disabling APIC mode, entering PIC mode.\n"); - outb(0x70, 0x22); - outb(0x00, 0x23); - } - else { - /* Go back to Virtual Wire compatibility mode */ - unsigned long value; - - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write(APIC_SPIV, value); - - if (!virt_wire_setup) { - /* For LVT0 make it edge triggered, active high, external and enabled */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write(APIC_LVT0, value); - } - else { - /* Disable LVT0 */ - apic_write(APIC_LVT0, APIC_LVT_MASKED); - } + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; - /* For LVT1 make it edge triggered, active high, nmi and enabled */ - value = apic_read(APIC_LVT1); - value &= ~( - APIC_MODE_MASK | APIC_SEND_PENDING | + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write(APIC_SPIV, value); + + if (!virt_wire_setup) { + /* For LVT0 make it edge triggered, active high, external and enabled */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write(APIC_LVT1, value); + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write(APIC_LVT0, value); + } else { + /* Disable LVT0 */ + apic_write(APIC_LVT0, APIC_LVT_MASKED); } + + /* For LVT1 make it edge triggered, active high, nmi and enabled */ + value = apic_read(APIC_LVT1); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, 
APIC_MODE_NMI); + apic_write(APIC_LVT1, value); } void disable_local_APIC(void) @@ -418,7 +386,7 @@ void __cpuinit setup_local_APIC (void) * TODO: set up through-local-APIC from through-I/O-APIC? --macro */ value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; - if (!smp_processor_id() && (pic_mode || !value)) { + if (!smp_processor_id() && !value) { value = APIC_DM_EXTINT; apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id()); } else { @@ -1096,8 +1064,6 @@ int __init APIC_init_uniprocessor (void) verify_local_APIC(); - connect_bsp_APIC(); - phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id)); diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 90e99cf27aa5..022624503552 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -56,7 +56,6 @@ struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; int mp_irq_entries; int nr_ioapics; -int pic_mode; unsigned long mp_lapic_addr = 0; @@ -514,13 +513,6 @@ void __init get_smp_config (void) printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); - if (mpf->mpf_feature2 & (1<<7)) { - printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); - pic_mode = 1; - } else { - printk(KERN_INFO " Virtual Wire compatibility mode.\n"); - pic_mode = 0; - } /* * Now see if we need to read further. diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index d29571e24953..ae7d6d84e352 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -1090,7 +1090,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus) /* * Switch from PIC to APIC mode. */ - connect_bsp_APIC(); setup_local_APIC(); if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) { diff --git a/include/asm-x86_64/mpspec.h b/include/asm-x86_64/mpspec.h index 3c7aae9fd157..1dc83436cc91 100644 --- a/include/asm-x86_64/mpspec.h +++ b/include/asm-x86_64/mpspec.h @@ -178,7 +178,6 @@ extern int mp_irq_entries; extern struct mpc_config_intsrc mp_irqs [MAX_IRQ_SOURCES]; extern int mpc_default_type; extern unsigned long mp_lapic_addr; -extern int pic_mode; #ifdef CONFIG_ACPI extern void mp_register_lapic (u8 id, u8 enabled); diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h index 6353fa41bebe..498fbc1fc179 100644 --- a/include/asm-x86_64/smp.h +++ b/include/asm-x86_64/smp.h @@ -33,7 +33,6 @@ extern cpumask_t cpu_initialized; extern void smp_alloc_memory(void); extern volatile unsigned long smp_invalidate_needed; -extern int pic_mode; extern void lock_ipi_call_lock(void); extern void unlock_ipi_call_lock(void); extern int smp_num_siblings; -- cgit v1.2.3 From c1a58b42b428e717afbbb298356e041cea54ad17 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] i386/x86-64: Remove obsolete sanity check in mptable parsing It apparently has never triggered in many years. 
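Condensed, the removed check (the longhand if/BUG() form appears in the diff below) asserted the de-facto wiring that every known board uses; roughly:

	/* ExtINT is expected on LINT0, NMI on LINT1 */
	BUG_ON(m->mpc_irqtype == mp_ExtINT && m->mpc_destapiclint != 0);
	BUG_ON(m->mpc_irqtype == mp_NMI && m->mpc_destapiclint != 1);
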
Signed-off-by: Andi Kleen <ak@suse.de> --- arch/i386/kernel/mpparse.c | 13 ------------- arch/x86_64/kernel/mpparse.c | 13 ------------- 2 files changed, 26 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index 827569688562..a2b126fe9a54 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -294,19 +294,6 @@ static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); - /* - * Well it seems all SMP boards in existence - * use ExtINT/LVT1 == LINT0 and - * NMI/LVT2 == LINT1 - the following check - * will show us if this assumptions is false. - * Until then we do not have to add baggage. - */ - if ((m->mpc_irqtype == mp_ExtINT) && - (m->mpc_destapiclint != 0)) - BUG(); - if ((m->mpc_irqtype == mp_NMI) && - (m->mpc_destapiclint != 1)) - BUG(); } #ifdef CONFIG_X86_NUMAQ diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 022624503552..c12acc3b5552 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -223,19 +223,6 @@ static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); - /* - * Well it seems all SMP boards in existence - * use ExtINT/LVT1 == LINT0 and - * NMI/LVT2 == LINT1 - the following check - * will show us if this assumptions is false. - * Until then we do not have to add baggage. - */ - if ((m->mpc_irqtype == mp_ExtINT) && - (m->mpc_destapiclint != 0)) - BUG(); - if ((m->mpc_irqtype == mp_NMI) && - (m->mpc_destapiclint != 1)) - BUG(); } -- cgit v1.2.3 From eea0e11c1f0d6ef89e64182b2f1223a4ca2b74a2 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Factor out common io apic routing entry access The IO APIC code had lots of duplicated code to read/write 64bit routing entries into the IO-APIC. Factor this out into common read/write functions. In a few cases the IO APIC lock is taken more often now, but this isn't a problem because it's all initialization/shutdown only slow path code. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/io_apic.c | 82 ++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 41 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 97e90f6abcc4..1ccf1b41c3ed 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -111,6 +111,33 @@ int vector_irq[NR_VECTORS] __read_mostly = { [0 ...
NR_VECTORS - 1] = -1}; FINAL; \ } +union entry_union { + struct { u32 w1, w2; }; + struct IO_APIC_route_entry entry; +}; + +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + return eu.entry; +} + +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + union entry_union eu; + eu.entry = e; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + #ifdef CONFIG_SMP static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { @@ -196,13 +223,9 @@ static void unmask_IO_APIC_irq (unsigned int irq) static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; - unsigned long flags; /* Check delivery_mode to be sure we're not clearing an SMI pin */ - spin_lock_irqsave(&ioapic_lock, flags); - *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry = ioapic_read_entry(apic, pin); if (entry.delivery_mode == dest_SMI) return; /* @@ -210,10 +233,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) */ memset(&entry, 0, sizeof(entry)); entry.mask = 1; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry); } static void clear_IO_APIC (void) @@ -838,9 +858,9 @@ static void __init setup_IO_APIC_irqs(void) if (!apic && (irq < 16)) disable_8259A_irq(irq); } + ioapic_write_entry(apic, pin, entry); + spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -978,10 +998,7 @@ void __apicdebuginit print_IO_APIC(void) for (i = 0; i <= reg_01.bits.entries; i++) { struct IO_APIC_route_entry entry; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry = ioapic_read_entry(apic, i); printk(KERN_DEBUG " %02x %03X %02X ", i, @@ -1191,11 +1208,7 @@ static void __init enable_IO_APIC(void) /* See if any of the pins is in ExtINT mode */ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { struct IO_APIC_route_entry entry; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); - + entry = ioapic_read_entry(apic, pin); /* If the interrupt line is enabled and in ExtInt mode * I have found the pin where the i8259 is connected. 
@@ -1247,7 +1260,6 @@ void disable_IO_APIC(void) */ if (ioapic_i8259.pin != -1) { struct IO_APIC_route_entry entry; - unsigned long flags; memset(&entry, 0, sizeof(entry)); entry.mask = 0; /* Enabled */ @@ -1264,12 +1276,7 @@ void disable_IO_APIC(void) /* * Add it to the IO-APIC irq-routing table: */ - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin, - *(((int *)&entry)+1)); - io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin, - *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); } disconnect_bsp_APIC(ioapic_i8259.pin != -1); @@ -1879,17 +1886,12 @@ static int ioapic_suspend(struct sys_device *dev, pm_message_t state) { struct IO_APIC_route_entry *entry; struct sysfs_ioapic_data *data; - unsigned long flags; int i; data = container_of(dev, struct sysfs_ioapic_data, dev); entry = data->entry; - spin_lock_irqsave(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); - *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); - } - spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) + *entry = ioapic_read_entry(dev->id, i); return 0; } @@ -1911,11 +1913,9 @@ static int ioapic_resume(struct sys_device *dev) reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; io_apic_write(dev->id, 0, reg_00.raw); } - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); - io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); - } spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i++) + ioapic_write_entry(dev->id, i, entry[i]); return 0; } @@ -2040,10 +2040,10 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p if (!ioapic && (irq < 16)) disable_8259A_irq(irq); + ioapic_write_entry(ioapic, pin, entry); + spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); - set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS); + set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); return 0; -- cgit v1.2.3 From 606bd58de6542e847c51b1b6d83a4cd70a632fe7 Mon Sep 17 00:00:00 2001 From: Diego Calleja <diegocg@gmail.com> Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] x86: AUX_DEVICE_INFO is one byte long, use 'movb' Bugzilla #6552 says: "In arch/i386/boot/setup.S, movw is used instead of movb for PS/2 mouse information, although it is unsigned char. This does not harm, because the jmp instruction overwritten by movw is used before executing movw, and never be used again" I've no idea if this is a real bug or how it gets fixed, so I'm submitting it for review instead of letting it die of boredom in bugzilla. Additionally to i386, I've changed x86-64, which mirrors the same code. Credits to Yoshinori K. Okuji, who found the problem and suggested a fix.
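To see why the movw was only harmless by accident: a 16-bit store to the byte-sized flag at offset 0x1ff also overwrites the byte that follows it. A minimal C illustration (hypothetical, not kernel code; x86 is little-endian):

	unsigned char bda[2] = { 0x00, 0x55 };

	void buggy(void) { *(unsigned short *)bda = 0xAA; }	/* writes 0xAA to bda[0] and 0x00 to bda[1] */
	void fixed(void) { bda[0] = 0xAA; }			/* writes exactly one byte, as intended */
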
Signed-off-by: Diego Calleja <diegocg@gmail.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/i386/boot/setup.S | 4 ++-- arch/x86_64/boot/setup.S | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/i386/boot/setup.S b/arch/i386/boot/setup.S index d2b684cd620a..3aec4538a113 100644 --- a/arch/i386/boot/setup.S +++ b/arch/i386/boot/setup.S @@ -494,12 +494,12 @@ no_voyager: movw %cs, %ax # aka SETUPSEG subw $DELTA_INITSEG, %ax # aka INITSEG movw %ax, %ds - movw $0, (0x1ff) # default is no pointing device + movb $0, (0x1ff) # default is no pointing device int $0x11 # int 0x11: equipment list testb $0x04, %al # check if mouse installed jz no_psmouse - movw $0xAA, (0x1ff) # device present + movb $0xAA, (0x1ff) # device present no_psmouse: #if defined(CONFIG_X86_SPEEDSTEP_SMI) || defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S index a50b631f4d2b..c3bfd223ab49 100644 --- a/arch/x86_64/boot/setup.S +++ b/arch/x86_64/boot/setup.S @@ -526,12 +526,12 @@ is_disk1: movw %cs, %ax # aka SETUPSEG subw $DELTA_INITSEG, %ax # aka INITSEG movw %ax, %ds - movw $0, (0x1ff) # default is no pointing device + movb $0, (0x1ff) # default is no pointing device int $0x11 # int 0x11: equipment list testb $0x04, %al # check if mouse installed jz no_psmouse - movw $0xAA, (0x1ff) # device present + movb $0xAA, (0x1ff) # device present no_psmouse: #include "../../i386/boot/edd.S" -- cgit v1.2.3 From edd965229669f8f8dfddec8c863250440fb65ab3 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Remove MPS table APIC renumbering The MPS table specification says that the operating system should renumber the IO-APICs following the table as needed. However in ACPI this is not allowed or needed, and all x86-64 systems are ACPI compliant. The code was already disabled on some systems because it caused problems there. Remove it completely now. CC: mdomsch@dell.com Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/io_apic.c | 71 -------------------------------------------- 1 file changed, 71 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 1ccf1b41c3ed..a02ed17d05dc 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -1282,72 +1282,6 @@ void disable_IO_APIC(void) disconnect_bsp_APIC(ioapic_i8259.pin != -1); } -/* - * function to set the IO-APIC physical IDs based on the - * values stored in the MPC table. - * - * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 - */ - -static void __init setup_ioapic_ids_from_mpc (void) -{ - union IO_APIC_reg_00 reg_00; - int apic; - int i; - unsigned char old_id; - unsigned long flags; - - /* - * Set the IOAPIC ID to the value stored in the MPC table. - */ - for (apic = 0; apic < nr_ioapics; apic++) { - - /* Read the register 0 value */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - old_id = mp_ioapics[apic].mpc_apicid; - - - printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid); - - - /* - * We need to adjust the IRQ routing table - * if the ID changed.
*/ - if (old_id != mp_ioapics[apic].mpc_apicid) - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mpc_dstapic == old_id) - mp_irqs[i].mpc_dstapic - = mp_ioapics[apic].mpc_apicid; - - /* - * Read the right value from the MPC table and - * write it into the ID register. - */ - apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", - mp_ioapics[apic].mpc_apicid); - - reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0, reg_00.raw); - spin_unlock_irqrestore(&ioapic_lock, flags); - - /* - * Sanity check - */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) - printk("could not set ID!\n"); - else - apic_printk(APIC_VERBOSE," ok.\n"); - } -} - /* * There is a nasty bug in some older SMP boards, their mptable lies * about the timer IRQ. We do the following to work around the situation: @@ -1863,11 +1797,6 @@ void __init setup_IO_APIC(void) apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); - /* - * Set up the IO-APIC IRQ routing table. - */ - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); -- cgit v1.2.3 From dfa4698c50bf85b7927214b0e4a3dc4bc3b3c4a9 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Move early chipset quirks out to new file They did not really belong in io_apic.c. Move them into a new file and clean it up a bit. Also remove an obsolete ATI quirk. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/Makefile | 2 +- arch/x86_64/kernel/early-quirks.c | 118 ++++++++++++++++++++++++++++++++++++++ arch/x86_64/kernel/io_apic.c | 101 -------------------------------- arch/x86_64/kernel/setup.c | 2 +- include/asm-x86_64/proto.h | 2 +- 5 files changed, 121 insertions(+), 104 deletions(-) create mode 100644 arch/x86_64/kernel/early-quirks.c (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 0ef39553c0f6..000e67e8f028 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -8,7 +8,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - pci-dma.o pci-nommu.o alternative.o + pci-dma.o pci-nommu.o alternative.o early-quirks.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_X86_MCE) += mce.o diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c new file mode 100644 index 000000000000..d637cff1c4b1 --- /dev/null +++ b/arch/x86_64/kernel/early-quirks.c @@ -0,0 +1,118 @@ +/* Various workarounds for chipset bugs. + This code runs very early and can't use the regular PCI subsystem + The entries are keyed to PCI bridges which usually identify chipsets + uniquely. + This is only for whole classes of chipsets with specific problems which + need early invasive action (e.g. before the timers are initialized). + Most PCI device specific workarounds can be done later and should be + in standard PCI quirks + Mainboard specific bugs should be handled by DMI entries.
+ CPU specific bugs in setup.c */ + +#include <linux/pci.h> +#include <linux/acpi.h> +#include <linux/pci_ids.h> +#include <asm/pci-direct.h> +#include <asm/proto.h> +#include <asm/dma.h> + +static void via_bugs(void) +{ +#ifdef CONFIG_IOMMU + if ((end_pfn > MAX_DMA32_PFN || force_iommu) && + !iommu_aperture_allowed) { + printk(KERN_INFO + "Looks like a VIA chipset. Disabling IOMMU. Override with iommu=allowed\n"); + iommu_aperture_disabled = 1; + } +#endif +} + +#ifdef CONFIG_ACPI + +static int nvidia_hpet_detected __initdata; + +static int __init nvidia_hpet_check(unsigned long phys, unsigned long size) +{ + nvidia_hpet_detected = 1; + return 0; +} +#endif + +static void nvidia_bugs(void) +{ +#ifdef CONFIG_ACPI + /* + * All timer overrides on Nvidia are + * wrong unless HPET is enabled. + */ + nvidia_hpet_detected = 0; + acpi_table_parse(ACPI_HPET, nvidia_hpet_check); + if (nvidia_hpet_detected == 0) { + acpi_skip_timer_override = 1; + printk(KERN_INFO "Nvidia board " + "detected. Ignoring ACPI " + "timer override.\n"); + } +#endif + /* RED-PEN skip them on mptables too? */ + +} + +static void ati_bugs(void) +{ +#if 1 /* for testing */ + printk("ATI board detected\n"); +#endif + /* No bugs right now */ +} + +struct chipset { + u16 vendor; + void (*f)(void); +}; + +static struct chipset early_qrk[] = { + { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, + { PCI_VENDOR_ID_VIA, via_bugs }, + { PCI_VENDOR_ID_ATI, ati_bugs }, + {} +}; + +void __init early_quirks(void) +{ + int num, slot, func; + /* Poor man's PCI discovery */ + for (num = 0; num < 32; num++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + u32 class; + u32 vendor; + u8 type; + int i; + class = read_pci_config(num,slot,func, + PCI_CLASS_REVISION); + if (class == 0xffffffff) + break; + + if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) + continue; + + vendor = read_pci_config(num, slot, func, + PCI_VENDOR_ID); + vendor &= 0xffff; + + for (i = 0; early_qrk[i].f; i++) + if (early_qrk[i].vendor == vendor) { + early_qrk[i].f(); + return; + } + + type = read_pci_config_byte(num, slot, func, + PCI_HEADER_TYPE); + if (!(type & 0x80)) + break; + } + } + } +} diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index a02ed17d05dc..8bbff64c20d7 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -280,107 +280,6 @@ static int __init setup_enable_8254_timer(char *s) __setup("disable_8254_timer", setup_disable_8254_timer); __setup("enable_8254_timer", setup_enable_8254_timer); -#include <asm/pci-direct.h> -#include <linux/pci_ids.h> -#include <linux/pci.h> - - -#ifdef CONFIG_ACPI - -static int nvidia_hpet_detected __initdata; - -static int __init nvidia_hpet_check(unsigned long phys, unsigned long size) -{ - nvidia_hpet_detected = 1; - return 0; -} -#endif - -/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC - off. Check for an Nvidia or VIA PCI bridge and turn it off. - Use pci direct infrastructure because this runs before the PCI subsystem. - - Can be overwritten with "apic" - - And another hack to disable the IOMMU on VIA chipsets. - - ... and others. Really should move this somewhere else. - - Kludge-O-Rama. 
*/ -void __init check_ioapic(void) -{ - int num,slot,func; - /* Poor man's PCI discovery */ - for (num = 0; num < 32; num++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - u32 class; - u32 vendor; - u8 type; - class = read_pci_config(num,slot,func, - PCI_CLASS_REVISION); - if (class == 0xffffffff) - break; - - if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) - continue; - - vendor = read_pci_config(num, slot, func, - PCI_VENDOR_ID); - vendor &= 0xffff; - switch (vendor) { - case PCI_VENDOR_ID_VIA: -#ifdef CONFIG_IOMMU - if ((end_pfn > MAX_DMA32_PFN || - force_iommu) && - !iommu_aperture_allowed) { - printk(KERN_INFO - "Looks like a VIA chipset. Disabling IOMMU. Override with \"iommu=allowed\"\n"); - iommu_aperture_disabled = 1; - } -#endif - return; - case PCI_VENDOR_ID_NVIDIA: -#ifdef CONFIG_ACPI - /* - * All timer overrides on Nvidia are - * wrong unless HPET is enabled. - */ - nvidia_hpet_detected = 0; - acpi_table_parse(ACPI_HPET, - nvidia_hpet_check); - if (nvidia_hpet_detected == 0) { - acpi_skip_timer_override = 1; - printk(KERN_INFO "Nvidia board " - "detected. Ignoring ACPI " - "timer override.\n"); - } -#endif - /* RED-PEN skip them on mptables too? */ - return; - - /* This should be actually default, but - for 2.6.16 let's do it for ATI only where - it's really needed. */ - case PCI_VENDOR_ID_ATI: - if (timer_over_8254 == 1) { - timer_over_8254 = 0; - printk(KERN_INFO - "ATI board detected. Disabling timer routing over 8254.\n"); - } - return; - } - - - /* No multi-function device? */ - type = read_pci_config_byte(num,slot,func, - PCI_HEADER_TYPE); - if (!(type & 0x80)) - break; - } - } - } -} /* * Find the IRQ entry number of a certain pin. diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index fbe9f7faa2db..aab4bd66aa0d 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -653,7 +653,7 @@ void __init setup_arch(char **cmdline_p) paging_init(); - check_ioapic(); + early_quirks(); /* * set this early, so we dont allocate cpu0 diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index 3b1c60247902..58fec91318e4 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -92,7 +92,7 @@ extern void syscall32_cpu_init(void); extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end); -extern void check_ioapic(void); +extern void early_quirks(void); extern void check_efer(void); extern int unhandled_signal(struct task_struct *tsk, int sig); -- cgit v1.2.3 From 55f05ffaa788e039df2f1ebe0d7bfbcb6f39d0b4 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Replace mp bus array with bitmap for bus not pci Since we only support PCI and ISA legacy busses now, there is no need to have a full array with checking.
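For illustration only, the idea in a minimal, self-contained userspace sketch (hypothetical stand-ins for the kernel's DECLARE_BITMAP/set_bit/test_bit helpers; bus 31 plays the fabricated legacy ISA bus):

#include <stdio.h>

#define MAX_MP_BUSSES 256
#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* one bit per bus id: set = not PCI (i.e. ISA), clear = PCI */
static unsigned long mp_bus_not_pci[MAX_MP_BUSSES / BITS_PER_LONG];

static void set_bus_not_pci(int bus)
{
        mp_bus_not_pci[bus / BITS_PER_LONG] |= 1UL << (bus % BITS_PER_LONG);
}

static int bus_not_pci(int bus)
{
        return (mp_bus_not_pci[bus / BITS_PER_LONG] >> (bus % BITS_PER_LONG)) & 1;
}

int main(void)
{
        set_bus_not_pci(31);    /* like the fabricated ISA bus #31 */
        printf("bus 31: %s\n", bus_not_pci(31) ? "ISA" : "PCI");
        printf("bus  0: %s\n", bus_not_pci(0) ? "ISA" : "PCI");
        return 0;
}

A single bitmap test replaces the old three-way switch on the bus type, which is why the polarity/trigger/irq helpers in the diff below collapse into plain if/else blocks.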
Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/io_apic.c | 92 +++++++++++--------------------------------- arch/x86_64/kernel/mpparse.c | 9 ++--- include/asm-x86_64/mpspec.h | 8 +--- 3 files changed, 27 insertions(+), 82 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 8bbff64c20d7..a1412042b918 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -308,7 +308,7 @@ static int __init find_isa_irq_pin(int irq, int type) for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].mpc_srcbus; - if (mp_bus_id_to_type[lbus] == MP_BUS_ISA && + if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) @@ -324,7 +324,7 @@ static int __init find_isa_irq_apic(int irq, int type) for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].mpc_srcbus; - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA) && + if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) break; @@ -364,7 +364,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) mp_irqs[i].mpc_dstapic == MP_APIC_ALL) break; - if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && + if (!test_bit(lbus, mp_bus_not_pci) && !mp_irqs[i].mpc_irqtype && (bus == lbus) && (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { @@ -410,28 +410,11 @@ static int __init MPBIOS_polarity(int idx) switch (mp_irqs[idx].mpc_irqflag & 3) { case 0: /* conforms, ie. bus-type dependent polarity */ - { - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - { - polarity = default_ISA_polarity(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - polarity = default_PCI_polarity(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - } + if (test_bit(bus, mp_bus_not_pci)) + polarity = default_ISA_polarity(idx); + else + polarity = default_PCI_polarity(idx); break; - } case 1: /* high active */ { polarity = 0; @@ -469,28 +452,11 @@ static int MPBIOS_trigger(int idx) switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) { case 0: /* conforms, ie. 
bus-type dependent */ - { - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - { - trigger = default_ISA_trigger(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - trigger = default_PCI_trigger(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - } + if (test_bit(bus, mp_bus_not_pci)) + trigger = default_ISA_trigger(idx); + else + trigger = default_PCI_trigger(idx); break; - } case 1: /* edge */ { trigger = 0; @@ -596,31 +562,17 @@ static int pin_2_irq(int idx, int apic, int pin) if (mp_irqs[idx].mpc_dstirq != pin) printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - { - irq = mp_irqs[idx].mpc_srcbusirq; - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* - * PCI IRQs are mapped in order - */ - i = irq = 0; - while (i < apic) - irq += nr_ioapic_registers[i++]; - irq += pin; - irq = gsi_irq_sharing(irq); - break; - } - default: - { - printk(KERN_ERR "unknown bus type %d.\n",bus); - irq = 0; - break; - } + if (test_bit(bus, mp_bus_not_pci)) { + irq = mp_irqs[idx].mpc_srcbusirq; + } else { + /* + * PCI IRQs are mapped in order + */ + i = irq = 0; + while (i < apic) + irq += nr_ioapic_registers[i++]; + irq += pin; + irq = gsi_irq_sharing(irq); } BUG_ON(irq >= NR_IRQS); return irq; diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index c12acc3b5552..a8d38c0bb449 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -42,7 +42,7 @@ int acpi_found_madt; * MP-table. */ unsigned char apic_version [MAX_APICS]; -unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; static int mp_current_pci_id = 0; @@ -173,9 +173,9 @@ static void __init MP_bus_info (struct mpc_config_bus *m) Dprintk("Bus #%d is %s\n", m->mpc_busid, str); if (strncmp(str, "ISA", 3) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; + set_bit(m->mpc_busid, mp_bus_not_pci); } else if (strncmp(str, "PCI", 3) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; + clear_bit(m->mpc_busid, mp_bus_not_pci); mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; mp_current_pci_id++; } else { @@ -808,8 +808,7 @@ void __init mp_config_acpi_legacy_irqs (void) /* * Fabricate the legacy ISA bus (bus #31). */ - mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); + set_bit(MP_ISA_BUS, mp_bus_not_pci); /* * Locate the IOAPIC that manages the ISA IRQs (0-15). diff --git a/include/asm-x86_64/mpspec.h b/include/asm-x86_64/mpspec.h index 1dc83436cc91..017fddb61dc5 100644 --- a/include/asm-x86_64/mpspec.h +++ b/include/asm-x86_64/mpspec.h @@ -159,13 +159,7 @@ struct mpc_config_lintsrc #define MAX_MP_BUSSES 256 /* Each PCI slot may be a combo card with its own bus. 4 IRQ pins per slot. 
*/ #define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4) -enum mp_bustype { - MP_BUS_ISA = 1, - MP_BUS_EISA, - MP_BUS_PCI, - MP_BUS_MCA -}; -extern unsigned char mp_bus_id_to_type [MAX_MP_BUSSES]; +extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); extern int mp_bus_id_to_pci_bus [MAX_MP_BUSSES]; extern unsigned int boot_cpu_physical_apicid; -- cgit v1.2.3 From a01fd3baff5e2e744f9d1af0e5d3e6b6082e4dcf Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Remove useless wrapper in mpparse.c code It used to contain support code for NUMAQ, but that is long gone already on 64bit. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/mpparse.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index a8d38c0bb449..79423d1bfcac 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -579,7 +579,7 @@ static int __init smp_scan_config (unsigned long base, unsigned long length) return 0; } -void __init find_intel_smp (void) +void __init find_smp_config(void) { unsigned int address; @@ -617,15 +617,6 @@ void __init find_intel_smp (void) printk(KERN_INFO "No mptable found.\n"); } -/* - * - Intel MP Configuration Table - */ -void __init find_smp_config (void) -{ - find_intel_smp(); -} - - /* -------------------------------------------------------------------------- ACPI-based MP Configuration -------------------------------------------------------------------------- */ -- cgit v1.2.3 From 276ec1a76ad204d47947894a914e10e798400ffb Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Remove some unneeded ACPI externs in mpparse.c They are not used in this file so remove them. i386 didn't have them either. Cc: len.brown@intel.com Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/mpparse.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 79423d1bfcac..0cc88f7f288b 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -70,15 +70,6 @@ unsigned disabled_cpus __initdata; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; -/* ACPI MADT entry parsing functions */ -#ifdef CONFIG_ACPI -extern struct acpi_boot_flags acpi_boot; -extern int acpi_parse_lapic (acpi_table_entry_header *header); -extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header); -extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header); -extern int acpi_parse_ioapic (acpi_table_entry_header *header); -#endif /*CONFIG_ACPI*/ - u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -- cgit v1.2.3 From efec3b9a3282714c8441b9bf476f8358bed9ecaa Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Fix up some non linuxy style in ACPI functions in mpparse.c No functional changes. 
Cc: len.brown@intel.com Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/mpparse.c | 57 +++++++++++++------------------------------- 1 file changed, 16 insertions(+), 41 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 0cc88f7f288b..d63f849aea2c 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -614,23 +614,17 @@ void __init find_smp_config(void) #ifdef CONFIG_ACPI -void __init mp_register_lapic_address ( - u64 address) +void __init mp_register_lapic_address(u64 address) { mp_lapic_addr = (unsigned long) address; - set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); - if (boot_cpu_id == -1U) boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); } - -void __cpuinit mp_register_lapic ( - u8 id, - u8 enabled) +void __cpuinit mp_register_lapic (u8 id, u8 enabled) { struct mpc_config_processor processor; int boot_cpu = 0; @@ -668,11 +662,9 @@ static struct mp_ioapic_routing { u32 pin_programmed[4]; } mp_ioapic_routing[MAX_IO_APICS]; - -static int mp_find_ioapic ( - int gsi) +static int mp_find_ioapic(int gsi) { - int i = 0; + int i = 0; /* Find the IOAPIC that manages this GSI. */ for (i = 0; i < nr_ioapics; i++) { @@ -682,17 +674,12 @@ static int mp_find_ioapic ( } printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); - return -1; } - -void __init mp_register_ioapic ( - u8 id, - u32 address, - u32 gsi_base) +void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) { - int idx = 0; + int idx = 0; if (nr_ioapics >= MAX_IO_APICS) { printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " @@ -729,16 +716,10 @@ void __init mp_register_ioapic ( mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, mp_ioapic_routing[idx].gsi_start, mp_ioapic_routing[idx].gsi_end); - - return; } - -void __init mp_override_legacy_irq ( - u8 bus_irq, - u8 polarity, - u8 trigger, - u32 gsi) +void __init +mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) { struct mpc_config_intsrc intsrc; int ioapic = -1; @@ -776,16 +757,13 @@ void __init mp_override_legacy_irq ( mp_irqs[mp_irq_entries] = intsrc; if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!\n"); - - return; } - -void __init mp_config_acpi_legacy_irqs (void) +void __init mp_config_acpi_legacy_irqs(void) { struct mpc_config_intsrc intsrc; - int i = 0; - int ioapic = -1; + int i = 0; + int ioapic = -1; /* * Fabricate the legacy ISA bus (bus #31). @@ -843,24 +821,22 @@ void __init mp_config_acpi_legacy_irqs (void) if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!\n"); } - - return; } #define MAX_GSI_NUM 4096 int mp_register_gsi(u32 gsi, int triggering, int polarity) { - int ioapic = -1; - int ioapic_pin = 0; - int idx, bit = 0; - static int pci_irq = 16; + int ioapic = -1; + int ioapic_pin = 0; + int idx, bit = 0; + static int pci_irq = 16; /* * Mapping between Global System Interrupts, which * represent all possible interrupts, to the IRQs * assigned to actual devices. */ - static int gsi_to_irq[MAX_GSI_NUM]; + static int gsi_to_irq[MAX_GSI_NUM]; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; @@ -934,5 +910,4 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) polarity == ACPI_ACTIVE_HIGH ? 
0 : 1); return gsi; } - #endif /*CONFIG_ACPI*/ -- cgit v1.2.3 From fe7414a2882c953788af13a7f2c9d570ed8f71c5 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] Use BUILD_BUG_ON in apic.c build sanity checking Makes code a little shorter. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/apic.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 20a76f209629..496cd1bd2ae5 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -265,8 +265,6 @@ void __init sync_Arb_IDs(void) | APIC_DM_INIT); } -extern void __error_in_apic_c (void); - /* * An initial setup of the virtual wire mode. */ @@ -313,8 +311,7 @@ void __cpuinit setup_local_APIC (void) value = apic_read(APIC_LVR); - if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f) - __error_in_apic_c(); + BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f); /* * Double-check whether this APIC is really registered. -- cgit v1.2.3 From e2414910f212c52d9d7c64c99a22863488ac5b48 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:30 +0200 Subject: [PATCH] x86: Detect CFI support in the assembler at runtime ... instead of using a CONFIG option. The config option still controls if the resulting executable actually has unwind information. This is useful to prevent compilation errors when users select CONFIG_STACK_UNWIND on old binutils and also allows to use CFI in the future for non kernel debugging applications. Cc: jbeulich@novell.com Cc: sam@ravnborg.org Signed-off-by: Andi Kleen <ak@suse.de> --- Documentation/kbuild/makefiles.txt | 5 +++++ arch/i386/Makefile | 4 ++++ arch/x86_64/Makefile | 3 +++ include/asm-x86_64/dwarf2.h | 2 +- scripts/Kbuild.include | 7 +++++++ 5 files changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt index b7d6abb501a6..e2cbd59cf2d0 100644 --- a/Documentation/kbuild/makefiles.txt +++ b/Documentation/kbuild/makefiles.txt @@ -421,6 +421,11 @@ more details, with real examples. The second argument is optional, and if supplied will be used if first argument is not supported. + as-instr + as-instr checks if the assembler reports a specific instruction + and then outputs either option1 or option2 + C escapes are supported in the test instruction + cc-option cc-option is used to check if $(CC) supports a given option, and not supported to use an optional second option. diff --git a/arch/i386/Makefile b/arch/i386/Makefile index 3e4adb1e2244..508cdbeb3a09 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -46,6 +46,10 @@ cflags-y += -ffreestanding # a lot more stack due to the lack of sharing of stacklots: CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then echo $(call cc-option,-fno-unit-at-a-time); fi ;) +# do binutils support CFI? +cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) +AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) + CFLAGS += $(cflags-y) # Default subarch .c files diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 431bb4bc36cd..d6472ddf5f6e 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -54,6 +54,9 @@ endif cflags-y += $(call cc-option,-funit-at-a-time) # prevent gcc from generating any FP code by mistake cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) +# do binutils support CFI? 
+cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) +AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) CFLAGS += $(cflags-y) CFLAGS_KERNEL += $(cflags-kernel-y) diff --git a/include/asm-x86_64/dwarf2.h b/include/asm-x86_64/dwarf2.h index 0744db777676..2b9368365fad 100644 --- a/include/asm-x86_64/dwarf2.h +++ b/include/asm-x86_64/dwarf2.h @@ -13,7 +13,7 @@ away for older version. */ -#ifdef CONFIG_UNWIND_INFO +#ifdef CONFIG_AS_CFI #define CFI_STARTPROC .cfi_startproc #define CFI_ENDPROC .cfi_endproc diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 3d523899fdc0..7adef12a0c26 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -63,6 +63,13 @@ as-option = $(shell if $(CC) $(CFLAGS) $(1) -Wa,-Z -c -o /dev/null \ -xassembler /dev/null > /dev/null 2>&1; then echo "$(1)"; \ else echo "$(2)"; fi ;) +# as-instr +# Usage: cflags-y += $(call as-instr, instr, option1, option2) + +as-instr = $(shell if echo -e "$(1)" | $(AS) -Z -o astest$$$$.out \ + 2>&1 >/dev/null ; then echo "$(2)"; else echo "$(3)"; fi; \ + rm -f astest$$$$.out) + # cc-option # Usage: cflags-y += $(call cc-option, -march=winchip-c6, -march=i586) -- cgit v1.2.3 From 44cc45267bbe7c64f7d85b074bd670b48b5abdfb Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Remove obsolete CVS $Id$ from assembler files in arch/x86_64/kernel/* CVS hasn't been used for a long time for them. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/entry.S | 2 -- arch/x86_64/kernel/head.S | 2 -- 2 files changed, 4 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index a04fc4108ff2..2092f565aa87 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -4,8 +4,6 @@ * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> - * - * $Id$ */ /* diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 505ec4a57506..4a3326ce709c 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -5,8 +5,6 @@ * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> * Copyright (C) 2000 Karsten Keil <kkeil@suse.de> * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de> - * - * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $ */ -- cgit v1.2.3 From caff0710ebf6f2c44cbd2b8b31fd6329148bed2e Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@novell.com> Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] initialize end of memory variables as early as possible While an earlier patch already did a small step into that direction, this patch moves initialization of all memory end variables to as early as possible, so that dependent code doesn't need to check whether these variables have already been set. Also, remove a misleading (perhaps just outdated) comment, and make static a variable only used in a single file. Signed-off-by: Jan Beulich <jbeulich@novell.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/e820.c | 2 +- arch/x86_64/kernel/setup.c | 7 ++++++- arch/x86_64/mm/init.c | 6 ------ 3 files changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index d6d7f731f6f0..1362aad4f3aa 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -40,7 +40,7 @@ unsigned long end_pfn_map; /* * Last pfn which the user wants to use. 
*/ -unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT; +static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; extern struct resource code_resource, data_resource; diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index aab4bd66aa0d..ddc3b2d9a165 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -554,7 +554,7 @@ void __init setup_arch(char **cmdline_p) * we are rounding upwards: */ end_pfn = e820_end_of_ram(); - num_physpages = end_pfn; /* for pfn_valid */ + num_physpages = end_pfn; check_efer(); @@ -574,6 +574,11 @@ void __init setup_arch(char **cmdline_p) acpi_boot_table_init(); #endif + /* How many end-of-memory variables you have, grandma! */ + max_low_pfn = end_pfn; + max_pfn = end_pfn; + high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1; + #ifdef CONFIG_ACPI_NUMA /* * Parse SRAT to discover nodes. diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index d14fb2dfbfc4..d40134bd6399 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -597,12 +597,6 @@ void __init mem_init(void) pci_iommu_alloc(); - /* How many end-of-memory variables you have, grandma! */ - max_low_pfn = end_pfn; - max_pfn = end_pfn; - num_physpages = end_pfn; - high_memory = (void *) __va(end_pfn * PAGE_SIZE); - /* clear the zero-page */ memset(empty_zero_page, 0, PAGE_SIZE); -- cgit v1.2.3 From 3b94355c47e2b025a7ececa0a14180e8fce6b0f1 Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@novell.com> Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] remove int_delivery_dest The genapic field and the accessor macro weren't used anywhere. Signed-off-by: Jan Beulich <jbeulich@novell.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/genapic_cluster.c | 1 - arch/x86_64/kernel/genapic_flat.c | 2 -- include/asm-x86_64/genapic.h | 1 - include/asm-x86_64/mach_apic.h | 1 - 4 files changed, 5 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c index 3020917546de..cdb90e671b88 100644 --- a/arch/x86_64/kernel/genapic_cluster.c +++ b/arch/x86_64/kernel/genapic_cluster.c @@ -118,7 +118,6 @@ struct genapic apic_cluster = { .name = "clustered", .int_delivery_mode = dest_Fixed, .int_dest_mode = (APIC_DEST_PHYSICAL != 0), - .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED, .target_cpus = cluster_target_cpus, .apic_id_registered = cluster_apic_id_registered, .init_apic_ldr = cluster_init_apic_ldr, diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index eb86d374813a..2ceb8b872b57 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -121,7 +121,6 @@ struct genapic apic_flat = { .name = "flat", .int_delivery_mode = dest_LowestPrio, .int_dest_mode = (APIC_DEST_LOGICAL != 0), - .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST, .target_cpus = flat_target_cpus, .apic_id_registered = flat_apic_id_registered, .init_apic_ldr = flat_init_apic_ldr, @@ -180,7 +179,6 @@ struct genapic apic_physflat = { .name = "physical flat", .int_delivery_mode = dest_Fixed, .int_dest_mode = (APIC_DEST_PHYSICAL != 0), - .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED, .target_cpus = physflat_target_cpus, .apic_id_registered = flat_apic_id_registered, .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/ diff --git a/include/asm-x86_64/genapic.h b/include/asm-x86_64/genapic.h index 50b38e7c58e4..81e714665344 100644 --- a/include/asm-x86_64/genapic.h +++ b/include/asm-x86_64/genapic.h 
@@ -16,7 +16,6 @@ struct genapic { char *name; u32 int_delivery_mode; u32 int_dest_mode; - u32 int_delivery_dest; /* for quick IPIs */ int (*apic_id_registered)(void); cpumask_t (*target_cpus)(void); void (*init_apic_ldr)(void); diff --git a/include/asm-x86_64/mach_apic.h b/include/asm-x86_64/mach_apic.h index 0acea44c9377..d33422450c00 100644 --- a/include/asm-x86_64/mach_apic.h +++ b/include/asm-x86_64/mach_apic.h @@ -16,7 +16,6 @@ #define INT_DELIVERY_MODE (genapic->int_delivery_mode) #define INT_DEST_MODE (genapic->int_dest_mode) -#define INT_DELIVERY_DEST (genapic->int_delivery_dest) #define TARGET_CPUS (genapic->target_cpus()) #define apic_id_registered (genapic->apic_id_registered) #define init_apic_ldr (genapic->init_apic_ldr) -- cgit v1.2.3 From f38db651d5da5e10235fd7dd31095969fb7ef6fb Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda <muli@il.ibm.com> Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Calgary IOMMU: consolidate per bus data structures Move the tce_table_kva array, disabled bitmap and bus_to_phb array into a new per bus 'struct calgary_bus_info'. Also slightly reorganize build_tce_table and tce_table_setparms to avoid exporting bus_info to tce.c. Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com> Signed-off-by: Jon Mason <jdmason@us.ibm.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/pci-calgary.c | 53 +++++++++++++++++++++------------------- arch/x86_64/kernel/tce.c | 10 -------- include/asm-x86_64/tce.h | 1 - 3 files changed, 28 insertions(+), 36 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 146924ba5df5..96f6a866afad 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -111,17 +111,17 @@ static const unsigned long phb_offsets[] = { 0xB000 /* PHB3 */ }; -static char bus_to_phb[MAX_PHB_BUS_NUM]; -void* tce_table_kva[MAX_PHB_BUS_NUM]; unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; static int translate_empty_slots __read_mostly = 0; static int calgary_detected __read_mostly = 0; -/* - * the bitmap of PHBs the user requested that we disable - * translation on. 
- */ -static DECLARE_BITMAP(translation_disabled, MAX_PHB_BUS_NUM); +struct calgary_bus_info { + void *tce_space; + int translation_disabled; + signed char phbid; +}; + +static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; static void tce_cache_blast(struct iommu_table *tbl); @@ -149,7 +149,7 @@ static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) static inline int translate_phb(struct pci_dev* dev) { - int disabled = test_bit(dev->bus->number, translation_disabled); + int disabled = bus_info[dev->bus->number].translation_disabled; return !disabled; } @@ -454,7 +454,7 @@ static struct dma_mapping_ops calgary_dma_ops = { static inline int busno_to_phbid(unsigned char num) { - return bus_to_phb[num]; + return bus_info[num].phbid; } static inline unsigned long split_queue_offset(unsigned char num) @@ -631,6 +631,10 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) if (ret) return ret; + tbl = dev->sysdata; + tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; + tce_free(tbl, 0, tbl->it_size); + calgary_reserve_regions(dev); /* set TARs for each PHB */ @@ -824,7 +828,7 @@ static int __init calgary_init(void) calgary_init_one_nontraslated(dev); continue; } - if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) { + if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) { pci_dev_put(dev); continue; } @@ -844,7 +848,7 @@ error: pci_dev_put(dev); continue; } - if (!tce_table_kva[dev->bus->number] && !translate_empty_slots) + if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) continue; calgary_disable_translation(dev); calgary_free_tar(dev); @@ -894,9 +898,8 @@ void __init detect_calgary(void) for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { int dev; - - tce_table_kva[bus] = NULL; - bus_to_phb[bus] = -1; + struct calgary_bus_info *info = &bus_info[bus]; + info->phbid = -1; if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) continue; @@ -907,12 +910,9 @@ void __init detect_calgary(void) */ phb = (phb + 1) % PHBS_PER_CALGARY; - if (test_bit(bus, translation_disabled)) { - printk(KERN_INFO "Calgary: translation is disabled for " - "PHB 0x%x\n", bus); - /* skip this phb, don't allocate a tbl for it */ + if (info->translation_disabled) continue; - } + /* * Scan the slots of the PCI bus to see if there is a device present. * The parent bus will be the zero-ith device, so start at 1. 
@@ -923,8 +923,8 @@ void __init detect_calgary(void) tbl = alloc_tce_table(); if (!tbl) goto cleanup; - tce_table_kva[bus] = tbl; - bus_to_phb[bus] = phb; + info->tce_space = tbl; + info->phbid = phb; calgary_found = 1; break; } @@ -940,9 +940,12 @@ void __init detect_calgary(void) return; cleanup: - for (--bus; bus >= 0; --bus) - if (tce_table_kva[bus]) - free_tce_table(tce_table_kva[bus]); + for (--bus; bus >= 0; --bus) { + struct calgary_bus_info *info = &bus_info[bus]; + + if (info->tce_space) + free_tce_table(info->tce_space); + } } int __init calgary_iommu_init(void) @@ -1016,7 +1019,7 @@ static int __init calgary_parse_options(char *p) if (bridge < MAX_PHB_BUS_NUM) { printk(KERN_INFO "Calgary: disabling " "translation for PHB 0x%x\n", bridge); - set_bit(bridge, translation_disabled); + bus_info[bridge].translation_disabled = 1; } } diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c index 0905ce6b9b32..cbabfdf78e06 100644 --- a/arch/x86_64/kernel/tce.c +++ b/arch/x86_64/kernel/tce.c @@ -106,14 +106,6 @@ static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) /* set the tce table size - measured in entries */ tbl->it_size = table_size_to_number_of_entries(specified_table_size); - tbl->it_base = (unsigned long)tce_table_kva[dev->bus->number]; - if (!tbl->it_base) { - printk(KERN_ERR "Calgary: iommu_table_setparms: " - "no table allocated?!\n"); - ret = -ENOMEM; - goto done; - } - /* * number of bytes needed for the bitmap size in number of * entries; we need one bit per entry @@ -162,8 +154,6 @@ int build_tce_table(struct pci_dev *dev, void __iomem *bbar) if (ret) goto free_tbl; - tce_free(tbl, 0, tbl->it_size); - tbl->bbar = bbar; /* diff --git a/include/asm-x86_64/tce.h b/include/asm-x86_64/tce.h index 53e9a68b3336..dbb047febc5e 100644 --- a/include/asm-x86_64/tce.h +++ b/include/asm-x86_64/tce.h @@ -24,7 +24,6 @@ #ifndef _ASM_X86_64_TCE_H #define _ASM_X86_64_TCE_H -extern void* tce_table_kva[]; extern unsigned int specified_table_size; struct iommu_table; -- cgit v1.2.3 From 9f2dc46d5ec6fd7787182d2232a1003af11879f1 Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda <muli@il.ibm.com> Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Calgary IOMMU: break out of pci_find_device_reverse if dev not found Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com> Signed-off-by: Jon Mason <jdmason@us.ibm.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/pci-calgary.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 96f6a866afad..b2182c936305 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -844,6 +844,8 @@ error: dev = pci_find_device_reverse(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_CALGARY, dev); + if (!dev) + break; if (!translate_phb(dev)) { pci_dev_put(dev); continue; -- cgit v1.2.3 From b8f4fe66a560b5ccbb4d4d0d2df356a5d9b98b83 Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda <muli@il.ibm.com> Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Calgary IOMMU: fix error path memleak in calgary_free_tar We were freeing the iommu_table and leaking the bitmap pages. Also rename it to calgary_free_bus, which is more accurate. 
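The shape of the bug, as a hedged userspace sketch (struct and function names are made up; only the teardown order matters):

#include <stdlib.h>

struct table {
        unsigned char *it_map;          /* separately allocated bitmap */
};

static void table_free(struct table *tbl)
{
        /* the old code freed only the table and leaked it_map */
        free(tbl->it_map);              /* free the owned bitmap first */
        tbl->it_map = NULL;
        free(tbl);
}

int main(void)
{
        struct table *tbl = malloc(sizeof(*tbl));

        if (!tbl)
                return 1;
        tbl->it_map = calloc(1, 4096 / 8);      /* one bit per entry */
        table_free(tbl);                        /* bitmap, then table */
        return 0;
}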
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com> Signed-off-by: Jon Mason <jdmason@us.ibm.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/pci-calgary.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index b2182c936305..6c5da1d813f3 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -658,11 +658,12 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) return 0; } -static void __init calgary_free_tar(struct pci_dev *dev) +static void __init calgary_free_bus(struct pci_dev *dev) { u64 val64; struct iommu_table *tbl = dev->sysdata; void __iomem *target; + unsigned int bitmapsz; target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number)); val64 = be64_to_cpu(readq(target)); @@ -670,8 +671,15 @@ static void __init calgary_free_tar(struct pci_dev *dev) writeq(cpu_to_be64(val64), target); readq(target); /* flush */ + bitmapsz = tbl->it_size / BITS_PER_BYTE; + free_pages((unsigned long)tbl->it_map, get_order(bitmapsz)); + tbl->it_map = NULL; + kfree(tbl); dev->sysdata = NULL; + + /* Can't free bootmem allocated memory after system is up :-( */ + bus_info[dev->bus->number].tce_space = NULL; } static void calgary_watchdog(unsigned long data) @@ -853,7 +861,7 @@ error: if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) continue; calgary_disable_translation(dev); - calgary_free_tar(dev); + calgary_free_bus(dev); pci_dev_put(dev); } -- cgit v1.2.3 From 871b17008e93d7e96f96829ce5f0393c9902d25b Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda <muli@il.ibm.com> Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Calgary IOMMU: fix reference counting of Calgary PCI devices The pci_get_device() API decrements the reference count on the 'from' parameter when it continues searching. Therefore, take a ref count on Calgary bus when we initialize them in either translated or non-translated mode. 
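In sketch form, the canonical pci_get_device() loop then looks like this (hold_calgary_devices() is a hypothetical helper, not part of this patch):

#include <linux/pci.h>

static void __init hold_calgary_devices(void)
{
        struct pci_dev *dev = NULL;

        while ((dev = pci_get_device(PCI_VENDOR_ID_IBM,
                                     PCI_DEVICE_ID_IBM_CALGARY, dev))) {
                /* pci_get_device() dropped the reference on the
                 * previous 'dev' and returned the next match with a
                 * fresh one; take an extra reference for the pointer
                 * we stash long-term in bus->self.
                 */
                pci_dev_get(dev);
                dev->bus->self = dev;
        }
}

Without the extra pci_dev_get(), the stashed pointer would be left referring to a device whose reference the next search iteration has already given up.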
Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com> Signed-off-by: Jon Mason <jdmason@us.ibm.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/pci-calgary.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 6c5da1d813f3..7302834718c8 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -786,6 +786,7 @@ static inline unsigned int __init locate_register_space(struct pci_dev *dev) static int __init calgary_init_one_nontraslated(struct pci_dev *dev) { + pci_dev_get(dev); dev->sysdata = NULL; dev->bus->self = dev; @@ -810,6 +811,7 @@ static int __init calgary_init_one(struct pci_dev *dev) if (ret) goto iounmap; + pci_dev_get(dev); dev->bus->self = dev; calgary_enable_translation(dev); @@ -836,10 +838,9 @@ static int __init calgary_init(void) calgary_init_one_nontraslated(dev); continue; } - if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) { - pci_dev_put(dev); + if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) continue; - } + ret = calgary_init_one(dev); if (ret) goto error; @@ -860,9 +861,10 @@ error: } if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) continue; + calgary_disable_translation(dev); calgary_free_bus(dev); - pci_dev_put(dev); + pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ } return ret; -- cgit v1.2.3 From a4fc520a0ff92810eea46d74bf742ef73849302e Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda <muli@il.ibm.com> Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Calgary IOMMU: calgary_init_one_nontraslated() can return void Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com> Signed-off-by: Jon Mason <jdmason@us.ibm.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/pci-calgary.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 7302834718c8..f541fb564d92 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -784,13 +784,11 @@ static inline unsigned int __init locate_register_space(struct pci_dev *dev) return address; } -static int __init calgary_init_one_nontraslated(struct pci_dev *dev) +static void __init calgary_init_one_nontraslated(struct pci_dev *dev) { pci_dev_get(dev); dev->sysdata = NULL; dev->bus->self = dev; - - return 0; } static int __init calgary_init_one(struct pci_dev *dev) -- cgit v1.2.3 From 0577f148b5e9a773020e3da1e6332a7c6df9d601 Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda <muli@il.ibm.com> Date: Tue, 26 Sep 2006 10:52:31 +0200 Subject: [PATCH] Calgary IOMMU: save a bit of space in bus_info Make translation_disabled a uchar rather than an int Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com> Signed-off-by: Jon Mason <jdmason@us.ibm.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/pci-calgary.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index f541fb564d92..caf84e4e4ca9 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -117,7 +117,7 @@ static int calgary_detected __read_mostly = 0; struct calgary_bus_info { void *tce_space; - int translation_disabled; + unsigned char translation_disabled; signed char phbid; }; -- cgit v1.2.3 From a752d7194c4fb5a3e767c95542d04fc5decb1d52 Mon 
Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] fix is_at_popf() for compat tasks When testing for the REX instruction prefix, first check for 32-bit mode because in compat mode the REX prefix is an increment instruction. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/ptrace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index d35ec1bc696a..3a52c7bcfd12 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -141,8 +141,11 @@ static int is_at_popf(struct task_struct *child, struct pt_regs *regs) case 0xf0: case 0xf2: case 0xf3: continue; - /* REX prefixes */ case 0x40 ... 0x4f: + if (regs->cs != __USER_CS) + /* 32-bit mode: register increment */ + return 0; + /* 64-bit mode: REX prefix */ continue; /* CHECKME: f0, f2, f3 */ -- cgit v1.2.3 From 8d379dad8f1670d233ac67b76b1c5a42ad3714a3 Mon Sep 17 00:00:00 2001 From: Jan Beulich <jbeulich@novell.com> Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] annotate arch/x86_64/lib/*.S Add unwind annotations to arch/x86_64/lib/*.S, and also use the macros provided by linux/linkage.h where-ever possible. Some of the alternative instructions handling needed to be adjusted so that the replacement code would also have valid unwind information. Signed-off-by: Jan Beulich <jbeulich@novell.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/lib/clear_page.S | 47 +++++++++++++++----------- arch/x86_64/lib/copy_page.S | 53 +++++++++++++++++++---------- arch/x86_64/lib/copy_user.S | 39 ++++++++++++++++------ arch/x86_64/lib/csum-copy.S | 26 ++++++++++++--- arch/x86_64/lib/getuser.S | 32 +++++++++++------- arch/x86_64/lib/iomap_copy.S | 10 ++++-- arch/x86_64/lib/memcpy.S | 69 +++++++++++++++++++++----------------- arch/x86_64/lib/memset.S | 79 ++++++++++++++++++++++++-------------------- arch/x86_64/lib/putuser.S | 32 +++++++++++------- 9 files changed, 244 insertions(+), 143 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S index 1f81b79b796c..9a10a78bb4a4 100644 --- a/arch/x86_64/lib/clear_page.S +++ b/arch/x86_64/lib/clear_page.S @@ -1,10 +1,22 @@ +#include <linux/linkage.h> +#include <asm/dwarf2.h> + /* * Zero a page. * rdi page */ - .globl clear_page - .p2align 4 -clear_page: + ALIGN +clear_page_c: + CFI_STARTPROC + movl $4096/8,%ecx + xorl %eax,%eax + rep stosq + ret + CFI_ENDPROC +ENDPROC(clear_page) + +ENTRY(clear_page) + CFI_STARTPROC xorl %eax,%eax movl $4096/64,%ecx .p2align 4 @@ -23,28 +35,25 @@ clear_page: jnz .Lloop nop ret -clear_page_end: + CFI_ENDPROC +.Lclear_page_end: +ENDPROC(clear_page) /* Some CPUs run faster using the string instructions. It is also a lot simpler. 
Use this when possible */ #include <asm/cpufeature.h> + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp <disp8> */ + .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ +2: + .previous .section .altinstructions,"a" .align 8 - .quad clear_page - .quad clear_page_c - .byte X86_FEATURE_REP_GOOD - .byte clear_page_end-clear_page - .byte clear_page_c_end-clear_page_c - .previous - - .section .altinstr_replacement,"ax" -clear_page_c: - movl $4096/8,%ecx - xorl %eax,%eax - rep - stosq - ret -clear_page_c_end: + .quad clear_page + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lclear_page_end - clear_page + .byte 2b - 1b .previous diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S index 8fa19d96a7ee..0ebb03b60e79 100644 --- a/arch/x86_64/lib/copy_page.S +++ b/arch/x86_64/lib/copy_page.S @@ -1,17 +1,33 @@ /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ +#include <linux/config.h> +#include <linux/linkage.h> +#include <asm/dwarf2.h> + + ALIGN +copy_page_c: + CFI_STARTPROC + movl $4096/8,%ecx + rep movsq + ret + CFI_ENDPROC +ENDPROC(copy_page_c) + /* Don't use streaming store because it's better when the target ends up in cache. */ /* Could vary the prefetch distance based on SMP/UP */ - .globl copy_page - .p2align 4 -copy_page: +ENTRY(copy_page) + CFI_STARTPROC subq $3*8,%rsp + CFI_ADJUST_CFA_OFFSET 3*8 movq %rbx,(%rsp) + CFI_REL_OFFSET rbx, 0 movq %r12,1*8(%rsp) + CFI_REL_OFFSET r12, 1*8 movq %r13,2*8(%rsp) + CFI_REL_OFFSET r13, 2*8 movl $(4096/64)-5,%ecx .p2align 4 @@ -72,30 +88,33 @@ copy_page: jnz .Loop2 movq (%rsp),%rbx + CFI_RESTORE rbx movq 1*8(%rsp),%r12 + CFI_RESTORE r12 movq 2*8(%rsp),%r13 + CFI_RESTORE r13 addq $3*8,%rsp + CFI_ADJUST_CFA_OFFSET -3*8 ret +.Lcopy_page_end: + CFI_ENDPROC +ENDPROC(copy_page) /* Some CPUs run faster using the string copy instructions. It is also a lot simpler. Use this when possible */ #include <asm/cpufeature.h> + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp <disp8> */ + .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */ +2: + .previous .section .altinstructions,"a" .align 8 - .quad copy_page - .quad copy_page_c - .byte X86_FEATURE_REP_GOOD - .byte copy_page_c_end-copy_page_c - .byte copy_page_c_end-copy_page_c - .previous - - .section .altinstr_replacement,"ax" -copy_page_c: - movl $4096/8,%ecx - rep - movsq - ret -copy_page_c_end: + .quad copy_page + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lcopy_page_end - copy_page + .byte 2b - 1b .previous diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S index f64569b83b54..962f3a693c5e 100644 --- a/arch/x86_64/lib/copy_user.S +++ b/arch/x86_64/lib/copy_user.S @@ -4,6 +4,9 @@ * Functions to copy from and to user space. 
*/ +#include <linux/linkage.h> +#include <asm/dwarf2.h> + #define FIX_ALIGNMENT 1 #include <asm/current.h> @@ -12,9 +15,8 @@ #include <asm/cpufeature.h> /* Standard copy_to_user with segment limit checking */ - .globl copy_to_user - .p2align 4 -copy_to_user: +ENTRY(copy_to_user) + CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rdi,%rcx addq %rdx,%rcx @@ -25,9 +27,11 @@ copy_to_user: .byte 0xe9 /* 32bit jump */ .long .Lcug-1f 1: + CFI_ENDPROC +ENDPROC(copy_to_user) .section .altinstr_replacement,"ax" -3: .byte 0xe9 /* replacement jmp with 8 bit immediate */ +3: .byte 0xe9 /* replacement jmp with 32 bit immediate */ .long copy_user_generic_c-1b /* offset */ .previous .section .altinstructions,"a" @@ -40,9 +44,8 @@ copy_to_user: .previous /* Standard copy_from_user with segment limit checking */ - .globl copy_from_user - .p2align 4 -copy_from_user: +ENTRY(copy_from_user) + CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rsi,%rcx addq %rdx,%rcx @@ -50,10 +53,13 @@ copy_from_user: cmpq threadinfo_addr_limit(%rax),%rcx jae bad_from_user /* FALL THROUGH to copy_user_generic */ + CFI_ENDPROC +ENDPROC(copy_from_user) .section .fixup,"ax" /* must zero dest */ bad_from_user: + CFI_STARTPROC movl %edx,%ecx xorl %eax,%eax rep @@ -61,6 +67,8 @@ bad_from_user: bad_to_user: movl %edx,%eax ret + CFI_ENDPROC +END(bad_from_user) .previous @@ -75,9 +83,8 @@ bad_to_user: * Output: * eax uncopied bytes or 0 if successful. */ - .globl copy_user_generic - .p2align 4 -copy_user_generic: +ENTRY(copy_user_generic) + CFI_STARTPROC .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */ .byte 0x66,0x90 1: @@ -95,6 +102,8 @@ copy_user_generic: .previous .Lcug: pushq %rbx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbx, 0 xorl %eax,%eax /*zero for the exception handler */ #ifdef FIX_ALIGNMENT @@ -168,9 +177,13 @@ copy_user_generic: decl %ecx jnz .Lloop_1 + CFI_REMEMBER_STATE .Lende: popq %rbx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rbx ret + CFI_RESTORE_STATE #ifdef FIX_ALIGNMENT /* align destination */ @@ -261,6 +274,9 @@ copy_user_generic: .Le_zero: movq %rdx,%rax jmp .Lende + CFI_ENDPROC +ENDPROC(copy_user_generic) + /* Some CPUs run faster using the string copy instructions. This is also a lot simpler. Use them when possible. @@ -282,6 +298,7 @@ copy_user_generic: * this please consider this. */ copy_user_generic_c: + CFI_STARTPROC movl %edx,%ecx shrl $3,%ecx andl $7,%edx @@ -294,6 +311,8 @@ copy_user_generic_c: ret 3: lea (%rdx,%rcx,8),%rax ret + CFI_ENDPROC +END(copy_user_generic_c) .section __ex_table,"a" .quad 1b,3b diff --git a/arch/x86_64/lib/csum-copy.S b/arch/x86_64/lib/csum-copy.S index 72fd55ee896e..f0dba36578ea 100644 --- a/arch/x86_64/lib/csum-copy.S +++ b/arch/x86_64/lib/csum-copy.S @@ -5,8 +5,9 @@ * License. See the file COPYING in the main directory of this archive * for more details. No warranty for anything given at all. */ - #include <linux/linkage.h> - #include <asm/errno.h> +#include <linux/linkage.h> +#include <asm/dwarf2.h> +#include <asm/errno.h> /* * Checksum copy with exception handling. 
@@ -53,19 +54,24 @@ .endm - .globl csum_partial_copy_generic - .p2align 4 -csum_partial_copy_generic: +ENTRY(csum_partial_copy_generic) + CFI_STARTPROC cmpl $3*64,%edx jle .Lignore .Lignore: subq $7*8,%rsp + CFI_ADJUST_CFA_OFFSET 7*8 movq %rbx,2*8(%rsp) + CFI_REL_OFFSET rbx, 2*8 movq %r12,3*8(%rsp) + CFI_REL_OFFSET r12, 3*8 movq %r14,4*8(%rsp) + CFI_REL_OFFSET r14, 4*8 movq %r13,5*8(%rsp) + CFI_REL_OFFSET r13, 5*8 movq %rbp,6*8(%rsp) + CFI_REL_OFFSET rbp, 6*8 movq %r8,(%rsp) movq %r9,1*8(%rsp) @@ -208,14 +214,22 @@ csum_partial_copy_generic: addl %ebx,%eax adcl %r9d,%eax /* carry */ + CFI_REMEMBER_STATE .Lende: movq 2*8(%rsp),%rbx + CFI_RESTORE rbx movq 3*8(%rsp),%r12 + CFI_RESTORE r12 movq 4*8(%rsp),%r14 + CFI_RESTORE r14 movq 5*8(%rsp),%r13 + CFI_RESTORE r13 movq 6*8(%rsp),%rbp + CFI_RESTORE rbp addq $7*8,%rsp + CFI_ADJUST_CFA_OFFSET -7*8 ret + CFI_RESTORE_STATE /* Exception handlers. Very simple, zeroing is done in the wrappers */ .Lbad_source: @@ -231,3 +245,5 @@ csum_partial_copy_generic: jz .Lende movl $-EFAULT,(%rax) jmp .Lende + CFI_ENDPROC +ENDPROC(csum_partial_copy_generic) diff --git a/arch/x86_64/lib/getuser.S b/arch/x86_64/lib/getuser.S index 3844d5e885a4..5448876261f8 100644 --- a/arch/x86_64/lib/getuser.S +++ b/arch/x86_64/lib/getuser.S @@ -27,25 +27,26 @@ */ #include <linux/linkage.h> +#include <asm/dwarf2.h> #include <asm/page.h> #include <asm/errno.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> .text - .p2align 4 -.globl __get_user_1 -__get_user_1: +ENTRY(__get_user_1) + CFI_STARTPROC GET_THREAD_INFO(%r8) cmpq threadinfo_addr_limit(%r8),%rcx jae bad_get_user 1: movzb (%rcx),%edx xorl %eax,%eax ret + CFI_ENDPROC +ENDPROC(__get_user_1) - .p2align 4 -.globl __get_user_2 -__get_user_2: +ENTRY(__get_user_2) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $1,%rcx jc 20f @@ -57,10 +58,11 @@ __get_user_2: ret 20: decq %rcx jmp bad_get_user + CFI_ENDPROC +ENDPROC(__get_user_2) - .p2align 4 -.globl __get_user_4 -__get_user_4: +ENTRY(__get_user_4) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $3,%rcx jc 30f @@ -72,10 +74,11 @@ __get_user_4: ret 30: subq $3,%rcx jmp bad_get_user + CFI_ENDPROC +ENDPROC(__get_user_4) - .p2align 4 -.globl __get_user_8 -__get_user_8: +ENTRY(__get_user_8) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $7,%rcx jc 40f @@ -87,11 +90,16 @@ __get_user_8: ret 40: subq $7,%rcx jmp bad_get_user + CFI_ENDPROC +ENDPROC(__get_user_8) bad_get_user: + CFI_STARTPROC xorl %edx,%edx movq $(-EFAULT),%rax ret + CFI_ENDPROC +END(bad_get_user) .section __ex_table,"a" .quad 1b,bad_get_user diff --git a/arch/x86_64/lib/iomap_copy.S b/arch/x86_64/lib/iomap_copy.S index 8bbade5fea05..05a95e713da8 100644 --- a/arch/x86_64/lib/iomap_copy.S +++ b/arch/x86_64/lib/iomap_copy.S @@ -15,12 +15,16 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ +#include <linux/linkage.h> +#include <asm/dwarf2.h> + /* * override generic version in lib/iomap_copy.c */ - .globl __iowrite32_copy - .p2align 4 -__iowrite32_copy: +ENTRY(__iowrite32_copy) + CFI_STARTPROC movl %edx,%ecx rep movsd ret + CFI_ENDPROC +ENDPROC(__iowrite32_copy) diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S index 5554948b5554..967b22fa7d07 100644 --- a/arch/x86_64/lib/memcpy.S +++ b/arch/x86_64/lib/memcpy.S @@ -1,6 +1,10 @@ /* Copyright 2002 Andi Kleen */ - #include <asm/cpufeature.h> +#include <linux/config.h> +#include <linux/linkage.h> +#include <asm/dwarf2.h> +#include <asm/cpufeature.h> + /* * memcpy - Copy a memory block. 
* @@ -13,12 +17,26 @@ * rax original destination */ - .globl __memcpy - .globl memcpy - .p2align 4 -__memcpy: -memcpy: + ALIGN +memcpy_c: + CFI_STARTPROC + movq %rdi,%rax + movl %edx,%ecx + shrl $3,%ecx + andl $7,%edx + rep movsq + movl %edx,%ecx + rep movsb + ret + CFI_ENDPROC +ENDPROC(memcpy_c) + +ENTRY(__memcpy) +ENTRY(memcpy) + CFI_STARTPROC pushq %rbx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbx, 0 movq %rdi,%rax movl %edx,%ecx @@ -86,36 +104,27 @@ memcpy: .Lende: popq %rbx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rbx ret .Lfinal: + CFI_ENDPROC +ENDPROC(memcpy) +ENDPROC(__memcpy) /* Some CPUs run faster using the string copy instructions. It is also a lot simpler. Use this when possible */ + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp <disp8> */ + .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ +2: + .previous .section .altinstructions,"a" .align 8 - .quad memcpy - .quad memcpy_c - .byte X86_FEATURE_REP_GOOD - .byte .Lfinal-memcpy - .byte memcpy_c_end-memcpy_c - .previous - - .section .altinstr_replacement,"ax" - /* rdi destination - * rsi source - * rdx count - */ -memcpy_c: - movq %rdi,%rax - movl %edx,%ecx - shrl $3,%ecx - andl $7,%edx - rep - movsq - movl %edx,%ecx - rep - movsb - ret -memcpy_c_end: + .quad memcpy + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lfinal - memcpy + .byte 2b - 1b .previous diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S index ad397f2c7de8..09ed1f6b0eaa 100644 --- a/arch/x86_64/lib/memset.S +++ b/arch/x86_64/lib/memset.S @@ -1,4 +1,9 @@ /* Copyright 2002 Andi Kleen, SuSE Labs */ + +#include <linux/config.h> +#include <linux/linkage.h> +#include <asm/dwarf2.h> + /* * ISO C memset - set a memory block to a byte value. * @@ -8,11 +13,29 @@ * * rax original destination */ - .globl __memset - .globl memset - .p2align 4 -memset: -__memset: + ALIGN +memset_c: + CFI_STARTPROC + movq %rdi,%r9 + movl %edx,%r8d + andl $7,%r8d + movl %edx,%ecx + shrl $3,%ecx + /* expand byte value */ + movzbl %sil,%esi + movabs $0x0101010101010101,%rax + mulq %rsi /* with rax, clobbers rdx */ + rep stosq + movl %r8d,%ecx + rep stosb + movq %r9,%rax + ret + CFI_ENDPROC +ENDPROC(memset_c) + +ENTRY(memset) +ENTRY(__memset) + CFI_STARTPROC movq %rdi,%r10 movq %rdx,%r11 @@ -25,6 +48,7 @@ __memset: movl %edi,%r9d andl $7,%r9d jnz .Lbad_alignment + CFI_REMEMBER_STATE .Lafter_bad_alignment: movl %r11d,%ecx @@ -75,6 +99,7 @@ __memset: movq %r10,%rax ret + CFI_RESTORE_STATE .Lbad_alignment: cmpq $7,%r11 jbe .Lhandle_7 @@ -84,42 +109,26 @@ __memset: addq %r8,%rdi subq %r8,%r11 jmp .Lafter_bad_alignment +.Lfinal: + CFI_ENDPROC +ENDPROC(memset) +ENDPROC(__memset) /* Some CPUs run faster using the string instructions. It is also a lot simpler. 
Use this when possible */ #include <asm/cpufeature.h> + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp <disp8> */ + .byte (memset_c - memset) - (2f - 1b) /* offset */ +2: + .previous .section .altinstructions,"a" .align 8 - .quad memset - .quad memset_c - .byte X86_FEATURE_REP_GOOD - .byte memset_c_end-memset_c - .byte memset_c_end-memset_c - .previous - - .section .altinstr_replacement,"ax" - /* rdi destination - * rsi value - * rdx count - */ -memset_c: - movq %rdi,%r9 - movl %edx,%r8d - andl $7,%r8d - movl %edx,%ecx - shrl $3,%ecx - /* expand byte value */ - movzbl %sil,%esi - movabs $0x0101010101010101,%rax - mulq %rsi /* with rax, clobbers rdx */ - rep - stosq - movl %r8d,%ecx - rep - stosb - movq %r9,%rax - ret -memset_c_end: + .quad memset + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lfinal - memset + .byte 2b - 1b .previous diff --git a/arch/x86_64/lib/putuser.S b/arch/x86_64/lib/putuser.S index 7f5593974e2d..4989f5a8fa9b 100644 --- a/arch/x86_64/lib/putuser.S +++ b/arch/x86_64/lib/putuser.S @@ -25,25 +25,26 @@ */ #include <linux/linkage.h> +#include <asm/dwarf2.h> #include <asm/page.h> #include <asm/errno.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> .text - .p2align 4 -.globl __put_user_1 -__put_user_1: +ENTRY(__put_user_1) + CFI_STARTPROC GET_THREAD_INFO(%r8) cmpq threadinfo_addr_limit(%r8),%rcx jae bad_put_user 1: movb %dl,(%rcx) xorl %eax,%eax ret + CFI_ENDPROC +ENDPROC(__put_user_1) - .p2align 4 -.globl __put_user_2 -__put_user_2: +ENTRY(__put_user_2) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $1,%rcx jc 20f @@ -55,10 +56,11 @@ __put_user_2: ret 20: decq %rcx jmp bad_put_user + CFI_ENDPROC +ENDPROC(__put_user_2) - .p2align 4 -.globl __put_user_4 -__put_user_4: +ENTRY(__put_user_4) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $3,%rcx jc 30f @@ -70,10 +72,11 @@ __put_user_4: ret 30: subq $3,%rcx jmp bad_put_user + CFI_ENDPROC +ENDPROC(__put_user_4) - .p2align 4 -.globl __put_user_8 -__put_user_8: +ENTRY(__put_user_8) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $7,%rcx jc 40f @@ -85,10 +88,15 @@ __put_user_8: ret 40: subq $7,%rcx jmp bad_put_user + CFI_ENDPROC +ENDPROC(__put_user_8) bad_put_user: + CFI_STARTPROC movq $(-EFAULT),%rax ret + CFI_ENDPROC +END(bad_put_user) .section __ex_table,"a" .quad 1b,bad_put_user -- cgit v1.2.3 From 29a6c25bd63dbd52e73d25268402d065ad84c0fe Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" <ebiederm@xmission.com> Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] Fix gdt table size in trampoline.S Allows easier extension of the GDT by using the proper C symbol for the size in the descriptor. Signed-off-by: Eric W. 
Biederman <ebiederm@xmission.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/trampoline.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S index 23a03eb91fc7..c79b99a9e2f6 100644 --- a/arch/x86_64/kernel/trampoline.S +++ b/arch/x86_64/kernel/trampoline.S @@ -64,7 +64,7 @@ idt_48: .word 0, 0 # idt base = 0L gdt_48: - .short __KERNEL32_CS + 7 # gdt limit + .short GDT_ENTRIES*8 - 1 # gdt limit .long cpu_gdt_table-__START_KERNEL_map .globl trampoline_end -- cgit v1.2.3 From 145106e81090fcdfb57b7f2e90b98c6445b6f33d Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda <muli@il.ibm.com> Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] remove superflous BUG_ON's in nommu and gart There's no need to check for invalid DMA data direction in nommu and gart since we do it in dma-mapping.h anyway before calling the individual dma-ops. Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/pci-gart.c | 3 --- arch/x86_64/kernel/pci-nommu.c | 1 - 2 files changed, 4 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index 6d3e61baf7a0..16261a8a3303 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -239,8 +239,6 @@ dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) { unsigned long phys_mem, bus; - BUG_ON(dir == DMA_NONE); - if (!dev) dev = &fallback_dev; @@ -383,7 +381,6 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) unsigned long pages = 0; int need = 0, nextneed; - BUG_ON(dir == DMA_NONE); if (nents == 0) return 0; diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c index aad7609d8e92..df09ab05a1bd 100644 --- a/arch/x86_64/kernel/pci-nommu.c +++ b/arch/x86_64/kernel/pci-nommu.c @@ -59,7 +59,6 @@ int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, { int i; - BUG_ON(direction == DMA_NONE); for (i = 0; i < nents; i++ ) { struct scatterlist *s = &sg[i]; BUG_ON(!s->page); -- cgit v1.2.3 From d4d35854a1cf7fb170c8e133a5e40a9d19e7fc17 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] remove lock prefix from is_at_popf() tests The lock prefix will cause an exception when used with the popf instruction, so no need to continue searching after it's found. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/ptrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index 3a52c7bcfd12..412c6a8bf906 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -138,7 +138,7 @@ static int is_at_popf(struct task_struct *child, struct pt_regs *regs) case 0x26: case 0x2e: case 0x36: case 0x3e: case 0x64: case 0x65: - case 0xf0: case 0xf2: case 0xf3: + case 0xf2: case 0xf3: continue; case 0x40 ... 0x4f: @@ -148,7 +148,7 @@ static int is_at_popf(struct task_struct *child, struct pt_regs *regs) /* 64-bit mode: REX prefix */ continue; - /* CHECKME: f0, f2, f3 */ + /* CHECKME: f2, f3 */ /* * pushf: NOTE! 
We should probably not let -- cgit v1.2.3 From 9ca33eb6981549c0d1b7aea7f99f1ba602161356 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] Use early CPU identify before early command line parsing This makes it possible to modify CPU flags in command line options without hacks. It also removes another copy of this code in head64.c. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/head64.c | 19 ------------------- arch/x86_64/kernel/setup.c | 4 ++-- 2 files changed, 2 insertions(+), 21 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index bacbd75c63ad..f2461fde9f8f 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -56,24 +56,6 @@ static void __init copy_bootdata(char *real_mode_data) printk("Bootdata ok (command line is %s)\n", saved_command_line); } -static void __init setup_boot_cpu_data(void) -{ - unsigned int dummy, eax; - - /* get vendor info */ - cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level, - (unsigned int *)&boot_cpu_data.x86_vendor_id[0], - (unsigned int *)&boot_cpu_data.x86_vendor_id[8], - (unsigned int *)&boot_cpu_data.x86_vendor_id[4]); - - /* get cpu type */ - cpuid(1, &eax, &dummy, &dummy, - (unsigned int *) &boot_cpu_data.x86_capability); - boot_cpu_data.x86 = (eax >> 8) & 0xf; - boot_cpu_data.x86_model = (eax >> 4) & 0xf; - boot_cpu_data.x86_mask = eax & 0xf; -} - void __init x86_64_start_kernel(char * real_mode_data) { char *s; @@ -117,6 +99,5 @@ void __init x86_64_start_kernel(char * real_mode_data) if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE) panic("Kernel too big for kernel mapping\n"); - setup_boot_cpu_data(); start_kernel(); } diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index ddc3b2d9a165..73f1cdd140fe 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -545,10 +545,10 @@ void __init setup_arch(char **cmdline_p) data_resource.start = virt_to_phys(&_etext); data_resource.end = virt_to_phys(&_edata)-1; - parse_cmdline_early(cmdline_p); - early_identify_cpu(&boot_cpu_data); + parse_cmdline_early(cmdline_p); + /* * partially used pages are not usable - thus * we are rounding upwards: -- cgit v1.2.3 From 2c8c0e6b8d7700a990da8d24eff767f9ca223b96 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] Convert x86-64 to early param This replaces the hackish manual parsing. Requires the earlier i386 patchkit, but also fixes i386 early_printk again. I removed some obsolete really early parameters which didn't do anything useful. Also made a few parameters early that needed it (mostly the oops printing setup). Also removed one panic check that wasn't visible without an early console anyway (the early console is now initialized after that panic). This cleans up a lot of code.
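[Editor's note: a minimal sketch of the early_param() pattern this series converts to, for illustration only. The "foo" option name and the handler body are hypothetical; the conventions (NULL-argument check, 0/-EINVAL return, registration macro) follow the conversions in this patch.]

static int __init foo_setup(char *arg)
{
	if (!arg)	/* early_param handlers must tolerate a missing argument */
		return -EINVAL;
	/* ... parse arg here ... */
	return 0;	/* 0 = handled, nonzero reports a bad option */
}
early_param("foo", foo_setup);

parse_early_param(), called from setup_arch() as shown below, walks the registered handlers long before the regular __setup() option processing runs.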
Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/apic.c | 34 ++++--- arch/x86_64/kernel/e820.c | 53 ++++++++-- arch/x86_64/kernel/early_printk.c | 20 ++-- arch/x86_64/kernel/head64.c | 15 --- arch/x86_64/kernel/io_apic.c | 15 ++- arch/x86_64/kernel/machine_kexec.c | 28 ++++++ arch/x86_64/kernel/pci-dma.c | 7 +- arch/x86_64/kernel/setup.c | 197 ++++--------------------------------- arch/x86_64/kernel/setup64.c | 8 +- arch/x86_64/kernel/smpboot.c | 6 +- arch/x86_64/kernel/traps.c | 24 +++-- arch/x86_64/mm/numa.c | 9 +- include/asm-x86_64/apic.h | 4 +- include/asm-x86_64/e820.h | 3 +- include/asm-x86_64/pgtable.h | 1 - include/asm-x86_64/proto.h | 10 -- 16 files changed, 159 insertions(+), 275 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 496cd1bd2ae5..660602c51f51 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -36,6 +36,7 @@ #include <asm/idle.h> #include <asm/proto.h> #include <asm/timex.h> +#include <asm/apic.h> int apic_verbosity; int apic_runs_main_timer; @@ -546,18 +547,24 @@ static void apic_pm_activate(void) { } static int __init apic_set_verbosity(char *str) { + if (str == NULL) { + skip_ioapic_setup = 0; + ioapic_force = 1; + return 0; + } if (strcmp("debug", str) == 0) apic_verbosity = APIC_DEBUG; else if (strcmp("verbose", str) == 0) apic_verbosity = APIC_VERBOSE; - else + else { printk(KERN_WARNING "APIC Verbosity level %s not recognised" - " use apic=verbose or apic=debug", str); + " use apic=verbose or apic=debug\n", str); + return -EINVAL; + } - return 1; + return 0; } - -__setup("apic=", apic_set_verbosity); +early_param("apic", apic_set_verbosity); /* * Detect and enable local APICs on non-SMP boards. @@ -1078,14 +1085,17 @@ int __init APIC_init_uniprocessor (void) static __init int setup_disableapic(char *str) { disable_apic = 1; - return 1; -} + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + return 0; +} +early_param("disableapic", setup_disableapic); +/* same as disableapic, for compatibility */ static __init int setup_nolapic(char *str) { - disable_apic = 1; - return 1; + return setup_disableapic(str); } +early_param("nolapic", setup_nolapic); static __init int setup_noapictimer(char *str) { @@ -1118,11 +1128,5 @@ static __init int setup_apicpmtimer(char *s) } __setup("apicpmtimer", setup_apicpmtimer); -/* dummy parsing: see setup.c */ - -__setup("disableapic", setup_disableapic); -__setup("nolapic", setup_nolapic); /* same as disableapic, for compatibility */ - __setup("noapictimer", setup_noapictimer); -/* no "lapic" flag - we only use the lapic when the BIOS tells us so. 
*/ diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 1362aad4f3aa..a4739dc82937 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -596,31 +596,64 @@ void __init setup_memory_region(void) e820_print_map(who); } -void __init parse_memopt(char *p, char **from) -{ - end_user_pfn = memparse(p, from); +static int __init parse_memopt(char *p) +{ + if (!p) + return -EINVAL; + end_user_pfn = memparse(p, &p); end_user_pfn >>= PAGE_SHIFT; + return 0; } +early_param("mem", parse_memopt); + +static int userdef __initdata; -void __init parse_memmapopt(char *p, char **from) +static int __init parse_memmap_opt(char *p) { + char *oldp; unsigned long long start_at, mem_size; - mem_size = memparse(p, from); - p = *from; + if (!strcmp(p, "exactmap")) { +#ifdef CONFIG_CRASH_DUMP + /* If we are doing a crash dump, we + * still need to know the real mem + * size before original memory map is + * reset. + */ + saved_max_pfn = e820_end_of_ram(); +#endif + end_pfn_map = 0; + e820.nr_map = 0; + userdef = 1; + return 0; + } + + oldp = p; + mem_size = memparse(p, &p); + if (p == oldp) + return -EINVAL; if (*p == '@') { - start_at = memparse(p+1, from); + start_at = memparse(p+1, &p); add_memory_region(start_at, mem_size, E820_RAM); } else if (*p == '#') { - start_at = memparse(p+1, from); + start_at = memparse(p+1, &p); add_memory_region(start_at, mem_size, E820_ACPI); } else if (*p == '$') { - start_at = memparse(p+1, from); + start_at = memparse(p+1, &p); add_memory_region(start_at, mem_size, E820_RESERVED); } else { end_user_pfn = (mem_size >> PAGE_SHIFT); } - p = *from; + return *p == '\0' ? 0 : -EINVAL; +} +early_param("memmap", parse_memmap_opt); + +void finish_e820_parsing(void) +{ + if (userdef) { + printk(KERN_INFO "user-defined physical RAM map:\n"); + e820_print_map("user"); + } } unsigned long pci_mem_start = 0xaeedbabe; diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index 140051e07fa6..e22ecd54870d 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c @@ -215,20 +215,16 @@ void early_printk(const char *fmt, ...) 
static int __initdata keep_early; -int __init setup_early_printk(char *opt) +static int __init setup_early_printk(char *buf) { - char *space; - char buf[256]; + if (!buf) + return 0; if (early_console_initialized) - return 1; - - strlcpy(buf,opt,sizeof(buf)); - space = strchr(buf, ' '); - if (space) - *space = 0; + return 0; + early_console_initialized = 1; - if (strstr(buf,"keep")) + if (!strcmp(buf,"keep")) keep_early = 1; if (!strncmp(buf, "serial", 6)) { @@ -248,11 +244,12 @@ int __init setup_early_printk(char *opt) early_console = &simnow_console; keep_early = 1; } - early_console_initialized = 1; register_console(early_console); return 0; } +early_param("earlyprintk", setup_early_printk); + void __init disable_early_printk(void) { if (!early_console_initialized || !early_console) @@ -266,4 +263,3 @@ void __init disable_early_printk(void) } } -__setup("earlyprintk=", setup_early_printk); diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index f2461fde9f8f..6716cbfc34ac 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -58,7 +58,6 @@ static void __init copy_bootdata(char *real_mode_data) void __init x86_64_start_kernel(char * real_mode_data) { - char *s; int i; for (i = 0; i < 256; i++) @@ -85,19 +84,5 @@ void __init x86_64_start_kernel(char * real_mode_data) #ifdef CONFIG_SMP cpu_set(0, cpu_online_map); #endif - s = strstr(saved_command_line, "earlyprintk="); - if (s != NULL) - setup_early_printk(strchr(s, '=') + 1); -#ifdef CONFIG_NUMA - s = strstr(saved_command_line, "numa="); - if (s != NULL) - numa_setup(s+5); -#endif - if (strstr(saved_command_line, "disableapic")) - disable_apic = 1; - /* You need early console to see that */ - if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE) - panic("Kernel too big for kernel mapping\n"); - start_kernel(); } diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index a1412042b918..afac3dbb3729 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -48,7 +48,7 @@ int sis_apic_bug; /* not actually supported, dummy for compile */ static int no_timer_check; -int disable_timer_pin_1 __initdata; +static int disable_timer_pin_1 __initdata; int timer_over_8254 __initdata = 0; @@ -253,18 +253,17 @@ int ioapic_force; static int __init disable_ioapic_setup(char *str) { skip_ioapic_setup = 1; - return 1; + return 0; } +early_param("noapic", disable_ioapic_setup); -static int __init enable_ioapic_setup(char *str) +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ +static int __init disable_timer_pin_setup(char *arg) { - ioapic_force = 1; - skip_ioapic_setup = 0; + disable_timer_pin_1 = 1; return 1; } - -__setup("noapic", disable_ioapic_setup); -__setup("apic", enable_ioapic_setup); +__setup("disable_timer_pin_1", disable_timer_pin_setup); static int __init setup_disable_8254_timer(char *s) { diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c index 106076b370fc..2e94c072d84a 100644 --- a/arch/x86_64/kernel/machine_kexec.c +++ b/arch/x86_64/kernel/machine_kexec.c @@ -226,3 +226,31 @@ NORET_TYPE void machine_kexec(struct kimage *image) rnk = (relocate_new_kernel_t) control_code_buffer; (*rnk)(page_list, control_code_buffer, image->start, start_pgtable); } + +/* crashkernel=size@addr specifies the location to reserve for + * a crash kernel. By reserving this memory we guarantee + * that linux never set's it up as a DMA target. + * Useful for holding code to do something appropriate + * after a kernel panic. 
+ */ +static int __init setup_crashkernel(char *arg) +{ + unsigned long size, base; + char *p; + if (!arg) + return -EINVAL; + size = memparse(arg, &p); + if (arg == p) + return -EINVAL; + if (*p == '@') { + base = memparse(p+1, &p); + /* FIXME: Do I want a sanity check to validate the + * memory range? Yes you do, but it's too early for + * e820 -AK */ + crashk_res.start = base; + crashk_res.end = base + size - 1; + } + return 0; +} +early_param("crashkernel", setup_crashkernel); + diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index 9c44f4f2433d..4dcb671bd19f 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -236,6 +236,9 @@ __init int iommu_setup(char *p) { iommu_merge = 1; + if (!p) + return -EINVAL; + while (*p) { if (!strncmp(p,"off",3)) no_iommu = 1; @@ -278,9 +281,9 @@ __init int iommu_setup(char *p) if (*p == ',') ++p; } - return 1; + return 0; } -__setup("iommu=", iommu_setup); +early_param("iommu", iommu_setup); void __init pci_iommu_alloc(void) { diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 73f1cdd140fe..f55540a6084e 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -76,11 +76,6 @@ unsigned long mmu_cr4_features; int acpi_disabled; EXPORT_SYMBOL(acpi_disabled); -#ifdef CONFIG_ACPI -extern int __initdata acpi_ht; -extern acpi_interrupt_flags acpi_sci_flags; -int __initdata acpi_force = 0; -#endif int acpi_numa __initdata; @@ -276,183 +271,22 @@ static void __init probe_roms(void) } } -/* Check for full argument with no trailing characters */ -static int fullarg(char *p, char *arg) +#ifdef CONFIG_PROC_VMCORE +/* elfcorehdr= specifies the location of elf core header + * stored by the crashed kernel. This option will be passed + * by kexec loader to the capture kernel. + */ +static int __init setup_elfcorehdr(char *arg) { - int l = strlen(arg); - return !memcmp(p, arg, l) && (p[l] == 0 || isspace(p[l])); + char *end; + if (!arg) + return -EINVAL; + elfcorehdr_addr = memparse(arg, &end); + return end > arg ? 0 : -EINVAL; } - -static __init void parse_cmdline_early (char ** cmdline_p) -{ - char c = ' ', *to = command_line, *from = COMMAND_LINE; - int len = 0; - int userdef = 0; - - for (;;) { - if (c != ' ') - goto next_char; - -#ifdef CONFIG_SMP - /* - * If the BIOS enumerates physical processors before logical, - * maxcpus=N at enumeration-time can be used to disable HT. 
- */ - else if (!memcmp(from, "maxcpus=", 8)) { - extern unsigned int maxcpus; - - maxcpus = simple_strtoul(from + 8, NULL, 0); - } -#endif -#ifdef CONFIG_ACPI - /* "acpi=off" disables both ACPI table parsing and interpreter init */ - if (fullarg(from,"acpi=off")) - disable_acpi(); - - if (fullarg(from, "acpi=force")) { - /* add later when we do DMI horrors: */ - acpi_force = 1; - acpi_disabled = 0; - } - - /* acpi=ht just means: do ACPI MADT parsing - at bootup, but don't enable the full ACPI interpreter */ - if (fullarg(from, "acpi=ht")) { - if (!acpi_force) - disable_acpi(); - acpi_ht = 1; - } - else if (fullarg(from, "pci=noacpi")) - acpi_disable_pci(); - else if (fullarg(from, "acpi=noirq")) - acpi_noirq_set(); - - else if (fullarg(from, "acpi_sci=edge")) - acpi_sci_flags.trigger = 1; - else if (fullarg(from, "acpi_sci=level")) - acpi_sci_flags.trigger = 3; - else if (fullarg(from, "acpi_sci=high")) - acpi_sci_flags.polarity = 1; - else if (fullarg(from, "acpi_sci=low")) - acpi_sci_flags.polarity = 3; - - /* acpi=strict disables out-of-spec workarounds */ - else if (fullarg(from, "acpi=strict")) { - acpi_strict = 1; - } - else if (fullarg(from, "acpi_skip_timer_override")) - acpi_skip_timer_override = 1; +early_param("elfcorehdr", setup_elfcorehdr); #endif - if (fullarg(from, "disable_timer_pin_1")) - disable_timer_pin_1 = 1; - if (fullarg(from, "enable_timer_pin_1")) - disable_timer_pin_1 = -1; - - if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) { - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); - disable_apic = 1; - } - - if (fullarg(from, "noapic")) - skip_ioapic_setup = 1; - - if (fullarg(from,"apic")) { - skip_ioapic_setup = 0; - ioapic_force = 1; - } - - if (!memcmp(from, "mem=", 4)) - parse_memopt(from+4, &from); - - if (!memcmp(from, "memmap=", 7)) { - /* exactmap option is for used defined memory */ - if (!memcmp(from+7, "exactmap", 8)) { -#ifdef CONFIG_CRASH_DUMP - /* If we are doing a crash dump, we - * still need to know the real mem - * size before original memory map is - * reset. - */ - saved_max_pfn = e820_end_of_ram(); -#endif - from += 8+7; - end_pfn_map = 0; - e820.nr_map = 0; - userdef = 1; - } - else { - parse_memmapopt(from+7, &from); - userdef = 1; - } - } - -#ifdef CONFIG_NUMA - if (!memcmp(from, "numa=", 5)) - numa_setup(from+5); -#endif - - if (!memcmp(from,"iommu=",6)) { - iommu_setup(from+6); - } - - if (fullarg(from,"oops=panic")) - panic_on_oops = 1; - - if (!memcmp(from, "noexec=", 7)) - nonx_setup(from + 7); - -#ifdef CONFIG_KEXEC - /* crashkernel=size@addr specifies the location to reserve for - * a crash kernel. By reserving this memory we guarantee - * that linux never set's it up as a DMA target. - * Useful for holding code to do something appropriate - * after a kernel panic. - */ - else if (!memcmp(from, "crashkernel=", 12)) { - unsigned long size, base; - size = memparse(from+12, &from); - if (*from == '@') { - base = memparse(from+1, &from); - /* FIXME: Do I want a sanity check - * to validate the memory range? - */ - crashk_res.start = base; - crashk_res.end = base + size - 1; - } - } -#endif - -#ifdef CONFIG_PROC_VMCORE - /* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. This option will be passed - * by kexec loader to the capture kernel. 
- */ - else if(!memcmp(from, "elfcorehdr=", 11)) - elfcorehdr_addr = memparse(from+11, &from); -#endif - -#ifdef CONFIG_HOTPLUG_CPU - else if (!memcmp(from, "additional_cpus=", 16)) - setup_additional_cpus(from+16); -#endif - - next_char: - c = *(from++); - if (!c) - break; - if (COMMAND_LINE_SIZE <= ++len) - break; - *(to++) = c; - } - if (userdef) { - printk(KERN_INFO "user-defined physical RAM map:\n"); - e820_print_map("user"); - } - *to = '\0'; - *cmdline_p = command_line; -} - #ifndef CONFIG_NUMA static void __init contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) @@ -547,7 +381,12 @@ void __init setup_arch(char **cmdline_p) early_identify_cpu(&boot_cpu_data); - parse_cmdline_early(cmdline_p); + strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + + parse_early_param(); + + finish_e820_parsing(); /* * partially used pages are not usable - thus diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 417de564456e..646caa0a20bb 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -46,8 +46,10 @@ Control non executable mappings for 64bit processes. on Enable(default) off Disable */ -int __init nonx_setup(char *str) +static int __init nonx_setup(char *str) { + if (!str) + return -EINVAL; if (!strncmp(str, "on", 2)) { __supported_pte_mask |= _PAGE_NX; do_not_nx = 0; @@ -55,9 +57,9 @@ int __init nonx_setup(char *str) do_not_nx = 1; __supported_pte_mask &= ~_PAGE_NX; } - return 1; + return 0; } -__setup("noexec=", nonx_setup); /* parsed early actually */ +early_param("noexec", nonx_setup); int force_personality32 = 0; diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index ae7d6d84e352..0b9f28b05e7d 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -1270,11 +1270,11 @@ void __cpu_die(unsigned int cpu) printk(KERN_ERR "CPU %u didn't die...\n", cpu); } -__init int setup_additional_cpus(char *s) +static __init int setup_additional_cpus(char *s) { - return get_option(&s, &additional_cpus); + return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL; } -__setup("additional_cpus=", setup_additional_cpus); +early_param("additional_cpus", setup_additional_cpus); #else /* ... !CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 56d7ff0c894c..9ec2b1d5893d 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -1119,24 +1119,30 @@ void __init trap_init(void) } -/* Actual parsing is done early in setup.c. 
*/ -static int __init oops_dummy(char *s) +static int __init oops_setup(char *s) { - panic_on_oops = 1; - return 1; + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; } -__setup("oops=", oops_dummy); +early_param("oops", oops_setup); static int __init kstack_setup(char *s) { + if (!s) + return -EINVAL; kstack_depth_to_print = simple_strtoul(s,NULL,0); - return 1; + return 0; } -__setup("kstack=", kstack_setup); +early_param("kstack", kstack_setup); #ifdef CONFIG_STACK_UNWIND static int __init call_trace_setup(char *s) { + if (!s) + return -EINVAL; if (strcmp(s, "old") == 0) call_trace = -1; else if (strcmp(s, "both") == 0) @@ -1145,7 +1151,7 @@ static int __init call_trace_setup(char *s) call_trace = 1; else if (strcmp(s, "new") == 0) call_trace = 2; - return 1; + return 0; } -__setup("call_trace=", call_trace_setup); +early_param("call_trace", call_trace_setup); #endif diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index b2fac14baac0..d64d6d93d04b 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -348,9 +348,10 @@ void __init paging_init(void) } } -/* [numa=off] */ -__init int numa_setup(char *opt) +static __init int numa_setup(char *opt) { + if (!opt) + return -EINVAL; if (!strncmp(opt,"off",3)) numa_off = 1; #ifdef CONFIG_NUMA_EMU @@ -366,9 +367,11 @@ __init int numa_setup(char *opt) if (!strncmp(opt,"hotadd=", 7)) hotadd_percent = simple_strtoul(opt+7, NULL, 10); #endif - return 1; + return 0; } +early_param("numa", numa_setup); + /* * Setup early cpu_to_node. * diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h index 8ed0f4d67b8d..29ee735278f2 100644 --- a/include/asm-x86_64/apic.h +++ b/include/asm-x86_64/apic.h @@ -17,6 +17,7 @@ extern int apic_verbosity; extern int apic_runs_main_timer; +extern int ioapic_force; /* * Define the default level of output to be very little @@ -93,9 +94,6 @@ extern void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector, #define K8_APIC_EXT_INT_MSG_EXT 0x7 #define K8_APIC_EXT_LVT_ENTRY_THRESHOLD 0 -extern int disable_timer_pin_1; - - void smp_send_timer_broadcast_ipi(void); void switch_APIC_timer_to_ipi(void *cpumask); void switch_ipi_to_APIC_timer(void *cpumask); diff --git a/include/asm-x86_64/e820.h b/include/asm-x86_64/e820.h index 670a3388e70a..dba012151856 100644 --- a/include/asm-x86_64/e820.h +++ b/include/asm-x86_64/e820.h @@ -55,8 +55,7 @@ extern void e820_setup_gap(void); extern unsigned long e820_hole_size(unsigned long start_pfn, unsigned long end_pfn); -extern void __init parse_memopt(char *p, char **end); -extern void __init parse_memmapopt(char *p, char **end); +extern void finish_e820_parsing(void); extern struct e820map e820; diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index 0c1e2422400a..d79e7441b513 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -21,7 +21,6 @@ extern unsigned long __supported_pte_mask; #define swapper_pg_dir init_level4_pgt -extern int nonx_setup(char *str); extern void paging_init(void); extern void clear_kernel_mapping(unsigned long addr, unsigned long size); diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index 58fec91318e4..151826f8ce69 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -53,9 +53,6 @@ extern int nohpet; extern unsigned long vxtime_hz; extern void time_init_gtod(void); -extern int numa_setup(char *opt); - -extern int setup_early_printk(char *); extern void early_printk(const char *fmt, ...) 
__attribute__((format(printf,1,2))); extern void early_identify_cpu(struct cpuinfo_x86 *c); @@ -104,13 +101,7 @@ extern void select_idle_routine(const struct cpuinfo_x86 *c); extern unsigned long table_start, table_end; extern int exception_trace; -extern int using_apic_timer; -extern int disable_apic; extern unsigned cpu_khz; -extern int ioapic_force; -extern int skip_ioapic_setup; -extern int acpi_ht; -extern int acpi_disabled; extern void no_iommu_init(void); extern int force_iommu, no_iommu; @@ -132,7 +123,6 @@ extern int fix_aperture; extern int reboot_force; extern int notsc_setup(char *); -extern int setup_additional_cpus(char *); extern void smp_local_timer_interrupt(struct pt_regs * regs); -- cgit v1.2.3 From 43c85c9c5dff76efc1e411d3302840027ea92004 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:32 +0200 Subject: [PATCH] Remove need for early lockdep init I think it was only needed for the printks and we can do them later. I put in a single early_printk so that we know the kernel is alive (early_printk doesn't need any locks). This makes some things easier for initialization of unwind for lockdep, which is needed by later patches. Cc: mingo@elte.hu Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/head64.c | 8 +------- arch/x86_64/kernel/setup.c | 2 ++ 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index 6716cbfc34ac..9561eb3c5b5c 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -45,15 +45,12 @@ static void __init copy_bootdata(char *real_mode_data) new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); if (!new_data) { if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) { - printk("so old bootloader that it does not support commandline?!\n"); return; } new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET; - printk("old bootloader convention, maybe loadlin?\n"); } command_line = (char *) ((u64)(new_data)); memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE); - printk("Bootdata ok (command line is %s)\n", saved_command_line); } void __init x86_64_start_kernel(char * real_mode_data) @@ -65,10 +62,7 @@ void __init x86_64_start_kernel(char * real_mode_data) asm volatile("lidt %0" :: "m" (idt_descr)); clear_bss(); - /* - * This must be called really, really early: - */ - lockdep_init(); + early_printk("Kernel alive\n"); /* * switch to init_level4_pgt from boot_level4_pgt diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index f55540a6084e..9c4d93c53c06 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -353,6 +353,8 @@ static void discover_ebda(void) void __init setup_arch(char **cmdline_p) { + printk(KERN_INFO "Command line: %s\n", saved_command_line); + ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); screen_info = SCREEN_INFO; edid_info = EDID_INFO; -- cgit v1.2.3 From df3bb57d2c0160ccd1ee51322f50aa295c3b0858 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] i386/x86-64: Move acpi_disabled variables into acpi/boot.c Removes code duplication between i386/x86-64.
It is not needed anymore in setup.c since the early_param cleanup. Cc: len.brown@intel.com Signed-off-by: Andi Kleen <ak@suse.de> --- arch/i386/kernel/acpi/boot.c | 7 +++++++ arch/i386/kernel/setup.c | 7 ------- arch/x86_64/kernel/setup.c | 3 --- 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index 87e2ab50b694..f3761b98c8dd 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -38,6 +38,13 @@ int __initdata acpi_force = 0; +#ifdef CONFIG_ACPI +int acpi_disabled = 0; +#else +int acpi_disabled = 1; +#endif +EXPORT_SYMBOL(acpi_disabled); + #ifdef CONFIG_X86_64 extern void __init clustered_apic_check(void); diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index c6e31ed386f5..ea17567dbe72 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -89,13 +89,6 @@ EXPORT_SYMBOL(boot_cpu_data); unsigned long mmu_cr4_features; -#ifdef CONFIG_ACPI - int acpi_disabled = 0; -#else - int acpi_disabled = 1; -#endif -EXPORT_SYMBOL(acpi_disabled); - /* for MCA, but anyone else can use it if they want */ unsigned int machine_id; #ifdef CONFIG_MCA diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 9c4d93c53c06..0f86976811d6 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -74,9 +74,6 @@ EXPORT_SYMBOL(boot_cpu_data); unsigned long mmu_cr4_features; -int acpi_disabled; -EXPORT_SYMBOL(acpi_disabled); - int acpi_numa __initdata; /* Boot loader ID as an integer, for the benefit of proc_dointvec */ -- cgit v1.2.3 From c31fbb1ad890b11f037c16496e53f28877c12722 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] Clean up acpi_numa variable Move it into srat.c; no need to clutter up setup.c for it. Also remove its use in setup.c completely - it only guarded a printk which can be done unconditionally. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/setup.c | 5 +---- arch/x86_64/mm/srat.c | 2 ++ 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 0f86976811d6..b85528597aa8 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -74,8 +74,6 @@ EXPORT_SYMBOL(boot_cpu_data); unsigned long mmu_cr4_features; -int acpi_numa __initdata; - /* Boot loader ID as an integer, for the benefit of proc_dointvec */ int bootloader_type; @@ -812,8 +810,7 @@ static void srat_detect_node(void) node = first_node(node_online_map); numa_set_node(cpu, node); - if (acpi_numa > 0) - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); #endif } diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c index 502fce65e96a..ca10701e7a90 100644 --- a/arch/x86_64/mm/srat.c +++ b/arch/x86_64/mm/srat.c @@ -21,6 +21,8 @@ #include <asm/numa.h> #include <asm/e820.h> +int acpi_numa __initdata; + #if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \ defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \ && !defined(CONFIG_MEMORY_HOTPLUG) -- cgit v1.2.3 From 3bd4d18cbab622c504f131f3c0029c3aa29c05be Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] Move e820 map into e820.c Minor cleanup. Keep setup.c free from unrelated clutter.
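[Editor's note: the patch above marks the moved e820 map __initdata. For readers unfamiliar with the annotation, a hypothetical sketch follows; __initdata places an object in the .init.data section, which the kernel discards once booting is complete, so such data must not be referenced afterwards.]

static int boot_only_flag __initdata;	/* freed together with the .init sections after boot */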
Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/e820.c | 2 ++ arch/x86_64/kernel/setup.c | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index a4739dc82937..164d0b83dc92 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -24,6 +24,8 @@ #include <asm/bootsetup.h> #include <asm/sections.h> +struct e820map e820 __initdata; + /* * PFN of last memory page. */ diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index b85528597aa8..4be29832e3dc 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -97,7 +97,6 @@ struct sys_desc_table_struct { struct edid_info edid_info; EXPORT_SYMBOL_GPL(edid_info); -struct e820map e820; extern int root_mountflags; -- cgit v1.2.3 From 131cfd7bd54767ec8959e013f83839442a54d546 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] Add sparse annotation to vsyscall.c Fixes linux/arch/x86_64/kernel/vsyscall.c:276:7: warning: constant 0x0f40000000000 is so big it is long linux/arch/x86_64/kernel/vsyscall.c:80:14: warning: incorrect type in argument 1 (different address spaces) linux/arch/x86_64/kernel/vsyscall.c:80:14: expected void const volatile [noderef] *addr<asn:2> linux/arch/x86_64/kernel/vsyscall.c:80:14: got void *<noident> linux/arch/x86_64/kernel/vsyscall.c:200:7: warning: incorrect type in assignment (different address spaces) linux/arch/x86_64/kernel/vsyscall.c:200:7: expected unsigned short [usertype] *map1 linux/arch/x86_64/kernel/vsyscall.c:200:7: got void [noderef] *<asn:2> linux/arch/x86_64/kernel/vsyscall.c:203:7: warning: incorrect type in assignment (different address spaces) linux/arch/x86_64/kernel/vsyscall.c:203:7: expected unsigned short [usertype] *map2 linux/arch/x86_64/kernel/vsyscall.c:203:7: got void [noderef] *<asn:2> linux/arch/x86_64/kernel/vsyscall.c:215:10: warning: incorrect type in argument 1 (different address spaces) linux/arch/x86_64/kernel/vsyscall.c:215:10: expected void volatile [noderef] *addr<asn:2> linux/arch/x86_64/kernel/vsyscall.c:215:10: got unsigned short [usertype] *map2 linux/arch/x86_64/kernel/vsyscall.c:217:10: warning: incorrect type in argument 1 (different address spaces) linux/arch/x86_64/kernel/vsyscall.c:217:10: expected void volatile [noderef] *addr<asn:2> linux/arch/x86_64/kernel/vsyscall.c:217:10: got unsigned short [usertype] *map1 Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/vsyscall.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index 902783bc4d53..ac48c3857ddb 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -77,7 +77,8 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) __vxtime.tsc_quot) >> 32; /* See comment in x86_64 do_gettimeofday. 
*/ } else { - usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) - + usec += ((readl((void __iomem *) + fix_to_virt(VSYSCALL_HPET) + 0xf0) - __vxtime.last) * __vxtime.quot) >> 32; } } while (read_seqretry(&__xtime_lock, sequence)); @@ -191,7 +192,8 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) { extern u16 vsysc1, vsysc2; - u16 *map1, *map2; + u16 __iomem *map1; + u16 __iomem *map2; int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); if (!write) return ret; @@ -206,11 +208,11 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, goto out; } if (!sysctl_vsyscall) { - *map1 = SYSCALL; - *map2 = SYSCALL; + writew(SYSCALL, map1); + writew(SYSCALL, map2); } else { - *map1 = NOP2; - *map2 = NOP2; + writew(NOP2, map1); + writew(NOP2, map2); } iounmap(map2); out: -- cgit v1.2.3 From dd2994f619752fb731f21c89ad16536dd6673948 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] Add sparse annotations to quiet sparse in arch/x86_64/mm/fault.c Fixes linux/arch/x86_64/mm/fault.c:125:7: warning: incorrect type in argument 1 (different address spaces) linux/arch/x86_64/mm/fault.c:125:7: expected void [noderef] *<noident><asn:1> linux/arch/x86_64/mm/fault.c:125:7: got unsigned char *[assigned] instr linux/arch/x86_64/mm/fault.c:163:8: warning: incorrect type in argument 1 (different address spaces) linux/arch/x86_64/mm/fault.c:163:8: expected void [noderef] *<noident><asn:1> linux/arch/x86_64/mm/fault.c:163:8: got unsigned char *[assigned] instr linux/arch/x86_64/mm/fault.c:179:9: warning: incorrect type in argument 1 (different address spaces) linux/arch/x86_64/mm/fault.c:179:9: expected void [noderef] *<noident><asn:1> linux/arch/x86_64/mm/fault.c:179:9: got unsigned long *<noident> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/mm/fault.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index ac8ea66ccb94..449a437f5d70 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -102,7 +102,7 @@ void bust_spinlocks(int yes) static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, unsigned long error_code) { - unsigned char *instr; + unsigned char __user *instr; int scan_more = 1; int prefetch = 0; unsigned char *max_instr; @@ -111,7 +111,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, if (error_code & PF_INSTR) return 0; - instr = (unsigned char *)convert_rip_to_linear(current, regs); + instr = (unsigned char __user *)convert_rip_to_linear(current, regs); max_instr = instr + 15; if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) @@ -122,7 +122,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, unsigned char instr_hi; unsigned char instr_lo; - if (__get_user(opcode, instr)) + if (__get_user(opcode, (char __user *)instr)) break; instr_hi = opcode & 0xf0; @@ -160,7 +160,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, case 0x00: /* Prefetch instruction is 0x0F0D or 0x0F18 */ scan_more = 0; - if (__get_user(opcode, instr)) + if (__get_user(opcode, (char __user *)instr)) break; prefetch = (instr_lo == 0xF) && (opcode == 0x0D || opcode == 0x18); @@ -176,7 +176,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, static int bad_address(void *p) { unsigned long dummy; 
- return __get_user(dummy, (unsigned long *)p); + return __get_user(dummy, (unsigned long __user *)p); } void dump_pagetable(unsigned long address) -- cgit v1.2.3 From ddb15ec130d38cb8076a9926040c7435b126db65 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] Fix most sparse warnings in sys_ia32.c Mostly by adding casts. I didn't touch the "invalid access past ..." warnings, which are caused by the sigset conversion. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/ia32/sys_ia32.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c index 9c130993380d..33cd0a4fe388 100644 --- a/arch/x86_64/ia32/sys_ia32.c +++ b/arch/x86_64/ia32/sys_ia32.c @@ -60,6 +60,7 @@ #include <linux/highuid.h> #include <linux/vmalloc.h> #include <linux/fsnotify.h> +#include <linux/sysctl.h> #include <asm/mman.h> #include <asm/types.h> #include <asm/uaccess.h> @@ -389,7 +390,9 @@ sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, } } set_fs (KERNEL_DS); - ret = sys_rt_sigprocmask(how, set ? &s : NULL, oset ? &s : NULL, + ret = sys_rt_sigprocmask(how, + set ? (sigset_t __user *)&s : NULL, + oset ? (sigset_t __user *)&s : NULL, sigsetsize); set_fs (old_fs); if (ret) return ret; @@ -541,7 +544,7 @@ sys32_sysinfo(struct sysinfo32 __user *info) int bitcount = 0; set_fs (KERNEL_DS); - ret = sys_sysinfo(&s); + ret = sys_sysinfo((struct sysinfo __user *)&s); set_fs (old_fs); /* Check to see if any memory value is too large for 32-bit and scale @@ -589,7 +592,7 @@ sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *int mm_segment_t old_fs = get_fs (); set_fs (KERNEL_DS); - ret = sys_sched_rr_get_interval(pid, &t); + ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); set_fs (old_fs); if (put_compat_timespec(&t, interval)) return -EFAULT; @@ -605,7 +608,7 @@ sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize) mm_segment_t old_fs = get_fs(); set_fs (KERNEL_DS); - ret = sys_rt_sigpending(&s, sigsetsize); + ret = sys_rt_sigpending((sigset_t __user *)&s, sigsetsize); set_fs (old_fs); if (!ret) { switch (_NSIG_WORDS) { @@ -630,7 +633,7 @@ sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo) if (copy_siginfo_from_user32(&info, uinfo)) return -EFAULT; set_fs (KERNEL_DS); - ret = sys_rt_sigqueueinfo(pid, sig, &info); + ret = sys_rt_sigqueueinfo(pid, sig, (siginfo_t __user *)&info); set_fs (old_fs); return ret; } @@ -666,9 +669,6 @@ sys32_sysctl(struct sysctl_ia32 __user *args32) size_t oldlen; int __user *namep; long ret; - extern int do_sysctl(int *name, int nlen, void *oldval, size_t *oldlenp, - void *newval, size_t newlen); - if (copy_from_user(&a32, args32, sizeof (a32))) return -EFAULT; @@ -692,7 +692,8 @@ sys32_sysctl(struct sysctl_ia32 __user *args32) set_fs(KERNEL_DS); lock_kernel(); - ret = do_sysctl(namep, a32.nlen, oldvalp, &oldlen, newvalp, (size_t) a32.newlen); + ret = do_sysctl(namep, a32.nlen, oldvalp, (size_t __user *)&oldlen, + newvalp, (size_t) a32.newlen); unlock_kernel(); set_fs(old_fs); @@ -743,7 +744,8 @@ sys32_sendfile(int out_fd, in_fd, compat_off_t __user *offset, s32 count) return -EFAULT; set_fs(KERNEL_DS); - ret = sys_sendfile(out_fd, in_fd, offset ?
(off_t __user *)&of : NULL, + count); set_fs(old_fs); if (offset && put_user(of, offset)) @@ -831,7 +833,7 @@ long sys32_ustat(unsigned dev, struct ustat32 __user *u32p) seg = get_fs(); set_fs(KERNEL_DS); - ret = sys_ustat(dev,&u); + ret = sys_ustat(dev, (struct ustat __user *)&u); set_fs(seg); if (ret >= 0) { if (!access_ok(VERIFY_WRITE,u32p,sizeof(struct ustat32)) || -- cgit v1.2.3 From 52d522f53f137c7903db22f9196a48ad8658fb2b Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] Fix sparse warnings in compat aout code Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/ia32/ia32_aout.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c index 3bf58af98936..396d3c100011 100644 --- a/arch/x86_64/ia32/ia32_aout.c +++ b/arch/x86_64/ia32/ia32_aout.c @@ -333,7 +333,8 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) return error; } - error = bprm->file->f_op->read(bprm->file, (char *)text_addr, + error = bprm->file->f_op->read(bprm->file, + (char __user *)text_addr, ex.a_text+ex.a_data, &pos); if ((signed long)error < 0) { send_sig(SIGKILL, current, 0); @@ -366,7 +367,8 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) down_write(&current->mm->mmap_sem); do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); up_write(&current->mm->mmap_sem); - bprm->file->f_op->read(bprm->file,(char *)N_TXTADDR(ex), + bprm->file->f_op->read(bprm->file, + (char __user *)N_TXTADDR(ex), ex.a_text+ex.a_data, &pos); flush_icache_range((unsigned long) N_TXTADDR(ex), (unsigned long) N_TXTADDR(ex) + @@ -477,7 +479,7 @@ static int load_aout_library(struct file *file) do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); up_write(&current->mm->mmap_sem); - file->f_op->read(file, (char *)start_addr, + file->f_op->read(file, (char __user *)start_addr, ex.a_text + ex.a_data, &pos); flush_icache_range((unsigned long) start_addr, (unsigned long) start_addr + ex.a_text + ex.a_data); -- cgit v1.2.3 From 2b94ab2fd55768030e177a6ec224dedd031ad06b Mon Sep 17 00:00:00 2001 From: Fernando Luis Vázquez Cao <fernando@oss.ntt.co.jp> Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] Replace local_save_flags+local_irq_disable with local_irq_save The combination of "local_save_flags" and "local_irq_disable" seems to be equivalent to "local_irq_save" (see code snips below). Consequently, replace occurrences of local_save_flags+local_irq_disable with local_irq_save.
* local_irq_save #define raw_local_irq_save(flags) \ do { (flags) = __raw_local_irq_save(); } while (0) static inline unsigned long __raw_local_irq_save(void) { unsigned long flags = __raw_local_save_flags(); raw_local_irq_disable(); return flags; } * local_save_flags #define raw_local_save_flags(flags) \ do { (flags) = __raw_local_save_flags(); } while (0) Signed-off-by: Fernando Vazquez <fernando@intellilink.co.jp> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/apic.c | 3 +-- arch/x86_64/kernel/genapic_flat.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 660602c51f51..b95d1e378b96 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -468,8 +468,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); - local_save_flags(flags); - local_irq_disable(); + local_irq_save(flags); disable_local_APIC(); local_irq_restore(flags); return 0; diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index 2ceb8b872b57..50ad153eaac4 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -49,8 +49,7 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector) unsigned long cfg; unsigned long flags; - local_save_flags(flags); - local_irq_disable(); + local_irq_save(flags); /* * Wait for idle. -- cgit v1.2.3 From 2ade2920dcefdf5595c6380ebed131c964190855 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] i386/x86-64: rename is_at_popf(), add iret to tests and fix is_at_popf() needs to test for the iret instruction as well as popf. So add that test and rename it to is_setting_trap_flag(). Also change max insn length from 16 to 15 to match reality. LAHF / SAHF can't affect TF, so the comment in x86_64 is removed. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/i386/kernel/ptrace.c | 10 +++++----- arch/x86_64/kernel/ptrace.c | 12 +++++------- 2 files changed, 10 insertions(+), 12 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index d3db03f4085d..775f50e9395b 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -185,17 +185,17 @@ static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_ return addr; } -static inline int is_at_popf(struct task_struct *child, struct pt_regs *regs) +static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) { int i, copied; - unsigned char opcode[16]; + unsigned char opcode[15]; unsigned long addr = convert_eip_to_linear(child, regs); copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); for (i = 0; i < copied; i++) { switch (opcode[i]) { - /* popf */ - case 0x9d: + /* popf and iret */ + case 0x9d: case 0xcf: return 1; /* opcode and address size prefixes */ case 0x66: case 0x67: @@ -247,7 +247,7 @@ static void set_singlestep(struct task_struct *child) * don't mark it as being "us" that set it, so that we * won't clear it by hand later. 
*/ - if (is_at_popf(child, regs)) + if (is_setting_trap_flag(child, regs)) return; child->ptrace |= PT_DTRACE; diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index 412c6a8bf906..addc14af0c56 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -116,17 +116,17 @@ unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *r return addr; } -static int is_at_popf(struct task_struct *child, struct pt_regs *regs) +static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) { int i, copied; - unsigned char opcode[16]; + unsigned char opcode[15]; unsigned long addr = convert_rip_to_linear(child, regs); copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); for (i = 0; i < copied; i++) { switch (opcode[i]) { - /* popf */ - case 0x9d: + /* popf and iret */ + case 0x9d: case 0xcf: return 1; /* CHECKME: 64 65 */ @@ -189,10 +189,8 @@ static void set_singlestep(struct task_struct *child) * ..but if TF is changed by the instruction we will trace, * don't mark it as being "us" that set it, so that we * won't clear it by hand later. - * - * AK: this is not enough, LAHF and IRET can change TF in user space too. */ - if (is_at_popf(child, regs)) + if (is_setting_trap_flag(child, regs)) return; child->ptrace |= PT_DTRACE; -- cgit v1.2.3 From de684652f34f57cb60d4d78d09139a0e0c5e7b1b Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda <muli@il.ibm.com> Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] print whether CONFIG_IOMMU_DEBUG is enabled Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com> Signed-off-by: Jon Mason <jdmason@us.ibm.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/pci-calgary.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index caf84e4e4ca9..ebe4e930b64d 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -127,15 +127,19 @@ static void tce_cache_blast(struct iommu_table *tbl); /* enable this to stress test the chip's TCE cache */ #ifdef CONFIG_IOMMU_DEBUG +int debugging __read_mostly = 1; + static inline void tce_cache_blast_stress(struct iommu_table *tbl) { tce_cache_blast(tbl); } -#else +#else /* debugging is disabled */ +int debugging __read_mostly = 0; + static inline void tce_cache_blast_stress(struct iommu_table *tbl) { } -#endif /* BLAST_TCE_CACHE_ON_UNMAP */ +#endif /* CONFIG_IOMMU_DEBUG */ static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) { @@ -944,8 +948,10 @@ void __init detect_calgary(void) if (calgary_found) { iommu_detected = 1; calgary_detected = 1; - printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected. " - "TCE table spec is %d.\n", specified_table_size); + printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); + printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " + "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, + debugging ? "enabled" : "disabled"); } return; -- cgit v1.2.3 From 796e4390e0378e1e57c033349610cfc741696a3d Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda <muli@il.ibm.com> Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] only verify the allocation bitmap if CONFIG_IOMMU_DEBUG is on Introduce new function verify_bit_range(). Define two versions, one for CONFIG_IOMMU_DEBUG enabled and one for disabled. 
Previously we were checking that the bitmap was consistent every time we allocated or freed an entry in the TCE table, which is good for debugging but incurs an unnecessary penalty on non-debug builds. Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com> Signed-off-by: Jon Mason <jdmason@us.ibm.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/pci-calgary.c | 44 ++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 9 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index ebe4e930b64d..7c43cb0f71a3 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -133,12 +133,35 @@ static inline void tce_cache_blast_stress(struct iommu_table *tbl) { tce_cache_blast(tbl); } + +static inline unsigned long verify_bit_range(unsigned long* bitmap, + int expected, unsigned long start, unsigned long end) +{ + unsigned long idx = start; + + BUG_ON(start >= end); + + while (idx < end) { + if (!!test_bit(idx, bitmap) != expected) + return idx; + ++idx; + } + + /* all bits have the expected value */ + return ~0UL; +} #else /* debugging is disabled */ int debugging __read_mostly = 0; static inline void tce_cache_blast_stress(struct iommu_table *tbl) { } + +static inline unsigned long verify_bit_range(unsigned long* bitmap, + int expected, unsigned long start, unsigned long end) +{ + return ~0UL; +} #endif /* CONFIG_IOMMU_DEBUG */ static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) @@ -162,6 +185,7 @@ static void iommu_range_reserve(struct iommu_table *tbl, { unsigned long index; unsigned long end; + unsigned long badbit; index = start_addr >> PAGE_SHIFT; @@ -173,14 +197,15 @@ static void iommu_range_reserve(struct iommu_table *tbl, if (end > tbl->it_size) /* don't go off the table */ end = tbl->it_size; - while (index < end) { - if (test_bit(index, tbl->it_map)) + badbit = verify_bit_range(tbl->it_map, 0, index, end); + if (badbit != ~0UL) { + if (printk_ratelimit()) printk(KERN_ERR "Calgary: entry already allocated at " "0x%lx tbl %p dma 0x%lx npages %u\n", - index, tbl, start_addr, npages); - ++index; + badbit, tbl, start_addr, npages); } - set_bit_string(tbl->it_map, start_addr >> PAGE_SHIFT, npages); + + set_bit_string(tbl->it_map, index, npages); } static unsigned long iommu_range_alloc(struct iommu_table *tbl, @@ -247,7 +272,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned int npages) { unsigned long entry; - unsigned long i; + unsigned long badbit; entry = dma_addr >> PAGE_SHIFT; @@ -255,11 +280,12 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, tce_free(tbl, entry, npages); - for (i = 0; i < npages; ++i) { - if (!test_bit(entry + i, tbl->it_map)) + badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages); + if (badbit != ~0UL) { + if (printk_ratelimit()) printk(KERN_ERR "Calgary: bit is off at 0x%lx " "tbl %p dma 0x%Lx entry 0x%lx npages %u\n", - entry + i, tbl, dma_addr, entry, npages); + badbit, tbl, dma_addr, entry, npages); } __clear_bit_string(tbl->it_map, entry, npages); -- cgit v1.2.3 From 4ccf4ae3144360ab9c00d0b53427f43369287bfb Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda <muli@il.ibm.com> Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] remove tce_cache_blast_stress() tce_cache_blast_stress was useful during bringup to stress the IOMMU's cache flushing.
Now that we quiesce DMAs on every cache flush, using _stress() brings the machine down to its knees once you put it under load. Completely remove this debug / bringup code, since it isn't useful anymore. Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com> Signed-off-by: Jon Mason <jdmason@us.ibm.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/pci-calgary.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 7c43cb0f71a3..cd866135e4ef 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -129,11 +129,6 @@ static void tce_cache_blast(struct iommu_table *tbl); #ifdef CONFIG_IOMMU_DEBUG int debugging __read_mostly = 1; -static inline void tce_cache_blast_stress(struct iommu_table *tbl) -{ - tce_cache_blast(tbl); -} - static inline unsigned long verify_bit_range(unsigned long* bitmap, int expected, unsigned long start, unsigned long end) { @@ -153,10 +148,6 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap, #else /* debugging is disabled */ int debugging __read_mostly = 0; -static inline void tce_cache_blast_stress(struct iommu_table *tbl) -{ -} - static inline unsigned long verify_bit_range(unsigned long* bitmap, int expected, unsigned long start, unsigned long end) { @@ -289,8 +280,6 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, } __clear_bit_string(tbl->it_map, entry, npages); - - tce_cache_blast_stress(tbl); } static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, -- cgit v1.2.3 From 4ea8a5d8b57cd504b4b2de1212523848e7ab50cf Mon Sep 17 00:00:00 2001 From: Muli Ben-Yehuda <muli@il.ibm.com> Date: Tue, 26 Sep 2006 10:52:33 +0200 Subject: [PATCH] Calgary IOMMU: eradicate sole remaining 80 chars per line offender Signed-off-by: Muli Ben-Yehuda <muli@il.ibm.com> Signed-off-by: Jon Mason <jdmason@us.ibm.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/pci-calgary.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index cd866135e4ef..466588f95601 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -86,7 +86,8 @@ #define MAX_NUM_OF_PHBS 8 /* how many PHBs in total?
*/ #define MAX_NUM_CHASSIS 8 /* max number of chassis */ -#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) /* max dev->bus->number */ +/* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */ +#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) #define PHBS_PER_CALGARY 4 /* register offsets in Calgary's internal register space */ -- cgit v1.2.3 From 5a1b3999d6cb7ab87f1f3b1700bc91839fd6fa29 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] x86: Some preparatory cleanup for stack trace - Remove the unused all_contexts parameter; no caller used it. - Move the skip argument into the structure (needed for follow-on patches). Cc: mingo@elte.hu Signed-off-by: Andi Kleen <ak@suse.de> --- arch/i386/kernel/stacktrace.c | 11 +++-------- arch/s390/kernel/stacktrace.c | 17 ++++++++--------- arch/x86_64/kernel/stacktrace.c | 14 +++++--------- include/linux/stacktrace.h | 7 ++++--- kernel/lockdep.c | 5 ++++- 5 files changed, 24 insertions(+), 30 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/i386/kernel/stacktrace.c b/arch/i386/kernel/stacktrace.c index e62a037ab399..ae3c32a87add 100644 --- a/arch/i386/kernel/stacktrace.c +++ b/arch/i386/kernel/stacktrace.c @@ -61,12 +61,8 @@ save_context_stack(struct stack_trace *trace, unsigned int skip, /* * Save stack-backtrace addresses into a stack_trace buffer. - * If all_contexts is set, all contexts (hardirq, softirq and process) - * are saved. If not set then only the current context is saved. */ -void save_stack_trace(struct stack_trace *trace, - struct task_struct *task, int all_contexts, - unsigned int skip) +void save_stack_trace(struct stack_trace *trace, struct task_struct *task) { unsigned long ebp; unsigned long *stack = &ebp; @@ -85,10 +81,9 @@ save_stack_trace(struct stack_trace *trace, struct thread_info *context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - ebp = save_context_stack(trace, skip, context, stack, ebp); + ebp = save_context_stack(trace, trace->skip, context, stack, ebp); stack = (unsigned long *)context->previous_esp; - if (!all_contexts || !stack || - trace->nr_entries >= trace->max_entries) + if (!stack || trace->nr_entries >= trace->max_entries) break; trace->entries[trace->nr_entries++] = ULONG_MAX; if (trace->nr_entries >= trace->max_entries) diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index de83f38288d0..d9428a0fc8fb 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -59,9 +59,7 @@ static inline unsigned long save_context_stack(struct stack_trace *trace, } } -void save_stack_trace(struct stack_trace *trace, - struct task_struct *task, int all_contexts, - unsigned int skip) +void save_stack_trace(struct stack_trace *trace, struct task_struct *task) { register unsigned long sp asm ("15"); unsigned long orig_sp; @@ -69,22 +67,23 @@ void save_stack_trace(struct stack_trace *trace, sp &= PSW_ADDR_INSN; orig_sp = sp; - sp = save_context_stack(trace, &skip, sp, + sp = save_context_stack(trace, &trace->skip, sp, S390_lowcore.panic_stack - PAGE_SIZE, S390_lowcore.panic_stack); - if ((sp != orig_sp) && !all_contexts) + if ((sp != orig_sp) && !trace->all_contexts) return; - sp = save_context_stack(trace, &skip, sp, + sp = save_context_stack(trace, &trace->skip, sp, S390_lowcore.async_stack - ASYNC_SIZE, S390_lowcore.async_stack); - if ((sp != orig_sp) && !all_contexts) + if ((sp != orig_sp) && !trace->all_contexts) return; if (task) -
save_context_stack(trace, &skip, sp, + save_context_stack(trace, &trace->skip, sp, (unsigned long) task_stack_page(task), (unsigned long) task_stack_page(task) + THREAD_SIZE); else - save_context_stack(trace, &skip, sp, S390_lowcore.thread_info, + save_context_stack(trace, &trace->skip, sp, + S390_lowcore.thread_info, S390_lowcore.thread_info + THREAD_SIZE); return; } diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c index 32cf55eb9af8..1c022af8fe1e 100644 --- a/arch/x86_64/kernel/stacktrace.c +++ b/arch/x86_64/kernel/stacktrace.c @@ -109,9 +109,10 @@ out_restore: * Save stack-backtrace addresses into a stack_trace buffer: */ static inline unsigned long -save_context_stack(struct stack_trace *trace, unsigned int skip, +save_context_stack(struct stack_trace *trace, unsigned long stack, unsigned long stack_end) { + int skip = trace->skip; unsigned long addr; #ifdef CONFIG_FRAME_POINTER @@ -159,12 +160,8 @@ save_context_stack(struct stack_trace *trace, unsigned int skip, /* * Save stack-backtrace addresses into a stack_trace buffer. - * If all_contexts is set, all contexts (hardirq, softirq and process) - * are saved. If not set then only the current context is saved. */ -void save_stack_trace(struct stack_trace *trace, - struct task_struct *task, int all_contexts, - unsigned int skip) +void save_stack_trace(struct stack_trace *trace, struct task_struct *task) { unsigned long stack = (unsigned long)&stack; int i, nr_stacks = 0, stacks_done[MAX_STACKS]; @@ -207,9 +204,8 @@ void save_stack_trace(struct stack_trace *trace, return; stacks_done[nr_stacks] = stack_end; - stack = save_context_stack(trace, skip, stack, stack_end); - if (!all_contexts || !stack || - trace->nr_entries >= trace->max_entries) + stack = save_context_stack(trace, stack, stack_end); + if (!stack || trace->nr_entries >= trace->max_entries) return; trace->entries[trace->nr_entries++] = ULONG_MAX; if (trace->nr_entries >= trace->max_entries) diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 9cc81e572224..50e2b01e517c 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -5,15 +5,16 @@ struct stack_trace { unsigned int nr_entries, max_entries; unsigned long *entries; + int skip; /* input argument: How many entries to skip */ + int all_contexts; /* input argument: if true do than one stack */ }; extern void save_stack_trace(struct stack_trace *trace, - struct task_struct *task, int all_contexts, - unsigned int skip); + struct task_struct *task); extern void print_stack_trace(struct stack_trace *trace, int spaces); #else -# define save_stack_trace(trace, task, all, skip) do { } while (0) +# define save_stack_trace(trace, task) do { } while (0) # define print_stack_trace(trace) do { } while (0) #endif diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 9bad17884513..900b4cb1a024 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -224,7 +224,10 @@ static int save_trace(struct stack_trace *trace) trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; trace->entries = stack_trace + nr_stack_trace_entries; - save_stack_trace(trace, NULL, 0, 3); + trace->skip = 3; + trace->all_contexts = 0; + + save_stack_trace(trace, NULL); trace->max_entries = trace->nr_entries; -- cgit v1.2.3 From b7f5e3c7742d5332b78b831131f43fc3630e6322 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] Don't access the APIC in safe_smp_processor_id when it is not mapped yet Lockdep can call the dwarf2 
unwinder early, and the dwarf2 code uses safe_smp_processor_id which tries to access the local APIC page. But that doesn't work before the APIC code has set up its fixmap. Check for this case and always return boot cpu then. Cc: jbeulich@novell.com Cc: mingo@elte.hu Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/apic.c | 2 ++ arch/x86_64/kernel/smp.c | 2 +- include/asm-x86_64/apic.h | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index b95d1e378b96..6472e321cad7 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -38,6 +38,7 @@ #include <asm/timex.h> #include <asm/apic.h> +int apic_mapped; int apic_verbosity; int apic_runs_main_timer; int apic_calibrate_pmtmr __initdata; @@ -600,6 +601,7 @@ void __init init_apic_mappings(void) apic_phys = mp_lapic_addr; set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + apic_mapped = 1; apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys); /* diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 06af6ca60129..2df988bbf312 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -526,7 +526,7 @@ int safe_smp_processor_id(void) { unsigned apicid, i; - if (disable_apic) + if (disable_apic || !apic_mapped) return 0; apicid = hard_smp_processor_id(); diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h index 29ee735278f2..9e66d32330c9 100644 --- a/include/asm-x86_64/apic.h +++ b/include/asm-x86_64/apic.h @@ -18,6 +18,7 @@ extern int apic_verbosity; extern int apic_runs_main_timer; extern int ioapic_force; +extern int apic_mapped; /* * Define the default level of output to be very little -- cgit v1.2.3 From c0b766f13d8e1189ce4d00e54700c9d96b543b9a Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] Merge stacktrace and show_trace This unifies the standard backtracer and the new stacktrace in memory backtracer. The standard one is converted to use callbacks and then reimplement stacktrace using new callbacks. The main advantage is that stacktrace can now use the new dwarf2 unwinder and avoid false positives in many cases. I kept it simple to make sure the standard backtracer stays reliable. 
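To illustrate the new interface, a minimal third consumer could look like the sketch below (not part of this patch; the stacktrace_ops callbacks and the dump_trace() call match the code added here, but count_data and count_kernel_addresses() are made-up names for the example):

#include <linux/sched.h>
#include <asm/stacktrace.h>

/* Count the text addresses seen while walking the current stack. */
struct count_data {
        unsigned long nr;
};

static void count_warning(void *data, char *msg)
{
}

static void count_warning_symbol(void *data, char *msg, unsigned long symbol)
{
}

/* A non-negative return keeps dump_trace() going across stack boundaries. */
static int count_stack(void *data, char *name)
{
        return 0;
}

static void count_address(void *data, unsigned long addr)
{
        ((struct count_data *)data)->nr++;
}

static struct stacktrace_ops count_ops = {
        .warning        = count_warning,
        .warning_symbol = count_warning_symbol,
        .stack          = count_stack,
        .address        = count_address,
};

static unsigned long count_kernel_addresses(void)
{
        struct count_data d = { .nr = 0 };

        /* NULL regs/stack: start from the current frame, exactly as the
         * reimplemented save_stack_trace() below does. */
        dump_trace(current, NULL, NULL, &count_ops, &d);
        return d.nr;
}
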
Cc: mingo@elte.hu Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/stacktrace.c | 214 +++++----------------------------------- arch/x86_64/kernel/traps.c | 99 ++++++++++++++----- include/asm-x86_64/stacktrace.h | 18 ++++ 3 files changed, 120 insertions(+), 211 deletions(-) create mode 100644 include/asm-x86_64/stacktrace.h (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c index 1c022af8fe1e..6026b31d037e 100644 --- a/arch/x86_64/kernel/stacktrace.c +++ b/arch/x86_64/kernel/stacktrace.c @@ -7,211 +7,49 @@ */ #include <linux/sched.h> #include <linux/stacktrace.h> +#include <linux/module.h> +#include <asm/stacktrace.h> -#include <asm/smp.h> - -static inline int -in_range(unsigned long start, unsigned long addr, unsigned long end) +static void save_stack_warning(void *data, char *msg) { - return addr >= start && addr <= end; } -static unsigned long -get_stack_end(struct task_struct *task, unsigned long stack) +static void +save_stack_warning_symbol(void *data, char *msg, unsigned long symbol) { - unsigned long stack_start, stack_end, flags; - int i, cpu; - - /* - * The most common case is that we are in the task stack: - */ - stack_start = (unsigned long)task->thread_info; - stack_end = stack_start + THREAD_SIZE; - - if (in_range(stack_start, stack, stack_end)) - return stack_end; - - /* - * We are in an interrupt if irqstackptr is set: - */ - raw_local_irq_save(flags); - cpu = safe_smp_processor_id(); - stack_end = (unsigned long)cpu_pda(cpu)->irqstackptr; - - if (stack_end) { - stack_start = stack_end & ~(IRQSTACKSIZE-1); - if (in_range(stack_start, stack, stack_end)) - goto out_restore; - /* - * We get here if we are in an IRQ context but we - * are also in an exception stack. - */ - } - - /* - * Iterate over all exception stacks, and figure out whether - * 'stack' is in one of them: - */ - for (i = 0; i < N_EXCEPTION_STACKS; i++) { - /* - * set 'end' to the end of the exception stack. - */ - stack_end = per_cpu(init_tss, cpu).ist[i]; - stack_start = stack_end - EXCEPTION_STKSZ; - - /* - * Is 'stack' above this exception frame's end? - * If yes then skip to the next frame. - */ - if (stack >= stack_end) - continue; - /* - * Is 'stack' above this exception frame's start address? - * If yes then we found the right frame. - */ - if (stack >= stack_start) - goto out_restore; - - /* - * If this is a debug stack, and if it has a larger size than - * the usual exception stacks, then 'stack' might still - * be within the lower portion of the debug stack: - */ -#if DEBUG_STKSZ > EXCEPTION_STKSZ - if (i == DEBUG_STACK - 1 && stack >= stack_end - DEBUG_STKSZ) { - /* - * Black magic. A large debug stack is composed of - * multiple exception stack entries, which we - * iterate through now. Dont look: - */ - do { - stack_end -= EXCEPTION_STKSZ; - stack_start -= EXCEPTION_STKSZ; - } while (stack < stack_start); - - goto out_restore; - } -#endif - } - /* - * Ok, 'stack' is not pointing to any of the system stacks. 
- */ - stack_end = 0; - -out_restore: - raw_local_irq_restore(flags); - - return stack_end; } - -/* - * Save stack-backtrace addresses into a stack_trace buffer: - */ -static inline unsigned long -save_context_stack(struct stack_trace *trace, - unsigned long stack, unsigned long stack_end) +static int save_stack_stack(void *data, char *name) { - int skip = trace->skip; - unsigned long addr; - -#ifdef CONFIG_FRAME_POINTER - unsigned long prev_stack = 0; + struct stack_trace *trace = (struct stack_trace *)data; + return trace->all_contexts ? 0 : -1; +} - while (in_range(prev_stack, stack, stack_end)) { - pr_debug("stack: %p\n", (void *)stack); - addr = (unsigned long)(((unsigned long *)stack)[1]); - pr_debug("addr: %p\n", (void *)addr); - if (!skip) - trace->entries[trace->nr_entries++] = addr-1; - else - skip--; - if (trace->nr_entries >= trace->max_entries) - break; - if (!addr) - return 0; - /* - * Stack frames must go forwards (otherwise a loop could - * happen if the stackframe is corrupted), so we move - * prev_stack forwards: - */ - prev_stack = stack; - stack = (unsigned long)(((unsigned long *)stack)[0]); - } - pr_debug("invalid: %p\n", (void *)stack); -#else - while (stack < stack_end) { - addr = ((unsigned long *)stack)[0]; - stack += sizeof(long); - if (__kernel_text_address(addr)) { - if (!skip) - trace->entries[trace->nr_entries++] = addr-1; - else - skip--; - if (trace->nr_entries >= trace->max_entries) - break; - } +static void save_stack_address(void *data, unsigned long addr) +{ + struct stack_trace *trace = (struct stack_trace *)data; + if (trace->skip > 0) { + trace->skip--; + return; } -#endif - return stack; + if (trace->nr_entries < trace->max_entries - 1) + trace->entries[trace->nr_entries++] = addr; } -#define MAX_STACKS 10 +static struct stacktrace_ops save_stack_ops = { + .warning = save_stack_warning, + .warning_symbol = save_stack_warning_symbol, + .stack = save_stack_stack, + .address = save_stack_address, +}; /* * Save stack-backtrace addresses into a stack_trace buffer. */ void save_stack_trace(struct stack_trace *trace, struct task_struct *task) { - unsigned long stack = (unsigned long)&stack; - int i, nr_stacks = 0, stacks_done[MAX_STACKS]; - - WARN_ON(trace->nr_entries || !trace->max_entries); - - if (!task) - task = current; - - pr_debug("task: %p, ti: %p\n", task, task->thread_info); - - if (!task || task == current) { - /* Grab rbp right from our regs: */ - asm ("mov %%rbp, %0" : "=r" (stack)); - pr_debug("rbp: %p\n", (void *)stack); - } else { - /* rbp is the last reg pushed by switch_to(): */ - stack = task->thread.rsp; - pr_debug("other task rsp: %p\n", (void *)stack); - stack = (unsigned long)(((unsigned long *)stack)[0]); - pr_debug("other task rbp: %p\n", (void *)stack); - } - - while (1) { - unsigned long stack_end = get_stack_end(task, stack); - - pr_debug("stack: %p\n", (void *)stack); - pr_debug("stack end: %p\n", (void *)stack_end); - - /* - * Invalid stack addres? - */ - if (!stack_end) - return; - /* - * Were we in this stack already? 
(recursion) - */ - for (i = 0; i < nr_stacks; i++) - if (stacks_done[i] == stack_end) - return; - stacks_done[nr_stacks] = stack_end; - - stack = save_context_stack(trace, stack, stack_end); - if (!stack || trace->nr_entries >= trace->max_entries) - return; - trace->entries[trace->nr_entries++] = ULONG_MAX; - if (trace->nr_entries >= trace->max_entries) - return; - if (++nr_stacks >= MAX_STACKS) - return; - } + dump_trace(task, NULL, NULL, &save_stack_ops, trace); + trace->entries[trace->nr_entries++] = ULONG_MAX; } +EXPORT_SYMBOL(save_stack_trace); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 9ec2b1d5893d..4ac18b02eada 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -45,6 +45,7 @@ #include <asm/pda.h> #include <asm/proto.h> #include <asm/nmi.h> +#include <asm/stacktrace.h> asmlinkage void divide_error(void); asmlinkage void debug(void); @@ -142,7 +143,7 @@ void printk_address(unsigned long address) #endif static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, const char **idp) + unsigned *usedp, char **idp) { static char ids[][8] = { [DEBUG_STACK - 1] = "#DB", @@ -234,13 +235,19 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, return NULL; } -static int show_trace_unwind(struct unwind_frame_info *info, void *context) +struct ops_and_data { + struct stacktrace_ops *ops; + void *data; +}; + +static int dump_trace_unwind(struct unwind_frame_info *info, void *context) { + struct ops_and_data *oad = (struct ops_and_data *)context; int n = 0; while (unwind(info) == 0 && UNW_PC(info)) { n++; - printk_address(UNW_PC(info)); + oad->ops->address(oad->data, UNW_PC(info)); if (arch_unw_user_mode(info)) break; } @@ -254,45 +261,51 @@ static int show_trace_unwind(struct unwind_frame_info *info, void *context) * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack) +void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack, + struct stacktrace_ops *ops, void *data) { const unsigned cpu = safe_smp_processor_id(); unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; unsigned used = 0; - printk("\nCall Trace:\n"); - if (!tsk) tsk = current; if (call_trace >= 0) { int unw_ret = 0; struct unwind_frame_info info; + struct ops_and_data oad = { .ops = ops, .data = data }; if (regs) { if (unwind_init_frame_info(&info, tsk, regs) == 0) - unw_ret = show_trace_unwind(&info, NULL); + unw_ret = dump_trace_unwind(&info, &oad); } else if (tsk == current) - unw_ret = unwind_init_running(&info, show_trace_unwind, NULL); + unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad); else { if (unwind_init_blocked(&info, tsk) == 0) - unw_ret = show_trace_unwind(&info, NULL); + unw_ret = dump_trace_unwind(&info, &oad); } if (unw_ret > 0) { if (call_trace == 1 && !arch_unw_user_mode(&info)) { - print_symbol("DWARF2 unwinder stuck at %s\n", + ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n", UNW_PC(&info)); if ((long)UNW_SP(&info) < 0) { - printk("Leftover inexact backtrace:\n"); + ops->warning(data, "Leftover inexact backtrace:\n"); stack = (unsigned long *)UNW_SP(&info); } else - printk("Full inexact backtrace again:\n"); + ops->warning(data, "Full inexact backtrace again:\n"); } else if (call_trace >= 1) return; else - printk("Full inexact backtrace again:\n"); + ops->warning(data, "Full inexact backtrace again:\n"); } else - 
printk("Inexact backtrace:\n"); + ops->warning(data, "Inexact backtrace:\n"); + } + if (!stack) { + unsigned long dummy; + stack = &dummy; + if (tsk && tsk != current) + stack = (unsigned long *)tsk->thread.rsp; } /* @@ -312,7 +325,7 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s * down the cause of the crash will be able to figure \ * out the call path that was taken. \ */ \ - printk_address(addr); \ + ops->address(data, addr); \ } \ } while (0) @@ -321,16 +334,17 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s * current stack address. If the stacks consist of nested * exceptions */ - for ( ; ; ) { - const char *id; + for (;;) { + char *id; unsigned long *estack_end; estack_end = in_exception_stack(cpu, (unsigned long)stack, &used, &id); if (estack_end) { - printk(" <%s>", id); + if (ops->stack(data, id) < 0) + break; HANDLE_STACK (stack < estack_end); - printk(" <EOE>"); + ops->stack(data, "<EOE>"); /* * We link to the next stack via the * second-to-last pointer (index -2 to end) in the @@ -345,7 +359,8 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s (IRQSTACKSIZE - 64) / sizeof(*irqstack); if (stack >= irqstack && stack < irqstack_end) { - printk(" <IRQ>"); + if (ops->stack(data, "IRQ") < 0) + break; HANDLE_STACK (stack < irqstack_end); /* * We link to the next stack (which would be @@ -354,7 +369,7 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s */ stack = (unsigned long *) (irqstack_end[-1]); irqstack_end = NULL; - printk(" <EOI>"); + ops->stack(data, "EOI"); continue; } } @@ -362,15 +377,53 @@ void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s } /* - * This prints the process stack: + * This handles the process stack: */ HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0); #undef HANDLE_STACK +} +EXPORT_SYMBOL(dump_trace); +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + print_symbol(msg, symbol); + printk("\n"); +} + +static void print_trace_warning(void *data, char *msg) +{ + printk("%s\n", msg); +} + +static int print_trace_stack(void *data, char *name) +{ + printk(" <%s> ", name); + return 0; +} + +static void print_trace_address(void *data, unsigned long addr) +{ + printk_address(addr); +} + +static struct stacktrace_ops print_trace_ops = { + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, +}; + +void +show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack) +{ + printk("\nCall Trace:\n"); + dump_trace(tsk, regs, stack, &print_trace_ops, NULL); printk("\n"); } -static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp) +static void +_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) { unsigned long *stack; int i; diff --git a/include/asm-x86_64/stacktrace.h b/include/asm-x86_64/stacktrace.h new file mode 100644 index 000000000000..5eb9799bef76 --- /dev/null +++ b/include/asm-x86_64/stacktrace.h @@ -0,0 +1,18 @@ +#ifndef _ASM_STACKTRACE_H +#define _ASM_STACKTRACE_H 1 + +/* Generic stack tracer with callbacks */ + +struct stacktrace_ops { + void (*warning)(void *data, char *msg); + /* msg must contain %s for the symbol */ + void (*warning_symbol)(void *data, char *msg, unsigned long symbol); + void (*address)(void *data, unsigned long address); + /* On negative return stop dumping */ + int 
(*stack)(void *data, char *name); +}; + +void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack, + struct stacktrace_ops *ops, void *data); + +#endif -- cgit v1.2.3 From be7a91709b90825990e571b2f20cea937d5eef6c Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] Check for end of stack trace before falling back Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/traps.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 4ac18b02eada..28e53342f294 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -292,6 +292,8 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s if ((long)UNW_SP(&info) < 0) { ops->warning(data, "Leftover inexact backtrace:\n"); stack = (unsigned long *)UNW_SP(&info); + if (!stack) + return; } else ops->warning(data, "Full inexact backtrace again:\n"); } else if (call_trace >= 1) -- cgit v1.2.3 From d28c4393a7bf558538e9def269c1caeab6ec056f Mon Sep 17 00:00:00 2001 From: "Prasanna S.P" <prasanna@in.ibm.com> Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] x86: error_code is not safe for kprobes This patch moves the entry.S:error_entry to .kprobes.text section, since code marked unsafe for kprobes jumps directly to entry.S::error_entry, that must be marked unsafe as well. This patch also moves all the ".previous.text" asm directives to ".previous" for kprobes section. AK: Following a similar i386 patch from Chuck Ebbert AK: Also merged Jeremy's fix in. +From: Jeremy Fitzhardinge <jeremy@goop.org> KPROBE_ENTRY does a .section .kprobes.text, and expects its users to do a .previous at the end of the function. Unfortunately, if any code within the function switches sections, for example .fixup, then the .previous ends up putting all subsequent code into .fixup. Worse, any subsequent .fixup code gets intermingled with the code its supposed to be fixing (which is also in .fixup). It's surprising this didn't cause more havok. The fix is to use .pushsection/.popsection, so this stuff nests properly. A further cleanup would be to get rid of all .section/.previous pairs, since they're inherently fragile. +From: Chuck Ebbert <76306.1226@compuserve.com> Because code marked unsafe for kprobes jumps directly to entry.S::error_code, that must be marked unsafe as well. The easiest way to do that is to move the page fault entry point to just before error_code and let it inherit the same section. Also moved all the ".previous" asm directives for kprobes sections to column 1 and removed ".text" from them. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/i386/kernel/entry.S | 25 +++++++++++++------------ arch/x86_64/kernel/entry.S | 19 +++++++------------ include/linux/linkage.h | 6 +++++- 3 files changed, 25 insertions(+), 25 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 87f9f60b803b..ba22ec8fab54 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -591,11 +591,9 @@ ENTRY(name) \ /* The include is where all of the SMP etc. 
interrupts come from */ #include "entry_arch.h" -ENTRY(divide_error) - RING0_INT_FRAME - pushl $0 # no error code - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_divide_error +KPROBE_ENTRY(page_fault) + RING0_EC_FRAME + pushl $do_page_fault CFI_ADJUST_CFA_OFFSET 4 ALIGN error_code: @@ -645,6 +643,7 @@ error_code: call *%edi jmp ret_from_exception CFI_ENDPROC +KPROBE_END(page_fault) ENTRY(coprocessor_error) RING0_INT_FRAME @@ -720,7 +719,8 @@ debug_stack_correct: call do_debug jmp ret_from_exception CFI_ENDPROC - .previous .text +KPROBE_END(debug) + /* * NMI is doubly nasty. It can happen _while_ we're handling * a debug fault, and the debug fault hasn't yet been able to @@ -816,7 +816,7 @@ KPROBE_ENTRY(int3) call do_int3 jmp ret_from_exception CFI_ENDPROC - .previous .text +KPROBE_END(int3) ENTRY(overflow) RING0_INT_FRAME @@ -881,7 +881,7 @@ KPROBE_ENTRY(general_protection) CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC - .previous .text +KPROBE_END(general_protection) ENTRY(alignment_check) RING0_EC_FRAME @@ -890,13 +890,14 @@ ENTRY(alignment_check) jmp error_code CFI_ENDPROC -KPROBE_ENTRY(page_fault) - RING0_EC_FRAME - pushl $do_page_fault +ENTRY(divide_error) + RING0_INT_FRAME + pushl $0 # no error code + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_divide_error CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC - .previous .text #ifdef CONFIG_X86_MCE ENTRY(machine_check) diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 2092f565aa87..780f9b26169f 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -819,7 +819,7 @@ paranoid_schedule\trace: * Exception entry point. This expects an error code/orig_rax on the stack * and the exception handler in %rax. */ -ENTRY(error_entry) +KPROBE_ENTRY(error_entry) _frame RDI /* rdi slot contains rax, oldrax contains error code */ cld @@ -903,7 +903,7 @@ error_kernelspace: cmpq $gs_change,RIP(%rsp) je error_swapgs jmp error_sti -END(error_entry) +KPROBE_END(error_entry) /* Reload gs selector with exception handling */ /* edi: new selector */ @@ -1025,8 +1025,7 @@ ENDPROC(execve) KPROBE_ENTRY(page_fault) errorentry do_page_fault -END(page_fault) - .previous .text +KPROBE_END(page_fault) ENTRY(coprocessor_error) zeroentry do_coprocessor_error @@ -1047,8 +1046,7 @@ KPROBE_ENTRY(debug) CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_debug, DEBUG_STACK paranoidexit -END(debug) - .previous .text +KPROBE_END(debug) /* runs on exception stack */ KPROBE_ENTRY(nmi) @@ -1062,8 +1060,7 @@ KPROBE_ENTRY(nmi) jmp paranoid_exit1 CFI_ENDPROC #endif -END(nmi) - .previous .text +KPROBE_END(nmi) KPROBE_ENTRY(int3) INTR_FRAME @@ -1072,8 +1069,7 @@ KPROBE_ENTRY(int3) paranoidentry do_int3, DEBUG_STACK jmp paranoid_exit1 CFI_ENDPROC -END(int3) - .previous .text +KPROBE_END(int3) ENTRY(overflow) zeroentry do_overflow @@ -1121,8 +1117,7 @@ END(stack_segment) KPROBE_ENTRY(general_protection) errorentry do_general_protection -END(general_protection) - .previous .text +KPROBE_END(general_protection) ENTRY(alignment_check) errorentry do_alignment_check diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 932021f872d5..6c9873f88287 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -35,9 +35,13 @@ #endif #define KPROBE_ENTRY(name) \ - .section .kprobes.text, "ax"; \ + .pushsection .kprobes.text, "ax"; \ ENTRY(name) +#define KPROBE_END(name) \ + END(name); \ + .popsection + #ifndef END #define END(name) \ .size name, .-name -- cgit v1.2.3 From cbf9b4bb76c9ce53b7fdde0dffcd000951b5f0d4 Mon Sep 17 00:00:00 2001 From: Dimitri 
Sivanich <sivanich@sgi.com> Date: Tue, 26 Sep 2006 10:52:34 +0200 Subject: [PATCH] X86_64 monotonic_clock goes backwards I've noticed some erratic behavior while testing the X86_64 version of monotonic_clock(). While spinning in a loop reading monotonic clock values (pinned to a single cpu) I noticed that the difference between subsequent values occasionally went negative (time going backwards). I found that in the following code: this_offset = get_cycles_sync(); /* FIXME: 1000 or 1000000? */ --> offset = (this_offset - last_offset)*1000 / cpu_khz; } return base + offset; the offset sometimes turns out to be 0, even though this_offset > last_offset. +Added fix From: Toyo Abe <toyoa@mvista.com> The x86_64-mm-monotonic-clock.patch in 2.6.18-rc4-mm2 made a change to the updating of monotonic_base. It now uses cycles_2_ns(). I suggest that a set_cyc2ns_scale() should be done prior to the setup_irq(). Because cycles_2_ns() can be called from the timer ISR right after the irq0 is enabled. Signed-off-by: Toyo Abe <toyoa@mvista.com> Signed-off-by: Dimitri Sivanich <sivanich@sgi.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/time.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index d66c7f750e75..97115e608ed8 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -276,6 +276,7 @@ static void set_rtc_mmss(unsigned long nowtime) * Note: This function is required to return accurate * time even in the absence of multiple timer ticks. */ +static inline unsigned long long cycles_2_ns(unsigned long long cyc); unsigned long long monotonic_clock(void) { unsigned long seq; @@ -300,8 +301,7 @@ unsigned long long monotonic_clock(void) base = monotonic_base; } while (read_seqretry(&xtime_lock, seq)); this_offset = get_cycles_sync(); - /* FIXME: 1000 or 1000000? */ - offset = (this_offset - last_offset)*1000 / cpu_khz; + offset = cycles_2_ns(this_offset - last_offset); } return base + offset; } @@ -405,8 +405,7 @@ void main_timer_handler(struct pt_regs *regs) offset %= USEC_PER_TICK; } - /* FIXME: 1000 or 1000000? */ - monotonic_base += (tsc - vxtime.last_tsc) * 1000000 / cpu_khz; + monotonic_base += cycles_2_ns(tsc - vxtime.last_tsc); vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; @@ -929,10 +928,8 @@ void __init time_init(void) vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; vxtime.last_tsc = get_cycles_sync(); - setup_irq(0, &irq0); - set_cyc2ns_scale(cpu_khz); - + setup_irq(0, &irq0); hotcpu_notifier(time_cpu_notifier, 0); time_cpu_notifier(NULL, CPU_ONLINE, (void *)(long)smp_processor_id()); -- cgit v1.2.3 From 1edf777803bdd2aeeb04cf44508fd9b88737fba8 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] i386/x86-64: Improve Kconfig description of CRASH_DUMP Improve Kconfig description of CONFIG_CRASH_DUMP. Previously it was too brief to be useful. Cc: vgoyal@in.ibm.com Cc: ebiederm@xmission.com Signed-off-by: Andi Kleen <ak@suse.de> --- arch/i386/Kconfig | 7 +++++++ arch/x86_64/Kconfig | 9 ++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index c134a545b37e..e34a4dbd6b92 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -761,6 +761,13 @@ config CRASH_DUMP depends on HIGHMEM help Generate crash dump after being started by kexec. 
+ This should be normally only set in special crash dump kernels + which are loaded in the main kernel with kexec-tools into + a specially reserved region and then later executed after + a crash by kdump/kexec. The crash dump kernel must be compiled + to a memory address not used by the main kernel or BIOS using + PHYSICAL_START. + For more details see Documentation/kdump/kdump.txt config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 6cd4878625f1..927b392a4aa0 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -488,7 +488,14 @@ config CRASH_DUMP bool "kernel crash dumps (EXPERIMENTAL)" depends on EXPERIMENTAL help - Generate crash dump after being started by kexec. + Generate crash dump after being started by kexec. + This should be normally only set in special crash dump kernels + which are loaded in the main kernel with kexec-tools into + a specially reserved region and then later executed after + a crash by kdump/kexec. The crash dump kernel must be compiled + to a memory address not used by the main kernel or BIOS using + PHYSICAL_START. + For more details see Documentation/kdump/kdump.txt config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) -- cgit v1.2.3 From 2717941c6a1d8fa543ddca337d450ab30ef91543 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] Make boot_param_data pure BSS Since it's all zero. Actually I think gcc 4+ will do that automatically, but earlier compilers won't Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/setup64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 646caa0a20bb..b09e60fa96b4 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -24,7 +24,7 @@ #include <asm/proto.h> #include <asm/sections.h> -char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; +char x86_boot_params[BOOT_PARAM_SIZE] __initdata; cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; -- cgit v1.2.3 From 273819a2d982faace30e587b86a0683882251fe7 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] make fault notifier unconditional and export it It's needed for external debuggers and overhead is very small. 
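An external debugger module would use it roughly as follows (a sketch under assumptions, not code from this patch: the register/unregister calls and NOTIFY_DONE are taken from the diff below, while the module boilerplate, the handler body, and the <asm/kdebug.h> include for struct die_args are my assumptions about the surrounding tree):

#include <linux/module.h>
#include <linux/notifier.h>
#include <asm/kdebug.h>         /* struct die_args; assumed location */

static int my_pf_notify(struct notifier_block *nb, unsigned long val,
                        void *data)
{
        struct die_args *args = data;   /* regs, error code, trap number */

        /* inspect args->regs, args->err, ... here */
        return NOTIFY_DONE;             /* let the normal fault path run */
}

static struct notifier_block my_pf_nb = {
        .notifier_call = my_pf_notify,
};

static int __init my_debugger_init(void)
{
        return register_page_fault_notifier(&my_pf_nb);
}

static void __exit my_debugger_exit(void)
{
        unregister_page_fault_notifier(&my_pf_nb);
}

module_init(my_debugger_init);
module_exit(my_debugger_exit);
MODULE_LICENSE("GPL");          /* the exports are EXPORT_SYMBOL_GPL */
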
Also make the actual notifier chain they use static Cc: jbeulich@novell.com Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/mm/fault.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 449a437f5d70..3ae3e7bb2fce 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -40,8 +40,7 @@ #define PF_RSVD (1<<3) #define PF_INSTR (1<<4) -#ifdef CONFIG_KPROBES -ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); +static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); /* Hook to register for page fault notifications */ int register_page_fault_notifier(struct notifier_block *nb) @@ -49,11 +48,13 @@ int register_page_fault_notifier(struct notifier_block *nb) vmalloc_sync_all(); return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); } +EXPORT_SYMBOL_GPL(register_page_fault_notifier); int unregister_page_fault_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); } +EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); static inline int notify_page_fault(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) @@ -67,13 +68,6 @@ static inline int notify_page_fault(enum die_val val, const char *str, }; return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); } -#else -static inline int notify_page_fault(enum die_val val, const char *str, - struct pt_regs *regs, long err, int trap, int sig) -{ - return NOTIFY_DONE; -} -#endif void bust_spinlocks(int yes) { -- cgit v1.2.3 From ba4d40bb5c465f0a4dcc30d02dab80c2cb7e1ff3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" <ebiederm@xmission.com> Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] Auto size the per cpu area. Now for a completely different but trivial approach. I just boot tested it with 255 CPUS and everything worked. Currently everything (except module data) we place in the per cpu area we know about at compile time. So instead of allocating a fixed size for the per_cpu area allocate the number of bytes we need plus a fixed constant for to be used for modules. It isn't perfect but it is much less of a pain to work with than what we are doing now. AK: fixed warning Signed-off-by: Eric W. 
Biederman <ebiederm@xmission.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/setup64.c | 7 ++----- include/asm-x86_64/percpu.h | 10 ++++++++++ 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index b09e60fa96b4..e85cfbb49b63 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -95,12 +95,9 @@ void __init setup_per_cpu_areas(void) #endif /* Copy section for each CPU (we discard the original) */ - size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); -#ifdef CONFIG_MODULES - if (size < PERCPU_ENOUGH_ROOM) - size = PERCPU_ENOUGH_ROOM; -#endif + size = PERCPU_ENOUGH_ROOM; + printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size); for_each_cpu_mask (i, cpu_possible_map) { char *ptr; diff --git a/include/asm-x86_64/percpu.h b/include/asm-x86_64/percpu.h index 08dd9f9dda81..39d2bab9b520 100644 --- a/include/asm-x86_64/percpu.h +++ b/include/asm-x86_64/percpu.h @@ -11,6 +11,16 @@ #include <asm/pda.h> +#ifdef CONFIG_MODULES +# define PERCPU_MODULE_RESERVE 8192 +#else +# define PERCPU_MODULE_RESERVE 0 +#endif + +#define PERCPU_ENOUGH_ROOM \ + (ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \ + PERCPU_MODULE_RESERVE) + #define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset) #define __my_cpu_offset() read_pda(data_offset) -- cgit v1.2.3 From 40bee2ee73c745922e9b2d5595c46f19d1cf1b6f Mon Sep 17 00:00:00 2001 From: Brice Goglin <brice@myri.com> Date: Tue, 26 Sep 2006 10:52:35 +0200 Subject: [PATCH] fix bus numbering format in mmconfig warning Make an mmconfig warning print the bus id with a regular format. Signed-off-by: Brice Goglin <brice@myri.com> Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@osdl.org> --- arch/x86_64/pci/mmconfig.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index 3c55c76c6fd5..dd78f4d18df1 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -156,9 +156,8 @@ static __init void unreachable_devices(void) addr = pci_dev_base(0, k, PCI_DEVFN(i, 0)); if (addr == NULL|| readl(addr) != val1) { set_bit(i + 32*k, fallback_slots); - printk(KERN_NOTICE - "PCI: No mmconfig possible on device %x:%x\n", - k, i); + printk(KERN_NOTICE "PCI: No mmconfig possible" + " on device %02x:%02x\n", k, i); } } } -- cgit v1.2.3 From e07e23e1fd3000289fc7ccc6c71879070d3b19e0 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven <arjan@linux.intel.com> Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] non lazy "sleazy" fpu implementation Right now the kernel on x86-64 has a 100% lazy fpu behavior: after *every* context switch a trap is taken for the first FPU use to restore the FPU context lazily. This is of course great for applications that have very sporadic or no FPU use (since then you avoid doing the expensive save/restore all the time). However for very frequent FPU users... you take an extra trap every context switch. The patch below adds a simple heuristic to this code: After 5 consecutive context switches of FPU use, the lazy behavior is disabled and the context gets restored every context switch. If the app indeed uses the FPU, the trap is avoided. (the chance of the 6th time slice using FPU after the previous 5 having done so are quite high obviously). After 256 switches, this is reset and lazy behavior is returned (until there are 5 consecutive ones again). 
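Stripped of the surrounding context switch code, the heuristic amounts to the following sketch (a simplification for illustration: in the patch itself the counter is incremented in math_state_restore(), cleared in unlazy_fpu() when the FPU went unused, and tested in __switch_to(); restore_fpu_now() is a hypothetical stand-in for the eager restore):

/*
 * fpu_counter is an unsigned char, so it wraps back to 0 after 256
 * consecutive FPU-using context switches.
 */
static void switch_in_fpu(struct task_struct *next, int used_fpu)
{
        if (!used_fpu) {
                next->fpu_counter = 0;          /* back to fully lazy */
                return;
        }
        if (++next->fpu_counter > 5)
                restore_fpu_now(next);          /* skip the lazy trap */
}
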
The reason for this is to give apps that do longer bursts of FPU use still the lazy behavior back after some time. [akpm@osdl.org: place new task_struct field next to jit_keyring to save space] Signed-off-by: Arjan van de Ven <arjan@linux.intel.com> Signed-off-by: Andi Kleen <ak@suse.de> Cc: Andi Kleen <ak@muc.de> Signed-off-by: Andrew Morton <akpm@osdl.org> --- arch/x86_64/kernel/process.c | 10 ++++++++++ arch/x86_64/kernel/traps.c | 1 + include/asm-x86_64/i387.h | 5 ++++- include/linux/sched.h | 9 +++++++++ 4 files changed, 24 insertions(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 6fbd19564e4e..9e9a70e50c72 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -552,6 +552,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); + /* we're going to use this soon, after a few expensive things */ + if (next_p->fpu_counter>5) + prefetch(&next->i387.fxsave); + /* * Reload esp0, LDT and the page table pointer: */ @@ -629,6 +633,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) __switch_to_xtra(prev_p, next_p, tss); + /* If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + if (next_p->fpu_counter>5) + math_state_restore(); return prev_p; } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 28e53342f294..ffc40cff1e07 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -1136,6 +1136,7 @@ asmlinkage void math_state_restore(void) init_fpu(me); restore_fpu_checking(&me->thread.i387.fxsave); task_thread_info(me)->status |= TS_USEDFPU; + me->fpu_counter++; } void __init trap_init(void) diff --git a/include/asm-x86_64/i387.h b/include/asm-x86_64/i387.h index cba8a3b0cded..60c0f4853fdb 100644 --- a/include/asm-x86_64/i387.h +++ b/include/asm-x86_64/i387.h @@ -24,6 +24,7 @@ extern unsigned int mxcsr_feature_mask; extern void mxcsr_feature_mask_init(void); extern void init_fpu(struct task_struct *child); extern int save_i387(struct _fpstate __user *buf); +extern asmlinkage void math_state_restore(void); /* * FPU lazy state save handling... @@ -31,7 +32,9 @@ extern int save_i387(struct _fpstate __user *buf); #define unlazy_fpu(tsk) do { \ if (task_thread_info(tsk)->status & TS_USEDFPU) \ - save_init_fpu(tsk); \ + save_init_fpu(tsk); \ + else \ + tsk->fpu_counter = 0; \ } while (0) /* Ignore delayed exceptions from user space */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 34ed0d99b1bd..807556c5bcd2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -865,6 +865,15 @@ struct task_struct { struct key *thread_keyring; /* keyring private to this thread */ unsigned char jit_keyring; /* default keyring to attach requested keys to */ #endif + /* + * fpu_counter contains the number of consecutive context switches + * that the FPU is used. If this is over a threshold, the lazy fpu + * saving becomes unlazy to save the trap. This is an unsigned char + * so that after 256 times the counter wraps and the behavior turns + * lazy again; this to deal with bursty apps that only use FPU for + * a short time + */ + unsigned char fpu_counter; int oomkilladj; /* OOM kill score adjustment (bit shift). 
*/ char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock -- cgit v1.2.3 From abf0f10948b316b577851ef21c728341f1046552 Mon Sep 17 00:00:00 2001 From: Andrew Morton <akpm@osdl.org> Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] wire up oops_enter()/oops_exit() Implement pause_on_oops() on x86_64. AK: I redid the patch to do the oops_enter/exit in the existing oops_begin()/end(). This makes it much shorter. Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/traps.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index ffc40cff1e07..34660b1e2720 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -561,6 +561,8 @@ unsigned __kprobes long oops_begin(void) int cpu = safe_smp_processor_id(); unsigned long flags; + oops_enter(); + /* racy, but better than risking deadlock. */ local_irq_save(flags); if (!spin_trylock(&die_lock)) { @@ -589,6 +591,7 @@ void __kprobes oops_end(unsigned long flags) spin_unlock_irqrestore(&die_lock, flags); if (panic_on_oops) panic("Fatal exception"); + oops_exit(); } void __kprobes __die(const char * str, struct pt_regs * regs, long err) -- cgit v1.2.3 From 6ad916581181a105d7832a7dec9e1eb58f7a1621 Mon Sep 17 00:00:00 2001 From: Keith Mannthey <kmannth@us.ibm.com> Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] x86_64 kernel mapping fix Fix for the x86_64 kernel mapping code. Without this patch the update path only inits one pmd_page worth of memory and tramples any entries on it. now the calling convention to phys_pmd_init and phys_init is to always pass a [pmd/pud] page not an offset within a page. Signed-off-by: Keith Mannthey<kmannth@us.ibm.com> Signed-off-by: Andi Kleen <ak@suse.de> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@osdl.org> --- arch/x86_64/mm/init.c | 51 ++++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 25 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index d40134bd6399..984155b75e4c 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -250,12 +250,13 @@ __init void early_iounmap(void *addr, unsigned long size) } static void __meminit -phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end) +phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) { - int i; + int i = pmd_index(address); - for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) { + for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { unsigned long entry; + pmd_t *pmd = pmd_page + pmd_index(address); if (address >= end) { if (!after_bootmem) @@ -263,6 +264,10 @@ phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end) set_pmd(pmd, __pmd(0)); break; } + + if (pmd_val(*pmd)) + continue; + entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address; entry &= __supported_pte_mask; set_pmd(pmd, __pmd(entry)); @@ -272,45 +277,41 @@ phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end) static void __meminit phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) { - pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address)); - - if (pmd_none(*pmd)) { - spin_lock(&init_mm.page_table_lock); - phys_pmd_init(pmd, address, end); - spin_unlock(&init_mm.page_table_lock); - __flush_tlb_all(); - } + pmd_t *pmd = pmd_offset(pud,0); + 
spin_lock(&init_mm.page_table_lock); + phys_pmd_init(pmd, address, end); + spin_unlock(&init_mm.page_table_lock); + __flush_tlb_all(); } -static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) +static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) { - long i = pud_index(address); - - pud = pud + i; + int i = pud_index(addr); - if (after_bootmem && pud_val(*pud)) { - phys_pmd_update(pud, address, end); - return; - } - for (; i < PTRS_PER_PUD; pud++, i++) { + for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) { int map; - unsigned long paddr, pmd_phys; + unsigned long pmd_phys; + pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; - paddr = (address & PGDIR_MASK) + i*PUD_SIZE; - if (paddr >= end) + if (addr >= end) break; - if (!after_bootmem && !e820_any_mapped(paddr, paddr+PUD_SIZE, 0)) { + if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) { set_pud(pud, __pud(0)); continue; } + if (pud_val(*pud)) { + phys_pmd_update(pud, addr, end); + continue; + } + pmd = alloc_low_page(&map, &pmd_phys); spin_lock(&init_mm.page_table_lock); set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); - phys_pmd_init(pmd, paddr, end); + phys_pmd_init(pmd, addr, end); spin_unlock(&init_mm.page_table_lock); unmap_low_page(map); } -- cgit v1.2.3 From ed77504b2007ff7ce56841227467ac3ead52df62 Mon Sep 17 00:00:00 2001 From: Magnus Damm <magnus@valinux.co.jp> Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] mark init_amd() as __cpuinit The init_amd() function is only called from identify_cpu() which is already marked as __cpuinit. So let's mark it as __cpuinit. Signed-off-by: Magnus Damm <magnus@valinux.co.jp> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 4be29832e3dc..e328e3eb8cbe 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -672,7 +672,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) #endif } -static void __init init_amd(struct cpuinfo_x86 *c) +static void __cpuinit init_amd(struct cpuinfo_x86 *c) { unsigned level; -- cgit v1.2.3 From dbf9272e863bf4b17ee8e3c66c26682b2061d40d Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:36 +0200 Subject: [PATCH] Don't force reserve the 640k-1MB range From i386 x86-64 inherited code to force reserve the 640k-1MB area. That was needed on some old systems. But we generally trust the e820 map to be correct on 64bit systems and mark all areas that are not memory correctly. This patch will allow to use the real memory in there. Or rather the only way to find out if it's still needed is to try. So far I'm optimistic. 
Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/e820.c | 33 +-------------------------------- 1 file changed, 1 insertion(+), 32 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 164d0b83dc92..e06c2714ecf3 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -71,12 +71,7 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size) return 1; } #endif - /* kernel code + 640k memory hole (later should not be needed, but - be paranoid for now) */ - if (last >= 640*1024 && addr < 1024*1024) { - *addrp = 1024*1024; - return 1; - } + /* kernel code */ if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) { *addrp = __pa_symbol(&_end); return 1; @@ -519,13 +514,6 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) * If we're lucky and live on a modern system, the setup code * will have given us a memory map that we can use to properly * set up memory. If we aren't, we'll fake a memory map. - * - * We check to see that the memory map contains at least 2 elements - * before we'll use it, because the detection code in setup.S may - * not be perfect and most every PC known to man has two memory - * regions: one from 0 to 640k, and one from 1mb up. (The IBM - * thinkpad 560x, for example, does not cooperate with the memory - * detection code.) */ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) { @@ -543,25 +531,6 @@ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) if (start > end) return -1; - /* - * Some BIOSes claim RAM in the 640k - 1M region. - * Not right. Fix it up. - * - * This should be removed on Hammer which is supposed to not - * have non e820 covered ISA mappings there, but I had some strange - * problems so it stays for now. -AK - */ - if (type == E820_RAM) { - if (start < 0x100000ULL && end > 0xA0000ULL) { - if (start < 0xA0000ULL) - add_memory_region(start, 0xA0000ULL-start, type); - if (end <= 0x100000ULL) - continue; - start = 0x100000ULL; - size = end - start; - } - } - add_memory_region(start, size, type); } while (biosmap++,--nr_map); return 0; -- cgit v1.2.3 From 1164c9994fe37d5b7035a5cf9328c98dd38af7b1 Mon Sep 17 00:00:00 2001 From: Andrew Morton <akpm@osdl.org> Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] make numa_emulation() __init Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/mm/numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index d64d6d93d04b..322bf45fc36a 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -225,7 +225,7 @@ void __init numa_init_array(void) int numa_fake __initdata = 0; /* Numa emulation */ -static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn) +static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) { int i; struct bootnode nodes[MAX_NUMNODES]; -- cgit v1.2.3 From 34464a5b8937b79801776dfb6970c1b949fed4be Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" <rjw@sisk.pl> Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Detect clock skew during suspend Detect the situations in which the time after a resume from disk would be earlier than the time before the suspend and prevent them from happening on x86_64. Signed-off-by: Rafael J. 
Wysocki <rjw@sisk.pl> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/time.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 97115e608ed8..9dd15d12b659 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -1039,8 +1039,16 @@ static int timer_resume(struct sys_device *dev) unsigned long flags; unsigned long sec; unsigned long ctime = get_cmos_time(); - unsigned long sleep_length = (ctime - sleep_start) * HZ; + long sleep_length = (ctime - sleep_start) * HZ; + if (sleep_length < 0) { + printk(KERN_WARNING "Time skew detected in timer resume!\n"); + /* The time after the resume must not be earlier than the time + * before the suspend or some nasty things will happen + */ + sleep_length = 0; + ctime = sleep_start; + } if (vxtime.hpet_address) hpet_reenable(); else -- cgit v1.2.3 From 151f8cc1169f9052095b2be36183ab132d75c6c2 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Remove safe_smp_processor_id() And replace all users with ordinary smp_processor_id. The function was originally added to get some basic oops information out even if the GS register was corrupted. However that didn't work for some anymore because printk is needed to print the oops and it uses smp_processor_id() already. Also GS register corruptions are not particularly common anymore. This also helps the Xen port which would otherwise need to do this in a special way because it can't access the local APIC. Cc: Chris Wright <chrisw@sous-sol.org> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/mce.c | 2 +- arch/x86_64/kernel/smp.c | 23 ----------------------- arch/x86_64/kernel/traps.c | 10 +++++----- include/asm-x86_64/smp.h | 2 -- 4 files changed, 6 insertions(+), 31 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 4e017fb30fb3..1a93c3738404 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -182,7 +182,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) goto out2; memset(&m, 0, sizeof(struct mce)); - m.cpu = safe_smp_processor_id(); + m.cpu = smp_processor_id(); rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); if (!(m.mcgstatus & MCG_STATUS_RIPV)) kill_it = 1; diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 2df988bbf312..4f67697f5036 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -522,26 +522,3 @@ asmlinkage void smp_call_function_interrupt(void) } } -int safe_smp_processor_id(void) -{ - unsigned apicid, i; - - if (disable_apic || !apic_mapped) - return 0; - - apicid = hard_smp_processor_id(); - if (apicid < NR_CPUS && x86_cpu_to_apicid[apicid] == apicid) - return apicid; - - for (i = 0; i < NR_CPUS; ++i) { - if (x86_cpu_to_apicid[i] == apicid) - return i; - } - - /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI, - * or called too early. Either way, we must be CPU 0. 
*/ - if (x86_cpu_to_apicid[0] == BAD_APICID) - return 0; - - return 0; /* Should not happen */ -} diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 34660b1e2720..38bc821e457b 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -264,7 +264,7 @@ static int dump_trace_unwind(struct unwind_frame_info *info, void *context) void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack, struct stacktrace_ops *ops, void *data) { - const unsigned cpu = safe_smp_processor_id(); + const unsigned cpu = smp_processor_id(); unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; unsigned used = 0; @@ -429,7 +429,7 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) { unsigned long *stack; int i; - const int cpu = safe_smp_processor_id(); + const int cpu = smp_processor_id(); unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); @@ -483,7 +483,7 @@ void show_registers(struct pt_regs *regs) int i; int in_kernel = !user_mode(regs); unsigned long rsp; - const int cpu = safe_smp_processor_id(); + const int cpu = smp_processor_id(); struct task_struct *cur = cpu_pda(cpu)->pcurrent; rsp = regs->rsp; @@ -558,7 +558,7 @@ static unsigned int die_nest_count; unsigned __kprobes long oops_begin(void) { - int cpu = safe_smp_processor_id(); + int cpu = smp_processor_id(); unsigned long flags; oops_enter(); @@ -636,7 +636,7 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) * We are in trouble anyway, lets at least try * to get a message out. */ - printk(str, safe_smp_processor_id()); + printk(str, smp_processor_id()); show_registers(regs); if (kexec_should_crash(current)) crash_kexec(regs); diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h index 498fbc1fc179..58b5d6149a42 100644 --- a/include/asm-x86_64/smp.h +++ b/include/asm-x86_64/smp.h @@ -66,7 +66,6 @@ static inline int hard_smp_processor_id(void) return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID)); } -extern int safe_smp_processor_id(void); extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); extern void prefill_possible_map(void); @@ -100,7 +99,6 @@ static inline int cpu_present_to_apicid(int mps_cpu) #ifndef CONFIG_SMP #define stack_smp_processor_id() 0 -#define safe_smp_processor_id() 0 #define cpu_logical_map(x) (x) #else #include <asm/thread_info.h> -- cgit v1.2.3 From d3cf7f061521c78ad62e275eb6fbdc8f43fc75a7 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Remove bogus warning from early_ioremap It is correct for its only caller right now, but not for possible future others. 
Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/mm/init.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 984155b75e4c..53b49c86d7e0 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -229,7 +229,6 @@ __init void *early_ioremap(unsigned long addr, unsigned long size) /* actually usually some more */ if (size >= LARGE_PAGE_SIZE) { - printk("SMBIOS area too long %lu\n", size); return NULL; } set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); -- cgit v1.2.3 From df992848f5aa803fcacd2c5e7d67034bb89e3fa3 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Fix pte_exec/mkexec and use it in change_page_attr() Fix the pte_exec/mkexec page table accessor functions to really use the NX bit. Previously they only checked the USER bit, but weren't actually used for anything. Then use them in change_page_attr() to manipulate the NX bit properly. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/mm/pageattr.c | 8 +++++--- include/asm-x86_64/pgtable.h | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index 2685b1f3671c..01591b649f4e 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -190,10 +190,12 @@ int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) * lowmem */ if (__pa(address) < KERNEL_TEXT_SIZE) { unsigned long addr2; - pgprot_t prot2 = prot; + pgprot_t prot2; addr2 = __START_KERNEL_map + __pa(address); - pgprot_val(prot2) &= ~_PAGE_NX; - err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC); + /* Make sure the kernel mappings stay executable */ + prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); + err = __change_page_attr(addr2, pfn, prot2, + PAGE_KERNEL_EXEC); } } up_write(&init_mm.mmap_sem); diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index d79e7441b513..f7614670c655 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -265,7 +265,7 @@ static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) #define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT) static inline int pte_user(pte_t pte) { return pte_val(pte) & _PAGE_USER; } static inline int pte_read(pte_t pte) { return pte_val(pte) & _PAGE_USER; } -static inline int pte_exec(pte_t pte) { return pte_val(pte) & _PAGE_USER; } +static inline int pte_exec(pte_t pte) { return !(pte_val(pte) & _PAGE_NX); } static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_RW; } @@ -278,7 +278,7 @@ static inline pte_t pte_mkclean(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & static inline pte_t pte_mkold(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_ACCESSED)); return pte; } static inline pte_t pte_wrprotect(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_RW)); return pte; } static inline pte_t pte_mkread(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_USER)); return pte; } -static inline pte_t pte_mkexec(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_USER)); return pte; } +static inline pte_t pte_mkexec(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_NX)); return pte; } static inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); 
return pte; } static inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; } static inline pte_t pte_mkwrite(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; } -- cgit v1.2.3 From 5e6b0bfe5b452957b7be4b6ef181cd41880f8359 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Use proper accessors to change PSE bits in change_page_attr() Use normal pte accessors in change_page_attr() to access the PSE bits. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/mm/pageattr.c | 16 ++++++---------- include/asm-x86_64/pgtable.h | 1 + 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index 01591b649f4e..3e231d762aaa 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -108,8 +108,8 @@ static void revert_page(unsigned long address, pgprot_t ref_prot) BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, address); BUG_ON(pmd_val(*pmd) & _PAGE_PSE); - pgprot_val(ref_prot) |= _PAGE_PSE; large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot); + large_pte = pte_mkhuge(large_pte); set_pte((pte_t *)pmd, large_pte); } @@ -119,32 +119,28 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, { pte_t *kpte; struct page *kpte_page; - unsigned kpte_flags; pgprot_t ref_prot2; kpte = lookup_address(address); if (!kpte) return 0; kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); - kpte_flags = pte_val(*kpte); if (pgprot_val(prot) != pgprot_val(ref_prot)) { - if ((kpte_flags & _PAGE_PSE) == 0) { + if (!pte_huge(*kpte)) { set_pte(kpte, pfn_pte(pfn, prot)); } else { /* * split_large_page will take the reference for this * change_page_attr on the split page. */ - struct page *split; - ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE)); - + ref_prot2 = pte_pgprot(pte_clrhuge(*kpte)); split = split_large_page(address, prot, ref_prot2); if (!split) return -ENOMEM; - set_pte(kpte,mk_pte(split, ref_prot2)); + set_pte(kpte, mk_pte(split, ref_prot2)); kpte_page = split; - } + } page_private(kpte_page)++; - } else if ((kpte_flags & _PAGE_PSE) == 0) { + } else if (!pte_huge(*kpte)) { set_pte(kpte, pfn_pte(pfn, ref_prot)); BUG_ON(page_private(kpte_page) == 0); page_private(kpte_page)--; diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index f7614670c655..b34f43acdef1 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -283,6 +283,7 @@ static inline pte_t pte_mkdirty(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | static inline pte_t pte_mkyoung(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; } static inline pte_t pte_mkwrite(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; } static inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_PSE)); return pte; } +static inline pte_t pte_clrhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_PSE)); return pte; } struct vm_area_struct; -- cgit v1.2.3 From f2c2cca3acef8b253a36381d9b469ad4fb08563a Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Remove APIC version/cpu capability mpparse checking/printing ACPI went to great trouble to get the APIC version and CPU capabilities of different CPUs before passing them to the mpparser. 
But all that data was used was to print it out. Actually it even faked some data based on the boot cpu, not on the actual CPU being booted. Remove all this code because it's not needed. Cc: len.brown@intel.com Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/io_apic.c | 13 ----------- arch/x86_64/kernel/mpparse.c | 52 +++++++++++++++----------------------------- 2 files changed, 17 insertions(+), 48 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index afac3dbb3729..0491019d4c8d 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -1748,19 +1748,6 @@ device_initcall(ioapic_init_sysfs); #define IO_APIC_MAX_ID 0xFE -int __init io_apic_get_version (int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.version; -} - - int __init io_apic_get_redir_entries (int ioapic) { union IO_APIC_reg_01 reg_01; diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index d63f849aea2c..32b6977f80c0 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -41,7 +41,6 @@ int acpi_found_madt; * Various Linux-internal data structures created from the * MP-table. */ -unsigned char apic_version [MAX_APICS]; DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; @@ -94,24 +93,21 @@ static int __init mpf_checksum(unsigned char *mp, int len) static void __cpuinit MP_processor_info (struct mpc_config_processor *m) { int cpu; - unsigned char ver; cpumask_t tmp_map; + char *bootup_cpu = ""; if (!(m->mpc_cpuflag & CPU_ENABLED)) { disabled_cpus++; return; } - printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n", - m->mpc_apicid, - (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8, - (m->mpc_cpufeature & CPU_MODEL_MASK)>>4, - m->mpc_apicver); - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { - Dprintk(" Bootup CPU\n"); + bootup_cpu = " (Bootup-CPU)"; boot_cpu_id = m->mpc_apicid; } + + printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu); + if (num_processors >= NR_CPUS) { printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." " Processor ignored.\n", NR_CPUS); @@ -129,17 +125,7 @@ static void __cpuinit MP_processor_info (struct mpc_config_processor *m) return; } #endif - ver = m->mpc_apicver; - physid_set(m->mpc_apicid, phys_cpu_present_map); - /* - * Validate version - */ - if (ver == 0x0) { - printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid); - ver = 0x10; - } - apic_version[m->mpc_apicid] = ver; if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { /* * bios_cpu_apicid is required to have processors listed @@ -179,8 +165,8 @@ static void __init MP_ioapic_info (struct mpc_config_ioapic *m) if (!(m->mpc_flags & MPC_APIC_USABLE)) return; - printk("I/O APIC #%d Version %d at 0x%X.\n", - m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); + printk("I/O APIC #%d at 0x%X.\n", + m->mpc_apicid, m->mpc_apicaddr); if (nr_ioapics >= MAX_IO_APICS) { printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n", MAX_IO_APICS, nr_ioapics); @@ -413,13 +399,10 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) * 2 CPUs, numbered 0 & 1. */ processor.mpc_type = MP_PROCESSOR; - /* Either an integrated APIC or a discrete 82489DX. 
*/ - processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + processor.mpc_apicver = 0; processor.mpc_cpuflag = CPU_ENABLED; - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | - (boot_cpu_data.x86_model << 4) | - boot_cpu_data.x86_mask; - processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_cpufeature = 0; + processor.mpc_featureflag = 0; processor.mpc_reserved[0] = 0; processor.mpc_reserved[1] = 0; for (i = 0; i < 2; i++) { @@ -448,7 +431,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) ioapic.mpc_type = MP_IOAPIC; ioapic.mpc_apicid = 2; - ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.mpc_apicver = 0; ioapic.mpc_flags = MPC_APIC_USABLE; ioapic.mpc_apicaddr = 0xFEC00000; MP_ioapic_info(&ioapic); @@ -640,12 +623,11 @@ void __cpuinit mp_register_lapic (u8 id, u8 enabled) processor.mpc_type = MP_PROCESSOR; processor.mpc_apicid = id; - processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); + processor.mpc_apicver = 0; processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; - processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_cpufeature = 0; + processor.mpc_featureflag = 0; processor.mpc_reserved[0] = 0; processor.mpc_reserved[1] = 0; @@ -700,7 +682,7 @@ void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); mp_ioapics[idx].mpc_apicid = id; - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); + mp_ioapics[idx].mpc_apicver = 0; /* * Build basic IRQ lookup table to facilitate gsi->io_apic lookups @@ -711,9 +693,9 @@ void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) mp_ioapic_routing[idx].gsi_end = gsi_base + io_apic_get_redir_entries(idx); - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, " "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + mp_ioapics[idx].mpc_apicaddr, mp_ioapic_routing[idx].gsi_start, mp_ioapic_routing[idx].gsi_end); } -- cgit v1.2.3 From e4251e130deef9de5226cc36faa70a1c6671d3c5 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Remove some cruft in apic id checking during processor setup - Remove a define that was used only once - Remove the too large APIC ID check because we always support the full 8bit range of APICs. - Restructure code a bit to be simpler. 
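The range check removed by the second item above was a runtime guard for what is really a compile-time property: mpc_apicid is an 8-bit field and x86-64 handles the whole 8-bit APIC ID space. If the invariant ever needs stating again, a build-time assertion would be the more direct form (a hypothetical sketch, not part of this patch):

	/* mpc_apicid is a u8; the physid map must cover the full range */
	BUILD_BUG_ON(MAX_APICS < 255);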
Cc: len.brown@intel.com Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/mpparse.c | 18 +----------------- include/asm-x86_64/acpi.h | 2 -- 2 files changed, 1 insertion(+), 19 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 32b6977f80c0..02eef8f6415f 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -100,7 +100,6 @@ static void __cpuinit MP_processor_info (struct mpc_config_processor *m) disabled_cpus++; return; } - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { bootup_cpu = " (Bootup-CPU)"; boot_cpu_id = m->mpc_apicid; @@ -118,13 +117,6 @@ static void __cpuinit MP_processor_info (struct mpc_config_processor *m) cpus_complement(tmp_map, cpu_present_map); cpu = first_cpu(tmp_map); -#if MAX_APICS < 255 - if ((int)m->mpc_apicid > MAX_APICS) { - printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", - m->mpc_apicid, MAX_APICS); - return; - } -#endif physid_set(m->mpc_apicid, phys_cpu_present_map); if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { /* @@ -603,8 +595,6 @@ void __init mp_register_lapic_address(u64 address) set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); if (boot_cpu_id == -1U) boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); - - Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); } void __cpuinit mp_register_lapic (u8 id, u8 enabled) @@ -612,13 +602,7 @@ void __cpuinit mp_register_lapic (u8 id, u8 enabled) struct mpc_config_processor processor; int boot_cpu = 0; - if (id >= MAX_APICS) { - printk(KERN_WARNING "Processor #%d invalid (max %d)\n", - id, MAX_APICS); - return; - } - - if (id == boot_cpu_physical_apicid) + if (id == boot_cpu_id) boot_cpu = 1; processor.mpc_type = MP_PROCESSOR; diff --git a/include/asm-x86_64/acpi.h b/include/asm-x86_64/acpi.h index 2c95a319c056..ed59aa4c6ff9 100644 --- a/include/asm-x86_64/acpi.h +++ b/include/asm-x86_64/acpi.h @@ -155,8 +155,6 @@ extern void acpi_reserve_bootmem(void); #endif /*CONFIG_ACPI_SLEEP*/ -#define boot_cpu_physical_apicid boot_cpu_id - extern int acpi_disabled; extern int acpi_pci_disabled; -- cgit v1.2.3 From aecc63615e15de861db7436c50dade495639132c Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Fix coding style and output of the mptable parser Give the printks a consistent prefix. Add some missing white space. 
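The new "MPTABLE: " prefix is written out by hand in every printk below. A local helper macro is one common way to keep such prefixes from drifting apart again; a sketch of that alternative (the patch itself keeps plain printks):

	/* hypothetical helper, not used by the patch */
	#define mptable_printk(lvl, fmt, arg...) \
		printk(lvl "MPTABLE: " fmt, ##arg)

	mptable_printk(KERN_ERR, "checksum error!\n");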
Cc: len.brown@intel.com Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/mpparse.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 02eef8f6415f..20e88f4b564b 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -205,7 +205,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) unsigned char *mpt=((unsigned char *)mpc)+count; if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { - printk("SMP mptable: bad signature [%c%c%c%c]!\n", + printk("MPTABLE: bad signature [%c%c%c%c]!\n", mpc->mpc_signature[0], mpc->mpc_signature[1], mpc->mpc_signature[2], @@ -213,31 +213,31 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) return 0; } if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { - printk("SMP mptable: checksum error!\n"); + printk("MPTABLE: checksum error!\n"); return 0; } if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { - printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", + printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n", mpc->mpc_spec); return 0; } if (!mpc->mpc_lapic) { - printk(KERN_ERR "SMP mptable: null local APIC address!\n"); + printk(KERN_ERR "MPTABLE: null local APIC address!\n"); return 0; } memcpy(str,mpc->mpc_oem,8); - str[8]=0; - printk(KERN_INFO "OEM ID: %s ",str); + str[8] = 0; + printk(KERN_INFO "MPTABLE: OEM ID: %s ",str); memcpy(str,mpc->mpc_productid,12); - str[12]=0; - printk("Product ID: %s ",str); + str[12] = 0; + printk("MPTABLE: Product ID: %s ",str); - printk("APIC at: 0x%X\n",mpc->mpc_lapic); + printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic); /* save the local APIC address, it might be non-default */ if (!acpi_lapic) - mp_lapic_addr = mpc->mpc_lapic; + mp_lapic_addr = mpc->mpc_lapic; /* * Now process the configuration blocks. @@ -249,7 +249,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) struct mpc_config_processor *m= (struct mpc_config_processor *)mpt; if (!acpi_lapic) - MP_processor_info(m); + MP_processor_info(m); mpt += sizeof(*m); count += sizeof(*m); break; @@ -268,8 +268,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) struct mpc_config_ioapic *m= (struct mpc_config_ioapic *)mpt; MP_ioapic_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); + mpt += sizeof(*m); + count += sizeof(*m); break; } case MP_INTSRC: @@ -278,8 +278,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) (struct mpc_config_intsrc *)mpt; MP_intsrc_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); + mpt += sizeof(*m); + count += sizeof(*m); break; } case MP_LINTSRC: @@ -287,15 +287,15 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) struct mpc_config_lintsrc *m= (struct mpc_config_lintsrc *)mpt; MP_lintsrc_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); + mpt += sizeof(*m); + count += sizeof(*m); break; } } } clustered_apic_check(); if (!num_processors) - printk(KERN_ERR "SMP mptable: no processors registered!\n"); + printk(KERN_ERR "MPTABLE: no processors registered!\n"); return num_processors; } -- cgit v1.2.3 From 7a0a2dff1cac1df82acfa0395bc9bc1bf0bc16ef Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Add a missing check for irq flags tracing in NMI NMIs are not supposed to track the irq flags, but TRACE_IRQS_IRETQ did it anyways. Add a check. 
Cc: mingo@elte.hu Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/entry.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 780f9b26169f..4fcc0ad8bbeb 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -773,7 +773,9 @@ paranoid_exit\trace: testl $3,CS(%rsp) jnz paranoid_userspace\trace paranoid_swapgs\trace: + .if \trace TRACE_IRQS_IRETQ 0 + .endif swapgs paranoid_restore\trace: RESTORE_ALL 8 -- cgit v1.2.3 From b3698c03eb6d4581e879d6bb0f183ed8dda96d37 Mon Sep 17 00:00:00 2001 From: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it> Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Fix boot code head.S warning When compiling a 64-bit kernel on an Ubuntu 6.06 32bit system (whose GCC is also a cross-compiler for x86_64) I've seen that head.o is compiled as a 64-bit file (while it should not) and ld complaining about this during linking: [AK: it happens on all systems with new binutils] ld: warning: i386:x86-64 architecture of input file `arch/x86_64/boot/compressed/head.o' is incompatible with i386 output I've verified that removing -m64 from compilation flags to turn "-m64 -traditional -m32" into "-traditional -m32" fixes the issue. Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/boot/compressed/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile index f89d96f11a9f..e70fa6e1da08 100644 --- a/arch/x86_64/boot/compressed/Makefile +++ b/arch/x86_64/boot/compressed/Makefile @@ -7,7 +7,8 @@ # targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o -EXTRA_AFLAGS := -traditional -m32 +EXTRA_AFLAGS := -traditional +AFLAGS := $(subst -m64,-m32,$(AFLAGS)) # cannot use EXTRA_CFLAGS because base CFLAGS contains -mkernel which conflicts with # -m32 -- cgit v1.2.3 From 8380aabb99719af583447133f19a4d8074b5c337 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:37 +0200 Subject: [PATCH] Remove non e820 fallbacks in high level code Drop support for non e820 BIOS calls to get the memory map. The boot assembler code still has some support, but not the C code now. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/e820.c | 28 +++++++++------------------- include/asm-x86_64/e820.h | 4 ---- 2 files changed, 9 insertions(+), 23 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index e06c2714ecf3..5d1275a93658 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -536,10 +536,14 @@ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) return 0; } -void __init setup_memory_region(void) +void early_panic(char *msg) { - char *who = "BIOS-e820"; + early_printk(msg); + panic(msg); +} +void __init setup_memory_region(void) +{ /* * Try to copy the BIOS-supplied E820-map. 
* @@ -547,24 +551,10 @@ void __init setup_memory_region(void) * the next section from 1mb->appropriate_mem_k */ sanitize_e820_map(E820_MAP, &E820_MAP_NR); - if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) { - unsigned long mem_size; - - /* compare results from other methods and take the greater */ - if (ALT_MEM_K < EXT_MEM_K) { - mem_size = EXT_MEM_K; - who = "BIOS-88"; - } else { - mem_size = ALT_MEM_K; - who = "BIOS-e801"; - } - - e820.nr_map = 0; - add_memory_region(0, LOWMEMSIZE(), E820_RAM); - add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM); - } + if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) + early_panic("Cannot find a valid memory map"); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - e820_print_map(who); + e820_print_map("BIOS-e820"); } static int __init parse_memopt(char *p) diff --git a/include/asm-x86_64/e820.h b/include/asm-x86_64/e820.h index dba012151856..8dac397bf85e 100644 --- a/include/asm-x86_64/e820.h +++ b/include/asm-x86_64/e820.h @@ -22,10 +22,6 @@ #define E820_ACPI 3 /* usable as RAM once ACPI tables have been read */ #define E820_NVS 4 -#define HIGH_MEMORY (1024*1024) - -#define LOWMEMSIZE() (0x9f000) - #ifndef __ASSEMBLY__ struct e820entry { u64 addr; /* start of memory segment */ -- cgit v1.2.3 From 26374c7b7dca1ff90607c83d9b82e917119f0456 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" <ebiederm@xmission.com> Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Reload CS when startup_64 is used. In long mode the %cs is largely a relic. However there are a few cases like iret where it matters that we have a valid value. Without this patch it is possible to enter the kernel in startup_64 without setting %cs to a valid value. With this patch we don't care what %cs value we enter the kernel with, so long as the cs shadow register indicates it is a privileged code segment. Thanks to Magnus Damm for finding this problem and posting the first workable patch. I have moved the jump to set %cs down a few instructions so we don't need to take an extra jump. Which keeps the code simpler. Signed-of-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/head.S | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 4a3326ce709c..1e6f80870679 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -185,12 +185,15 @@ startup_64: /* Finally jump to run C code and to be on real kernel address * Since we are running on identity-mapped space we have to jump - * to the full 64bit address , this is only possible as indirect - * jump + * to the full 64bit address, this is only possible as indirect + * jump. In addition we need to ensure %cs is set so we make this + * a far return. */ movq initial_code(%rip),%rax - pushq $0 # fake return address - jmp *%rax + pushq $0 # fake return address to stop unwinder + pushq $__KERNEL_CS # set correct cs + pushq %rax # target address in negative space + lretq /* SMP bootup changes these two */ .align 8 -- cgit v1.2.3 From f2a9e1dec27189a09ff642f2648e49ad9e76607b Mon Sep 17 00:00:00 2001 From: Ian Campbell <Ian.Campbell@XenSource.com> Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Put .note.* sections into a PT_NOTE segment This patch updates x86_64 linker script to pack any .note.* sections into a PT_NOTE segment in the output file. To do this, we tell ld that we need a PT_NOTE segment. 
This requires us to start explicitly mapping sections to segments, so we also need to explicitly create PT_LOAD segments for text and data, and map the sections to them appropriately. Fortunately, each section will default to its previous section's segment, so it doesn't take many changes to vmlinux.lds.S. The corresponding change is already made for i386 in -mm and I'd like this patch to join it. The section to segment mappings do change as do the segment flags so some time in -mm would be good for that reason as well, just in case. In particular .data and .bss move from the text segment to the data segment and .data.cacheline_aligned .data.read_mostly are put in the data segment instead of a separate one. I think that it would be possible to exactly match the existing section to segment mapping and flags but it would be a more intrusive change and I'm not sure there is a reason for the existing layout other than it is what you get by default if you don't explicitly specify something else. If there is a reason for the existing layout then I will of course make the more intrusive change. If there is no reason we could probably drop the executable or writable flags from some segments but I don't know how much attention is paid to them anyway so it might not be worth the effort. The vsyscall related sections need to go in a different segment to the normal data segment and so I invented a "user" segment to contain them. I believe this should appear to be another data segment as far as the kernel is concerned so the flags are setup accordingly. The notes will be used in the Xen paravirt_ops backend to provide additional information to the domain builder. I am in the process of converting the xen-unstable kernels and tools over to this scheme at the moment to support this in the future. It has been suggested to me that the notes segment should have flags 0 (i.e. not readable) since it is only used by the loader and is not used at runtime. For now I went with a readable segment since that is what the i386 patch uses. AK: dropped NOTES addition right now because the needed infrastructure for that is not merged yet Signed-off-by: Ian Campbell <ian.campbell@xensource.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/vmlinux.lds.S | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 8d5a5149bb3a..2802aa963fa3 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -13,6 +13,12 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") OUTPUT_ARCH(i386:x86-64) ENTRY(phys_startup_64) jiffies_64 = jiffies; +PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ + user PT_LOAD FLAGS(7); /* RWE */ + note PT_NOTE FLAGS(4); /* R__ */ +} SECTIONS { . = __START_KERNEL; @@ -31,7 +37,7 @@ SECTIONS KPROBES_TEXT *(.fixup) *(.gnu.warning) - } = 0x9090 + } :text = 0x9090 /* out-of-line lock text */ .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) } @@ -57,7 +63,7 @@ SECTIONS .data : AT(ADDR(.data) - LOAD_OFFSET) { *(.data) CONSTRUCTORS - } + } :data _edata = .; /* End of data section */ @@ -89,7 +95,7 @@ SECTIONS #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user __vsyscall_0 = VSYSCALL_VIRT_ADDR; . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); @@ -135,7 +141,7 @@ SECTIONS . 
= ALIGN(8192); /* init_task */ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { *(.data.init_task) - } + } :data . = ALIGN(4096); .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { -- cgit v1.2.3 From 53ee11ae0d73f28029a5f0d991bc4dcd7c817e7a Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Optimize PDA accesses slightly Based on a idea by Jeremy Fitzhardinge: Replace the volatiles and memory clobbers in the PDA access with telling gcc about access to a proxy PDA structure that doesn't actually exist. But the dummy accesses give a defined ordering for read/write accesses. Also add some memory barriers to the early GS initialization to make sure no PDA access is moved before it. Advantage is some .text savings (probably most from better code for accessing "current"): text data bss dec hex filename 4845647 1223688 615864 6685199 66020f vmlinux 4837780 1223688 615864 6677332 65e354 vmlinux-pda 1.2% smaller code Cc: Jeremy Fitzhardinge <jeremy@goop.org> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/setup64.c | 3 +++ include/asm-x86_64/pda.h | 41 ++++++++++++++++++++++------------------- 2 files changed, 25 insertions(+), 19 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index e85cfbb49b63..491361752c70 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -121,7 +121,10 @@ void pda_init(int cpu) /* Setup up data that may be needed in __get_free_pages early */ asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); + /* Memory clobbers used to order PDA accessed */ + mb(); wrmsrl(MSR_GS_BASE, pda); + mb(); pda->cpunumber = cpu; pda->irqcount = -1; diff --git a/include/asm-x86_64/pda.h b/include/asm-x86_64/pda.h index b47c3df9ed1d..55e21da96e7a 100644 --- a/include/asm-x86_64/pda.h +++ b/include/asm-x86_64/pda.h @@ -36,40 +36,43 @@ extern struct x8664_pda boot_cpu_pda[]; * There is no fast way to get the base address of the PDA, all the accesses * have to mention %fs/%gs. So it needs to be done this Torvaldian way. */ -#define sizeof_field(type,field) (sizeof(((type *)0)->field)) -#define typeof_field(type,field) typeof(((type *)0)->field) - extern void __bad_pda_field(void); +/* proxy_pda doesn't actually exist, but tell gcc it is accessed + for all PDA accesses so it gets read/write dependencies right. */ +extern struct x8664_pda _proxy_pda; + #define pda_offset(field) offsetof(struct x8664_pda, field) #define pda_to_op(op,field,val) do { \ - typedef typeof_field(struct x8664_pda, field) T__; \ - switch (sizeof_field(struct x8664_pda, field)) { \ + typedef typeof(_proxy_pda.field) T__; \ + switch (sizeof(_proxy_pda.field)) { \ case 2: \ -asm volatile(op "w %0,%%gs:%P1"::"ri" ((T__)val),"i"(pda_offset(field)):"memory"); break; \ +asm(op "w %1,%%gs:%P2" : "+m" (_proxy_pda.field) : \ + "ri" ((T__)val),"i"(pda_offset(field))); break; \ case 4: \ -asm volatile(op "l %0,%%gs:%P1"::"ri" ((T__)val),"i"(pda_offset(field)):"memory"); break; \ +asm(op "l %1,%%gs:%P2" : "+m" (_proxy_pda.field) : \ + "ri" ((T__)val),"i"(pda_offset(field))); break; \ case 8: \ -asm volatile(op "q %0,%%gs:%P1"::"ri" ((T__)val),"i"(pda_offset(field)):"memory"); break; \ - default: __bad_pda_field(); \ +asm(op "q %1,%%gs:%P2": "+m" (_proxy_pda.field) : \ + "ri" ((T__)val),"i"(pda_offset(field))); break; \ +default: __bad_pda_field(); \ } \ } while (0) -/* - * AK: PDA read accesses should be neither volatile nor have an memory clobber. 
- * Unfortunately removing them causes all hell to break lose currently. - */ #define pda_from_op(op,field) ({ \ - typeof_field(struct x8664_pda, field) ret__; \ - switch (sizeof_field(struct x8664_pda, field)) { \ + typeof(_proxy_pda.field) ret__; \ + switch (sizeof(_proxy_pda.field)) { \ case 2: \ -asm volatile(op "w %%gs:%P1,%0":"=r" (ret__):"i"(pda_offset(field)):"memory"); break;\ +asm(op "w %%gs:%P1,%0":"=r" (ret__):\ + "i" (pda_offset(field)), "m" (_proxy_pda.field)); break;\ case 4: \ -asm volatile(op "l %%gs:%P1,%0":"=r" (ret__):"i"(pda_offset(field)):"memory"); break;\ +asm(op "l %%gs:%P1,%0":"=r" (ret__):\ + "i" (pda_offset(field)), "m" (_proxy_pda.field)); break;\ case 8: \ -asm volatile(op "q %%gs:%P1,%0":"=r" (ret__):"i"(pda_offset(field)):"memory"); break;\ - default: __bad_pda_field(); \ +asm(op "q %%gs:%P1,%0":"=r" (ret__):\ + "i" (pda_offset(field)), "m" (_proxy_pda.field)); break;\ +default: __bad_pda_field(); \ } \ ret__; }) -- cgit v1.2.3 From f574164491d00d28b727d713685fb5edc9138200 Mon Sep 17 00:00:00 2001 From: Keith Owens <kaos@ocs.com.au> Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Remove most of the special cases for the debug IST stack Remove most of the special cases for the debug IST stack. This is a follow on clean up patch, it requires the bug fix patch that adds orig_ist. Signed-off-by: Keith Owens <kaos@ocs.com.au> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/setup64.c | 21 +++++---------------- arch/x86_64/kernel/traps.c | 21 +-------------------- include/asm-x86_64/pda.h | 3 --- 3 files changed, 6 insertions(+), 39 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 491361752c70..9332d2361e08 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -237,28 +237,17 @@ void __cpuinit cpu_init (void) * set up and load the per-CPU TSS */ for (v = 0; v < N_EXCEPTION_STACKS; v++) { + static const unsigned int order[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER + }; if (cpu) { - static const unsigned int order[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER - }; - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); if (!estacks) panic("Cannot allocate exception stack %ld %d\n", v, cpu); } - switch (v + 1) { -#if DEBUG_STKSZ > EXCEPTION_STKSZ - case DEBUG_STACK: - cpu_pda(cpu)->debugstack = (unsigned long)estacks; - estacks += DEBUG_STKSZ; - break; -#endif - default: - estacks += EXCEPTION_STKSZ; - break; - } + estacks += PAGE_SIZE << order[v]; orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks; } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 38bc821e457b..fb8486eca1b6 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -162,26 +162,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, * 'stack' is in one of them: */ for (k = 0; k < N_EXCEPTION_STACKS; k++) { - unsigned long end; - - /* - * set 'end' to the end of the exception stack. - */ - switch (k + 1) { - /* - * TODO: this block is not needed i think, because - * setup64.c:cpu_init() sets up t->ist[DEBUG_STACK] - * properly too. 
- */ -#if DEBUG_STKSZ > EXCEPTION_STKSZ - case DEBUG_STACK: - end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ; - break; -#endif - default: - end = per_cpu(orig_ist, cpu).ist[k]; - break; - } + unsigned long end = per_cpu(orig_ist, cpu).ist[k]; /* * Is 'stack' above this exception frame's end? * If yes then skip to the next frame. diff --git a/include/asm-x86_64/pda.h b/include/asm-x86_64/pda.h index 55e21da96e7a..e2b23e337b94 100644 --- a/include/asm-x86_64/pda.h +++ b/include/asm-x86_64/pda.h @@ -13,9 +13,6 @@ struct x8664_pda { unsigned long data_offset; /* Per cpu data offset from linker address */ unsigned long kernelstack; /* top of kernel stack for current */ unsigned long oldrsp; /* user rsp for system call */ -#if DEBUG_STKSZ > EXCEPTION_STKSZ - unsigned long debugstack; /* #DB/#BP stack. */ -#endif int irqcount; /* Irq nesting counter. Starts with -1 */ int cpunumber; /* Logical CPU number */ char *irqstackptr; /* top of irqstack */ -- cgit v1.2.3 From 4bfaaef01a1badb9e8ffb0c0a37cd2379008d21f Mon Sep 17 00:00:00 2001 From: Magnus Damm <magnus@valinux.co.jp> Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Avoid overwriting the current pgd (V4, x86_64) kexec: Avoid overwriting the current pgd (V4, x86_64) This patch upgrades the x86_64-specific kexec code to avoid overwriting the current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used to start a secondary kernel that dumps the memory of the previous kernel. The code introduces a new set of page tables. These tables are used to provide an executable identity mapping without overwriting the current pgd. Signed-off-by: Magnus Damm <magnus@valinux.co.jp> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/machine_kexec.c | 71 ++++++++------- arch/x86_64/kernel/relocate_kernel.S | 171 +++++++++++++++++++++++++++++++---- include/asm-x86_64/kexec.h | 29 ++++++ 3 files changed, 218 insertions(+), 53 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c index 2e94c072d84a..0497e3bd5bff 100644 --- a/arch/x86_64/kernel/machine_kexec.c +++ b/arch/x86_64/kernel/machine_kexec.c @@ -15,6 +15,15 @@ #include <asm/mmu_context.h> #include <asm/io.h> +#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) +static u64 kexec_pgd[512] PAGE_ALIGNED; +static u64 kexec_pud0[512] PAGE_ALIGNED; +static u64 kexec_pmd0[512] PAGE_ALIGNED; +static u64 kexec_pte0[512] PAGE_ALIGNED; +static u64 kexec_pud1[512] PAGE_ALIGNED; +static u64 kexec_pmd1[512] PAGE_ALIGNED; +static u64 kexec_pte1[512] PAGE_ALIGNED; + static void init_level2_page(pmd_t *level2p, unsigned long addr) { unsigned long end_addr; @@ -144,32 +153,19 @@ static void load_segments(void) ); } -typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page, - unsigned long control_code_buffer, - unsigned long start_address, - unsigned long pgtable) ATTRIB_NORET; - -extern const unsigned char relocate_new_kernel[]; -extern const unsigned long relocate_new_kernel_size; - int machine_kexec_prepare(struct kimage *image) { - unsigned long start_pgtable, control_code_buffer; + unsigned long start_pgtable; int result; /* Calculate the offsets */ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - control_code_buffer = start_pgtable + PAGE_SIZE; /* Setup the identity mapped 64bit page table */ result = init_pgtable(image, start_pgtable); if (result) return result; - /* Place the code in the reboot code buffer */ - memcpy(__va(control_code_buffer), 
relocate_new_kernel, - relocate_new_kernel_size); - return 0; } @@ -184,28 +180,34 @@ void machine_kexec_cleanup(struct kimage *image) */ NORET_TYPE void machine_kexec(struct kimage *image) { - unsigned long page_list; - unsigned long control_code_buffer; - unsigned long start_pgtable; - relocate_new_kernel_t rnk; + unsigned long page_list[PAGES_NR]; + void *control_page; /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); - /* Calculate the offsets */ - page_list = image->head; - start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - control_code_buffer = start_pgtable + PAGE_SIZE; - - /* Set the low half of the page table to my identity mapped - * page table for kexec. Leave the high half pointing at the - * kernel pages. Don't bother to flush the global pages - * as that will happen when I fully switch to my identity mapped - * page table anyway. - */ - memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2); - __flush_tlb(); - + control_page = page_address(image->control_code_page) + PAGE_SIZE; + memcpy(control_page, relocate_kernel, PAGE_SIZE); + + page_list[PA_CONTROL_PAGE] = __pa(control_page); + page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; + page_list[PA_PGD] = __pa(kexec_pgd); + page_list[VA_PGD] = (unsigned long)kexec_pgd; + page_list[PA_PUD_0] = __pa(kexec_pud0); + page_list[VA_PUD_0] = (unsigned long)kexec_pud0; + page_list[PA_PMD_0] = __pa(kexec_pmd0); + page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; + page_list[PA_PTE_0] = __pa(kexec_pte0); + page_list[VA_PTE_0] = (unsigned long)kexec_pte0; + page_list[PA_PUD_1] = __pa(kexec_pud1); + page_list[VA_PUD_1] = (unsigned long)kexec_pud1; + page_list[PA_PMD_1] = __pa(kexec_pmd1); + page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; + page_list[PA_PTE_1] = __pa(kexec_pte1); + page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + + page_list[PA_TABLE_PAGE] = + (unsigned long)__pa(page_address(image->control_code_page)); /* The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is @@ -222,9 +224,10 @@ NORET_TYPE void machine_kexec(struct kimage *image) */ set_gdt(phys_to_virt(0),0); set_idt(phys_to_virt(0),0); + /* now call it */ - rnk = (relocate_new_kernel_t) control_code_buffer; - (*rnk)(page_list, control_code_buffer, image->start, start_pgtable); + relocate_kernel((unsigned long)image->head, (unsigned long)page_list, + image->start); } /* crashkernel=size@addr specifies the location to reserve for diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S index d24fa9b72a2b..14e95872c6a3 100644 --- a/arch/x86_64/kernel/relocate_kernel.S +++ b/arch/x86_64/kernel/relocate_kernel.S @@ -7,31 +7,169 @@ */ #include <linux/linkage.h> +#include <asm/page.h> +#include <asm/kexec.h> - /* - * Must be relocatable PIC code callable as a C function, that once - * it starts can not use the previous processes stack. 
- */ - .globl relocate_new_kernel +/* + * Must be relocatable PIC code callable as a C function + */ + +#define PTR(x) (x << 3) +#define PAGE_ALIGNED (1 << PAGE_SHIFT) +#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */ + + .text + .align PAGE_ALIGNED .code64 + .globl relocate_kernel +relocate_kernel: + /* %rdi indirection_page + * %rsi page_list + * %rdx start address + */ + + /* map the control page at its virtual address */ + + movq $0x0000ff8000000000, %r10 /* mask */ + mov $(39 - 3), %cl /* bits to shift */ + movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PGD)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PUD_0)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PUD_0)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PMD_0)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PMD_0)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PTE_0)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PTE_0)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + /* identity map the control page at its physical address */ + + movq $0x0000ff8000000000, %r10 /* mask */ + mov $(39 - 3), %cl /* bits to shift */ + movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PGD)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PUD_1)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PUD_1)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PMD_1)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PMD_1)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PTE_1)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PTE_1)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + relocate_new_kernel: - /* %rdi page_list - * %rsi reboot_code_buffer + /* %rdi indirection_page + * %rsi page_list * %rdx start address - * %rcx page_table - * %r8 arg5 - * %r9 arg6 */ /* zero out flags, and disable interrupts */ pushq $0 popfq - /* set a new stack at the bottom of our page... 
*/ - lea 4096(%rsi), %rsp + /* get physical address of control page now */ + /* this is impossible after page table switch */ + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + + /* get physical address of page table now too */ + movq PTR(PA_TABLE_PAGE)(%rsi), %rcx - /* store the parameters back on the stack */ - pushq %rdx /* store the start address */ + /* switch to new set of page tables */ + movq PTR(PA_PGD)(%rsi), %r9 + movq %r9, %cr3 + + /* setup a new stack at the end of the physical control page */ + lea 4096(%r8), %rsp + + /* jump to identity mapped page */ + addq $(identity_mapped - relocate_kernel), %r8 + pushq %r8 + ret + +identity_mapped: + /* store the start address on the stack */ + pushq %rdx /* Set cr0 to a known state: * 31 1 == Paging enabled @@ -136,8 +274,3 @@ relocate_new_kernel: xorq %r15, %r15 ret -relocate_new_kernel_end: - - .globl relocate_new_kernel_size -relocate_new_kernel_size: - .quad relocate_new_kernel_end - relocate_new_kernel diff --git a/include/asm-x86_64/kexec.h b/include/asm-x86_64/kexec.h index c564bae03433..5fab957e1091 100644 --- a/include/asm-x86_64/kexec.h +++ b/include/asm-x86_64/kexec.h @@ -1,6 +1,27 @@ #ifndef _X86_64_KEXEC_H #define _X86_64_KEXEC_H +#define PA_CONTROL_PAGE 0 +#define VA_CONTROL_PAGE 1 +#define PA_PGD 2 +#define VA_PGD 3 +#define PA_PUD_0 4 +#define VA_PUD_0 5 +#define PA_PMD_0 6 +#define VA_PMD_0 7 +#define PA_PTE_0 8 +#define VA_PTE_0 9 +#define PA_PUD_1 10 +#define VA_PUD_1 11 +#define PA_PMD_1 12 +#define VA_PMD_1 13 +#define PA_PTE_1 14 +#define VA_PTE_1 15 +#define PA_TABLE_PAGE 16 +#define PAGES_NR 17 + +#ifndef __ASSEMBLY__ + #include <linux/string.h> #include <asm/page.h> @@ -64,4 +85,12 @@ static inline void crash_setup_regs(struct pt_regs *newregs, newregs->rip = (unsigned long)current_text_addr(); } } + +NORET_TYPE void +relocate_kernel(unsigned long indirection_page, + unsigned long page_list, + unsigned long start_address) ATTRIB_NORET; + +#endif /* __ASSEMBLY__ */ + #endif /* _X86_64_KEXEC_H */ -- cgit v1.2.3 From e8c7391de4cd91b2cbb0c791f69ba1e066595848 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Don't use kernel_text_address in oops context Because it can take spinlocks. Suggested by Mathieu Desnoyers Cc: Mathieu Desnoyers <compudj@krystal.dyndns.org> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/traps.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index fb8486eca1b6..01f2a8d254c2 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -299,7 +299,9 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s #define HANDLE_STACK(cond) \ do while (cond) { \ unsigned long addr = *stack++; \ - if (kernel_text_address(addr)) { \ + if (oops_in_progress ? \ + __kernel_text_address(addr) : \ + kernel_text_address(addr)) { \ /* \ * If the address is either in the text segment of the \ * kernel, or in the region which contains vmalloc'ed \ -- cgit v1.2.3 From b62a5c740df1e3d49a97349fce0c6a23f633d7fe Mon Sep 17 00:00:00 2001 From: Arjan van de Ven <arjan@linux.intel.com> Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Add the Kconfig option for the stackprotector feature This patch adds the config options for -fstack-protector. 
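For reference, the feature works by bracketing each protected function with a canary check. Roughly what gcc emits, written out as C (__get_canary() and read_input() are placeholders for the compiler-generated load and for untrusted input; __stack_chk_fail() is the real failure hook gcc requires; on x86-64 the canary load becomes a read from %gs:40, the PDA slot added by the next patch):

	extern unsigned long __get_canary(void);	/* placeholder: reads %gs:40 */
	extern void read_input(char *buf);		/* placeholder: untrusted input */
	extern void __stack_chk_fail(void);		/* gcc's mandated failure hook */

	void vulnerable(void)
	{
		unsigned long canary = __get_canary();	/* prologue: stash canary copy */
		char buf[64];				/* the buffer being protected */

		read_input(buf);			/* a potential overflow */
		if (canary != __get_canary())		/* epilogue: copy still intact? */
			__stack_chk_fail();		/* smashed stack: panic */
	}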
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andi Kleen <ak@suse.de> CC: Andi Kleen <ak@suse.de> --- arch/x86_64/Kconfig | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'arch/x86_64') diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 927b392a4aa0..0c61d0019dd7 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -533,6 +533,30 @@ config SECCOMP If unsure, say Y. Only embedded should say N here. +config CC_STACKPROTECTOR + bool "Enable -fstack-protector buffer overflow detection (EXPRIMENTAL)" + depends on EXPERIMENTAL + help + This option turns on the -fstack-protector GCC feature. This + feature puts, at the beginning of critical functions, a canary + value on the stack just before the return address, and validates + the value just before actually returning. Stack based buffer + overflows (that need to overwrite this return address) now also + overwrite the canary, which gets detected and the attack is then + neutralized via a kernel panic. + + This feature requires gcc version 4.2 or above, or a distribution + gcc with the feature backported. Older versions are automatically + detected and for those versions, this configuration option is ignored. + +config CC_STACKPROTECTOR_ALL + bool "Use stack-protector for all functions" + depends on CC_STACKPROTECTOR + help + Normally, GCC only inserts the canary value protection for + functions that use large-ish on-stack buffers. By enabling + this option, GCC will be asked to do this for ALL functions. + source kernel/Kconfig.hz config REORDER -- cgit v1.2.3 From 0a4254058037eb172758961d0a5b94f4320a1425 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven <arjan@linux.intel.com> Date: Tue, 26 Sep 2006 10:52:38 +0200 Subject: [PATCH] Add the canary field to the PDA area and the task struct This patch adds the per thread cookie field to the task struct and the PDA. Also it makes sure that the PDA value gets the new cookie value at context switch, and that a new task gets a new cookie at task creation time. Signed-off-by: Arjan van Ven <arjan@linux.intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andi Kleen <ak@suse.de> CC: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/process.c | 8 ++++++++ include/asm-x86_64/pda.h | 7 ++++++- include/linux/sched.h | 5 +++++ kernel/fork.c | 5 +++++ 4 files changed, 24 insertions(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 9e9a70e50c72..fba8dfeda67c 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -625,6 +625,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) unlazy_fpu(prev_p); write_pda(kernelstack, task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); +#ifdef CONFIG_CC_STACKPROTECTOR + write_pda(stack_canary, next_p->stack_canary); + /* + * Build time only check to make sure the stack_canary is at + * offset 40 in the pda; this is a gcc ABI requirement + */ + BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40); +#endif /* * Now maybe reload the debug registers and handle I/O bitmaps diff --git a/include/asm-x86_64/pda.h b/include/asm-x86_64/pda.h index 6794ffaae433..e7773e0af865 100644 --- a/include/asm-x86_64/pda.h +++ b/include/asm-x86_64/pda.h @@ -16,7 +16,12 @@ struct x8664_pda { unsigned long oldrsp; /* 24 user rsp for system call */ int irqcount; /* 32 Irq nesting counter. 
Starts with -1 */ int cpunumber; /* 36 Logical CPU number */ - char *irqstackptr; /* 40 top of irqstack */ +#ifdef CONFIG_CC_STACKPROTECTOR + unsigned long stack_canary; /* 40 stack canary value */ + /* gcc-ABI: this canary MUST be at + offset 40!!! */ +#endif + char *irqstackptr; int nodenumber; /* number of current node */ unsigned int __softirq_pending; unsigned int __nmi_count; /* number of NMI on this CPUs */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 807556c5bcd2..9d4aa7f95bc8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -819,6 +819,11 @@ struct task_struct { unsigned did_exec:1; pid_t pid; pid_t tgid; + +#ifdef CONFIG_CC_STACKPROTECTOR + /* Canary value for the -fstack-protector gcc feature */ + unsigned long stack_canary; +#endif /* * pointers to (original) parent process, youngest child, younger sibling, * older sibling, respectively. (p->father can be replaced with diff --git a/kernel/fork.c b/kernel/fork.c index f9b014e3e700..a0dad84567c9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -45,6 +45,7 @@ #include <linux/cn_proc.h> #include <linux/delayacct.h> #include <linux/taskstats_kern.h> +#include <linux/random.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -175,6 +176,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->thread_info = ti; setup_thread_stack(tsk, orig); +#ifdef CONFIG_CC_STACKPROTECTOR + tsk->stack_canary = get_random_int(); +#endif + /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); atomic_set(&tsk->fs_excl, 0); -- cgit v1.2.3 From 4f7fd4d7a79193ceda4ce77f75e22917d33fa154 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven <arjan@linux.intel.com> Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] Add the -fstack-protector option to the CFLAGS Add a feature check that checks that the gcc compiler has stack-protector support and has the bugfix for PR28281 to make this work in kernel mode. The easiest solution I could find was to have a shell script in scripts/ to do the detection; if needed we can make this fancier in the future without making the makefile too complex. 
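For illustration, the probe can be run by hand: it compiles a dummy function with an on-stack buffer and echoes its second argument only when the generated code fetches the canary via %gs, which in practice requires the PR28281 fix. With a suitable gcc one would expect something like:

	$ sh scripts/gcc-x86_64-has-stack-protector.sh gcc -fstack-protector
	-fstack-protector

With an older compiler the script prints nothing, so the flag is silently left out of CFLAGS.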
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com> Signed-off-by: Andi Kleen <ak@suse.de> CC: Andi Kleen <ak@suse.de> CC: Sam Ravnborg <sam@ravnborg.org> --- arch/x86_64/Makefile | 3 +++ scripts/gcc-x86_64-has-stack-protector.sh | 6 ++++++ 2 files changed, 9 insertions(+) create mode 100644 scripts/gcc-x86_64-has-stack-protector.sh (limited to 'arch/x86_64') diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index d6472ddf5f6e..2b8d07c70106 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -58,6 +58,9 @@ cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) +cflags-$(CONFIG_CC_STACKPROTECTOR) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) -fstack-protector ) +cflags-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) -fstack-protector-all ) + CFLAGS += $(cflags-y) CFLAGS_KERNEL += $(cflags-kernel-y) AFLAGS += -m64 diff --git a/scripts/gcc-x86_64-has-stack-protector.sh b/scripts/gcc-x86_64-has-stack-protector.sh new file mode 100644 index 000000000000..325c0a1b03b6 --- /dev/null +++ b/scripts/gcc-x86_64-has-stack-protector.sh @@ -0,0 +1,6 @@ +#!/bin/sh + +echo "int foo(void) { char X[200]; return 3; }" | $1 -S -xc -c -O0 -mcmodel=kernel -fstack-protector - -o - 2> /dev/null | grep -q "%gs" +if [ "$?" -eq "0" ] ; then + echo $2 +fi -- cgit v1.2.3 From 96e540492ab54423f3693958329e095878f1f12b Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] Fix a irqcount comment in entry.S Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/entry.S | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 4fcc0ad8bbeb..ea32688386fd 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -519,7 +519,12 @@ END(stub_rt_sigreturn) testl $3,CS(%rdi) je 1f swapgs -1: incl %gs:pda_irqcount # RED-PEN should check preempt count + /* irqcount is used to check if a CPU is already on an interrupt + stack or not. While this is essentially redundant with preempt_count + it is a little cheaper to use a separate counter in the PDA + (short of moving irq_enter into assembly, which would be too + much work) */ +1: incl %gs:pda_irqcount cmoveq %gs:pda_irqstackptr,%rsp push %rbp # backlink for old unwinder /* -- cgit v1.2.3 From 7b0bda74f7e77f362eaeee837e7911238acf4c76 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] Fix a PDA warning uncovered by the new type checking Fix linux/arch/x86_64/kernel/process.c: In function __switch_to: linux/arch/x86_64/kernel/process.c:626: warning: assignment makes integer from pointer without a cast Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index fba8dfeda67c..885c318f76ab 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -624,7 +624,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) And the AMD workaround requires it to be after DS reload. 
*/ unlazy_fpu(prev_p); write_pda(kernelstack, - task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); + (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); #ifdef CONFIG_CC_STACKPROTECTOR write_pda(stack_canary, next_p->stack_canary); /* -- cgit v1.2.3 From 54dbc0c9ebefb38840c6b07fa6eabaeb96c921f5 Mon Sep 17 00:00:00 2001 From: "adurbin@google.com" <adurbin@google.com> Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] insert IOAPIC(s) and Local APIC into resource map This patch places the IOAPIC(s) and the Local APIC specified by ACPI tables into the resource map. The APICs will then be visible within /proc/iomem Signed-off-by: Aaron Durbin <adurbin@google.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/apic.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 6472e321cad7..135ff25e6b44 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -25,6 +25,7 @@ #include <linux/kernel_stat.h> #include <linux/sysdev.h> #include <linux/module.h> +#include <linux/ioport.h> #include <asm/atomic.h> #include <asm/smp.h> @@ -45,6 +46,11 @@ int apic_calibrate_pmtmr __initdata; int disable_apic_timer __initdata; +static struct resource lapic_resource = { + .name = "Local APIC", + .flags = IORESOURCE_MEM | IORESOURCE_BUSY, +}; + /* * cpu_mask that denotes the CPUs that needs timer interrupt coming in as * IPIs in place of local APIC timers @@ -585,6 +591,40 @@ static int __init detect_init_APIC (void) return 0; } +#ifdef CONFIG_X86_IO_APIC +static struct resource * __init ioapic_setup_resources(void) +{ +#define IOAPIC_RESOURCE_NAME_SIZE 11 + unsigned long n; + struct resource *res; + char *mem; + int i; + + if (nr_ioapics <= 0) + return NULL; + + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); + n *= nr_ioapics; + + res = alloc_bootmem(n); + + if (!res) + return NULL; + + memset(res, 0, n); + mem = (void *)&res[nr_ioapics]; + + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; + } + + return res; +} +#endif + void __init init_apic_mappings(void) { unsigned long apic_phys; @@ -604,6 +644,11 @@ void __init init_apic_mappings(void) apic_mapped = 1; apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys); + /* Put local APIC into the resource map. */ + lapic_resource.start = apic_phys; + lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; + insert_resource(&iomem_resource, &lapic_resource); + /* * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). 
@@ -613,7 +658,9 @@ void __init init_apic_mappings(void)
 {
 	unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
 	int i;
+	struct resource *ioapic_res;
 
+	ioapic_res = ioapic_setup_resources();
 	for (i = 0; i < nr_ioapics; i++) {
 		if (smp_found_config) {
 			ioapic_phys = mp_ioapics[i].mpc_apicaddr;
@@ -625,6 +672,13 @@ void __init init_apic_mappings(void)
 		apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n",
 				__fix_to_virt(idx), ioapic_phys);
 		idx++;
+
+		if (ioapic_res) {
+			ioapic_res->start = ioapic_phys;
+			ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
+			insert_resource(&iomem_resource, ioapic_res);
+			ioapic_res++;
+		}
 	}
 }
--
cgit v1.2.3


From 3022d734a54cbd2b65eea9a024564821101b4a9a Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:39 +0200
Subject: [PATCH] Fix zeroing on exception in copy_*_user

- Don't zero for __copy_from_user_inatomic following i386.
  This will prevent spurious zeros for parallel file system writers
  when one takes an exception
- The string instruction version didn't zero the output on
  exception. Oops.

Also I cleaned up the code a bit while I was at it and added a minor
optimization to the string instruction path.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/x8664_ksyms.c |   1 +
 arch/x86_64/lib/copy_user.S      | 124 +++++++++++++++++++++++++--------------
 include/asm-x86_64/uaccess.h     |   6 +-
 3 files changed, 83 insertions(+), 48 deletions(-)
(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index 370952c4ff22..c3454af5e3a2 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -29,6 +29,7 @@ EXPORT_SYMBOL(__put_user_8);
 EXPORT_SYMBOL(copy_user_generic);
 EXPORT_SYMBOL(copy_from_user);
 EXPORT_SYMBOL(copy_to_user);
+EXPORT_SYMBOL(__copy_from_user_inatomic);
 EXPORT_SYMBOL(copy_page);
 EXPORT_SYMBOL(clear_page);
diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S
index 962f3a693c5e..70bebd310408 100644
--- a/arch/x86_64/lib/copy_user.S
+++ b/arch/x86_64/lib/copy_user.S
@@ -9,10 +9,29 @@
 
 #define FIX_ALIGNMENT 1
 
-	#include <asm/current.h>
-	#include <asm/asm-offsets.h>
-	#include <asm/thread_info.h>
-	#include <asm/cpufeature.h>
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeature.h>
+
+	.macro ALTERNATIVE_JUMP feature,orig,alt
+0:
+	.byte 0xe9	/* 32bit jump */
+	.long \orig-1f	/* by default jump to orig */
+1:
+	.section .altinstr_replacement,"ax"
+2:	.byte 0xe9			/* near jump with 32bit immediate */
+	.long \alt-1b /* offset */	/* or alternatively to alt */
+	.previous
+	.section .altinstructions,"a"
+	.align 8
+	.quad  0b
+	.quad  2b
+	.byte  \feature			/* when feature is set */
+	.byte  5
+	.byte  5
+	.previous
+	.endm
 
 /* Standard copy_to_user with segment limit checking */
ENTRY(copy_to_user)
@@ -23,25 +42,21 @@ ENTRY(copy_to_user)
 	jc  bad_to_user
 	cmpq threadinfo_addr_limit(%rax),%rcx
 	jae bad_to_user
-2:
-	.byte 0xe9	/* 32bit jump */
-	.long .Lcug-1f
-1:
+	xorl %eax,%eax	/* clear zero flag */
+	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
 	CFI_ENDPROC
-ENDPROC(copy_to_user)
 
-	.section .altinstr_replacement,"ax"
-3:	.byte 0xe9			/* replacement jmp with 32 bit immediate */
-	.long copy_user_generic_c-1b	/* offset */
-	.previous
-	.section .altinstructions,"a"
-	.align 8
-	.quad  2b
-	.quad  3b
-	.byte  X86_FEATURE_REP_GOOD
-	.byte  5
-	.byte  5
-	.previous
+ENTRY(copy_user_generic)
+	CFI_STARTPROC
+	movl $1,%ecx	/* set zero flag */
+
ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + CFI_ENDPROC + +ENTRY(__copy_from_user_inatomic) + CFI_STARTPROC + xorl %ecx,%ecx /* clear zero flag */ + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + CFI_ENDPROC /* Standard copy_from_user with segment limit checking */ ENTRY(copy_from_user) @@ -52,7 +67,8 @@ ENTRY(copy_from_user) jc bad_from_user cmpq threadinfo_addr_limit(%rax),%rcx jae bad_from_user - /* FALL THROUGH to copy_user_generic */ + movl $1,%ecx /* set zero flag */ + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string CFI_ENDPROC ENDPROC(copy_from_user) @@ -73,37 +89,26 @@ END(bad_from_user) /* - * copy_user_generic - memory copy with exception handling. + * copy_user_generic_unrolled - memory copy with exception handling. + * This version is for CPUs like P4 that don't have efficient micro code for rep movsq * * Input: * rdi destination * rsi source * rdx count + * ecx zero flag -- if true zero destination on error * * Output: * eax uncopied bytes or 0 if successful. */ -ENTRY(copy_user_generic) +ENTRY(copy_user_generic_unrolled) CFI_STARTPROC - .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */ - .byte 0x66,0x90 -1: - .section .altinstr_replacement,"ax" -2: .byte 0xe9 /* near jump with 32bit immediate */ - .long copy_user_generic_c-1b /* offset */ - .previous - .section .altinstructions,"a" - .align 8 - .quad copy_user_generic - .quad 2b - .byte X86_FEATURE_REP_GOOD - .byte 5 - .byte 5 - .previous -.Lcug: pushq %rbx CFI_ADJUST_CFA_OFFSET 8 CFI_REL_OFFSET rbx, 0 + pushq %rcx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rcx, 0 xorl %eax,%eax /*zero for the exception handler */ #ifdef FIX_ALIGNMENT @@ -179,6 +184,9 @@ ENTRY(copy_user_generic) CFI_REMEMBER_STATE .Lende: + popq %rcx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rcx popq %rbx CFI_ADJUST_CFA_OFFSET -8 CFI_RESTORE rbx @@ -265,6 +273,8 @@ ENTRY(copy_user_generic) addl %ecx,%edx /* edx: bytes to zero, rdi: dest, eax:zero */ .Lzero_rest: + cmpl $0,(%rsp) + jz .Le_zero movq %rdx,%rcx .Le_byte: xorl %eax,%eax @@ -286,6 +296,7 @@ ENDPROC(copy_user_generic) /* rdi destination * rsi source * rdx count + * ecx zero flag * * Output: * eax uncopied bytes or 0 if successfull. @@ -296,25 +307,48 @@ ENDPROC(copy_user_generic) * And more would be dangerous because both Intel and AMD have * errata with rep movsq > 4GB. If someone feels the need to fix * this please consider this. - */ -copy_user_generic_c: + */ +ENTRY(copy_user_generic_string) CFI_STARTPROC + movl %ecx,%r8d /* save zero flag */ movl %edx,%ecx shrl $3,%ecx andl $7,%edx + jz 10f 1: rep movsq movl %edx,%ecx 2: rep movsb -4: movl %ecx,%eax +9: movl %ecx,%eax ret -3: lea (%rdx,%rcx,8),%rax + + /* multiple of 8 byte */ +10: rep + movsq + xor %eax,%eax ret + + /* exception handling */ +3: lea (%rdx,%rcx,8),%rax /* exception on quad loop */ + jmp 6f +5: movl %ecx,%eax /* exception on byte loop */ + /* eax: left over bytes */ +6: testl %r8d,%r8d /* zero flag set? 
*/ + jz 7f + movl %eax,%ecx /* initialize x86 loop counter */ + push %rax + xorl %eax,%eax +8: rep + stosb /* zero the rest */ +11: pop %rax +7: ret CFI_ENDPROC END(copy_user_generic_c) .section __ex_table,"a" .quad 1b,3b - .quad 2b,4b + .quad 2b,5b + .quad 8b,11b + .quad 10b,3b .previous diff --git a/include/asm-x86_64/uaccess.h b/include/asm-x86_64/uaccess.h index 1e1fa003daa3..bc6812009bd7 100644 --- a/include/asm-x86_64/uaccess.h +++ b/include/asm-x86_64/uaccess.h @@ -238,6 +238,7 @@ do { \ /* Handles exceptions in both to and from, but doesn't do access_ok */ extern unsigned long copy_user_generic(void *to, const void *from, unsigned len); +extern unsigned long copy_user_generic_dontzero(void *to, const void *from, unsigned len); extern unsigned long copy_to_user(void __user *to, const void *from, unsigned len); extern unsigned long copy_from_user(void *to, const void __user *from, unsigned len); @@ -303,7 +304,6 @@ static __always_inline int __copy_to_user(void __user *dst, const void *src, uns } } - static __always_inline int __copy_in_user(void __user *dst, const void __user *src, unsigned size) { int ret = 0; @@ -352,7 +352,7 @@ long strlen_user(const char __user *str); unsigned long clear_user(void __user *mem, unsigned long len); unsigned long __clear_user(void __user *mem, unsigned long len); -#define __copy_to_user_inatomic __copy_to_user -#define __copy_from_user_inatomic __copy_from_user +extern long __copy_from_user_inatomic(void *dst, const void __user *src, unsigned size); +#define __copy_to_user_inatomic copy_user_generic #endif /* __X86_64_UACCESS_H */ -- cgit v1.2.3 From 95912008ba1fb9d0677c1ce5930aeb0e85ba5710 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:39 +0200 Subject: [PATCH] Add __must_check to copy_*_user Following i386. And also fix the two occurrences that caused warnings in arch/x86_64/* Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/ia32/ptrace32.c | 6 ++++-- include/asm-x86_64/i387.h | 4 ++-- include/asm-x86_64/uaccess.h | 46 ++++++++++++++++++++++++++------------------ 3 files changed, 33 insertions(+), 23 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c index 72bf92a9d375..d18198ed636b 100644 --- a/arch/x86_64/ia32/ptrace32.c +++ b/arch/x86_64/ia32/ptrace32.c @@ -375,8 +375,10 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) ret = -EIO; if (!access_ok(VERIFY_READ, u, sizeof(*u))) break; - /* no checking to be bug-to-bug compatible with i386 */ - __copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u)); + /* no checking to be bug-to-bug compatible with i386. 
*/ + /* but silence warning */ + if (__copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u))) + ; set_stopped_child_used_math(child); child->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; ret = 0; diff --git a/include/asm-x86_64/i387.h b/include/asm-x86_64/i387.h index 60c0f4853fdb..0217b74cc9fc 100644 --- a/include/asm-x86_64/i387.h +++ b/include/asm-x86_64/i387.h @@ -137,8 +137,8 @@ static inline int save_i387_checking(struct i387_fxsave_struct __user *fx) #else : [fx] "cdaSDb" (fx), "0" (0)); #endif - if (unlikely(err)) - __clear_user(fx, sizeof(struct i387_fxsave_struct)); + if (unlikely(err) && __clear_user(fx, sizeof(struct i387_fxsave_struct))) + err = -EFAULT; /* No need to clear here because the caller clears USED_MATH */ return err; } diff --git a/include/asm-x86_64/uaccess.h b/include/asm-x86_64/uaccess.h index bc6812009bd7..802a4a068ef6 100644 --- a/include/asm-x86_64/uaccess.h +++ b/include/asm-x86_64/uaccess.h @@ -237,14 +237,18 @@ do { \ */ /* Handles exceptions in both to and from, but doesn't do access_ok */ -extern unsigned long copy_user_generic(void *to, const void *from, unsigned len); -extern unsigned long copy_user_generic_dontzero(void *to, const void *from, unsigned len); - -extern unsigned long copy_to_user(void __user *to, const void *from, unsigned len); -extern unsigned long copy_from_user(void *to, const void __user *from, unsigned len); -extern unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len); - -static __always_inline int __copy_from_user(void *dst, const void __user *src, unsigned size) +__must_check unsigned long +copy_user_generic(void *to, const void *from, unsigned len); + +__must_check unsigned long +copy_to_user(void __user *to, const void *from, unsigned len); +__must_check unsigned long +copy_from_user(void *to, const void __user *from, unsigned len); +__must_check unsigned long +copy_in_user(void __user *to, const void __user *from, unsigned len); + +static __always_inline __must_check +int __copy_from_user(void *dst, const void __user *src, unsigned size) { int ret = 0; if (!__builtin_constant_p(size)) @@ -273,7 +277,8 @@ static __always_inline int __copy_from_user(void *dst, const void __user *src, u } } -static __always_inline int __copy_to_user(void __user *dst, const void *src, unsigned size) +static __always_inline __must_check +int __copy_to_user(void __user *dst, const void *src, unsigned size) { int ret = 0; if (!__builtin_constant_p(size)) @@ -304,7 +309,8 @@ static __always_inline int __copy_to_user(void __user *dst, const void *src, uns } } -static __always_inline int __copy_in_user(void __user *dst, const void __user *src, unsigned size) +static __always_inline __must_check +int __copy_in_user(void __user *dst, const void __user *src, unsigned size) { int ret = 0; if (!__builtin_constant_p(size)) @@ -344,15 +350,17 @@ static __always_inline int __copy_in_user(void __user *dst, const void __user *s } } -long strncpy_from_user(char *dst, const char __user *src, long count); -long __strncpy_from_user(char *dst, const char __user *src, long count); -long strnlen_user(const char __user *str, long n); -long __strnlen_user(const char __user *str, long n); -long strlen_user(const char __user *str); -unsigned long clear_user(void __user *mem, unsigned long len); -unsigned long __clear_user(void __user *mem, unsigned long len); - -extern long __copy_from_user_inatomic(void *dst, const void __user *src, unsigned size); +__must_check long +strncpy_from_user(char *dst, const char __user *src, long count); 
+__must_check long +__strncpy_from_user(char *dst, const char __user *src, long count); +__must_check long strnlen_user(const char __user *str, long n); +__must_check long __strnlen_user(const char __user *str, long n); +__must_check long strlen_user(const char __user *str); +__must_check unsigned long clear_user(void __user *mem, unsigned long len); +__must_check unsigned long __clear_user(void __user *mem, unsigned long len); + +__must_check long __copy_from_user_inatomic(void *dst, const void __user *src, unsigned size); #define __copy_to_user_inatomic copy_user_generic #endif /* __X86_64_UACCESS_H */ -- cgit v1.2.3 From 26c13f2b5bbb03f798f8907db20296347e6c7ca6 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] Check return values of __copy_to_user in uname emulation Quietens some new warnings Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/ia32/sys_ia32.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c index 33cd0a4fe388..b0e82c7947dc 100644 --- a/arch/x86_64/ia32/sys_ia32.c +++ b/arch/x86_64/ia32/sys_ia32.c @@ -780,7 +780,7 @@ asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len, asmlinkage long sys32_olduname(struct oldold_utsname __user * name) { - int error; + int err; if (!name) return -EFAULT; @@ -789,27 +789,31 @@ asmlinkage long sys32_olduname(struct oldold_utsname __user * name) down_read(&uts_sem); - error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN); - __put_user(0,name->sysname+__OLD_UTS_LEN); - __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN); - __put_user(0,name->nodename+__OLD_UTS_LEN); - __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN); - __put_user(0,name->release+__OLD_UTS_LEN); - __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN); - __put_user(0,name->version+__OLD_UTS_LEN); + err = __copy_to_user(&name->sysname,&system_utsname.sysname, + __OLD_UTS_LEN); + err |= __put_user(0,name->sysname+__OLD_UTS_LEN); + err |= __copy_to_user(&name->nodename,&system_utsname.nodename, + __OLD_UTS_LEN); + err |= __put_user(0,name->nodename+__OLD_UTS_LEN); + err |= __copy_to_user(&name->release,&system_utsname.release, + __OLD_UTS_LEN); + err |= __put_user(0,name->release+__OLD_UTS_LEN); + err |= __copy_to_user(&name->version,&system_utsname.version, + __OLD_UTS_LEN); + err |= __put_user(0,name->version+__OLD_UTS_LEN); { char *arch = "x86_64"; if (personality(current->personality) == PER_LINUX32) arch = "i686"; - __copy_to_user(&name->machine,arch,strlen(arch)+1); + err |= __copy_to_user(&name->machine,arch,strlen(arch)+1); } up_read(&uts_sem); - error = error ? -EFAULT : 0; + err = err ? -EFAULT : 0; - return error; + return err; } long sys32_uname(struct old_utsname __user * name) -- cgit v1.2.3 From b38337a624c4d3c2c3d9cdf27d952ca94181c6a8 Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] Mark per cpu data initialization __initdata again Before 2.6.16 this was changed to work around code that accessed CPUs not in the possible map. But that code should be all fixed now, so mark it __initdata again. 
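Why moving .data.percpu inside the init region is safe follows from how per-CPU data is brought up: the section is only a prototype that each CPU copies at boot, so once the copies exist the original is never touched again and can be discarded with the rest of init memory. A toy userspace model of that lifetime (an assumption for illustration, not the kernel's setup code):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int prototype[4] = { 1, 2, 3, 4 };	/* plays the role of .data.percpu */

int main(void)
{
	enum { NCPUS = 2 };
	int *percpu[NCPUS];
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++) {	/* boot: one private copy per CPU */
		percpu[cpu] = malloc(sizeof(prototype));
		memcpy(percpu[cpu], prototype, sizeof(prototype));
	}
	memset(prototype, 0, sizeof(prototype));	/* "freeing" the init area */
	printf("cpu1 still sees %d\n", percpu[1][2]);	/* per-CPU copies live on */
	for (cpu = 0; cpu < NCPUS; cpu++)
		free(percpu[cpu]);
	return 0;
}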
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/vmlinux.lds.S | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)
(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 2802aa963fa3..d0564f1bcb0b 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -216,14 +216,12 @@ SECTIONS
   __initramfs_start = .;
   .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
   __initramfs_end = .;
-	/* temporary here to work around NR_CPUS. If you see this comment in 2.6.17+
-	   complain */
-  . = ALIGN(4096);
-  __init_end = .;
-	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
   __per_cpu_start = .;
   .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
   __per_cpu_end = .;
+  . = ALIGN(4096);
+  __init_end = .;
   . = ALIGN(4096);
   __nosave_begin = .;
--
cgit v1.2.3


From 1c9c0a6ca35e9325cea811d734d6ab7352be086b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 26 Sep 2006 10:52:40 +0200
Subject: [PATCH] Remove experimental mark of kexec

kexec has been marked experimental for a year now and all of the
serious problems have been worked through. So it is time (if not past
time) to remove the experimental mark.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/Kconfig | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
(limited to 'arch/x86_64')

diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 0c61d0019dd7..1f4212605ef8 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -468,8 +468,7 @@ config X86_MCE_AMD
 	  the DRAM Error Threshold.
 
 config KEXEC
-	bool "kexec system call (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	bool "kexec system call"
 	help
 	  kexec is a system call that implements the ability to shutdown your
 	  current kernel, and to start another kernel.  It is like a reboot
--
cgit v1.2.3


From a15da49debaf7f09460a886b0ecd08588410715e Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:40 +0200
Subject: [PATCH] Fix idle notifiers

Previously exit_idle would be called more often than enter_idle.

Now, instead of using complicated tests, just keep track of it using
the per-CPU variable as a flip-flop. I moved the idle state into the
PDA to make the access more efficient.

Original bug report and an initial patch from Stephane Eranian, but
redone by AK.
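The invariant the patch enforces is easy to model in plain C: __exit_idle only fires if an enter_idle preceded it, so a second wakeup in the same idle period becomes a no-op. A hedged userspace sketch (the names mirror the patch; everything else is illustrative):

#include <stdio.h>

static int isidle;	/* stands in for the PDA field set via write_pda(isidle, ...) */

static void enter_idle(void)
{
	isidle = 1;
	puts("IDLE_START notifier");
}

static void __exit_idle(void)
{
	if (isidle == 0)	/* already left idle: suppress duplicate IDLE_END */
		return;
	isidle = 0;
	puts("IDLE_END notifier");
}

int main(void)
{
	enter_idle();
	__exit_idle();	/* first interrupt ends idle: notifier runs */
	__exit_idle();	/* second wakeup in the same period: flip-flop blocks it */
	return 0;
}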
Cc: Stephane Eranian <eranian@hpl.hp.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/process.c | 15 +++++++++------
 include/asm-x86_64/pda.h     |  3 ++-
 2 files changed, 11 insertions(+), 7 deletions(-)
(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 885c318f76ab..458006ae19f3 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -80,25 +80,25 @@ void idle_notifier_unregister(struct notifier_block *n)
 }
 EXPORT_SYMBOL(idle_notifier_unregister);
 
-enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
-static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
-
 void enter_idle(void)
 {
-	__get_cpu_var(idle_state) = CPU_IDLE;
+	write_pda(isidle, 1);
 	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
 }
 
 static void __exit_idle(void)
 {
-	__get_cpu_var(idle_state) = CPU_NOT_IDLE;
+	if (read_pda(isidle) == 0)
+		return;
+	write_pda(isidle, 0);
 	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
 }
 
 /* Called from interrupts to signify idle end */
 void exit_idle(void)
 {
-	if (current->pid | read_pda(irqcount))
+	/* idle loop has pid 0 */
+	if (current->pid)
 		return;
 	__exit_idle();
 }
@@ -220,6 +220,9 @@ void cpu_idle (void)
 			play_dead();
 			enter_idle();
 			idle();
+			/* In many cases the interrupt that ended idle
+			   has already called exit_idle. But some idle
+			   loops can be woken up without interrupt. */
 			__exit_idle();
 		}
 
diff --git a/include/asm-x86_64/pda.h b/include/asm-x86_64/pda.h
index 531f48a6c3af..14996d962bac 100644
--- a/include/asm-x86_64/pda.h
+++ b/include/asm-x86_64/pda.h
@@ -25,7 +25,8 @@ struct x8664_pda {
 	int nodenumber;		    /* number of current node */
 	unsigned int __softirq_pending;
 	unsigned int __nmi_count;	/* number of NMI on this CPUs */
-	int mmu_state;
+	short mmu_state;
+	short isidle;
 	struct mm_struct *active_mm;
 	unsigned apic_timer_irqs;
 } ____cacheline_aligned_in_smp;
--
cgit v1.2.3


From 5e544d618f0fb21011f36f28d5e3952b9dc109d2 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:40 +0200
Subject: [PATCH] i386/x86-64: PCI: split probing and initialization of type 1
 config space access

First probe if type1/2 accesses work, but then only initialize them at
the end. This is useful for a later patch that needs this information
in between.
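The calling convention this sets up, probe first, hand the result to every consumer, commit last, can be sketched in a few lines of illustrative C; the function bodies are stubs, only the flow mirrors the patch:

#include <stdio.h>

static int pci_direct_probe(void)     { return 1; }	/* pretend type 1 works */
static void pci_mmcfg_init(int type)  { printf("mmconfig: probe said type %d\n", type); }
static void pci_direct_init(int type) { printf("PCI: Using configuration type %d\n", type); }

int main(void)
{
	int type = pci_direct_probe();	/* 1. find out what the chipset supports */
	pci_mmcfg_init(type);		/* 2. consumers may use the result... */
	if (type)
		pci_direct_init(type);	/* 3. ...before direct access is committed */
	return 0;
}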
Signed-off-by: Andi Kleen <ak@suse.de> --- arch/i386/pci/direct.c | 25 ++++++++++++++++--------- arch/i386/pci/init.c | 9 +++++++-- arch/i386/pci/mmconfig.c | 2 +- arch/i386/pci/pci.h | 6 ++++-- arch/x86_64/pci/mmconfig.c | 2 +- 5 files changed, 29 insertions(+), 15 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/i386/pci/direct.c b/arch/i386/pci/direct.c index 5d81fb510375..5acf0b4743cf 100644 --- a/arch/i386/pci/direct.c +++ b/arch/i386/pci/direct.c @@ -254,7 +254,16 @@ static int __init pci_check_type2(void) return works; } -void __init pci_direct_init(void) +void __init pci_direct_init(int type) +{ + printk(KERN_INFO "PCI: Using configuration type %d\n", type); + if (type == 1) + raw_pci_ops = &pci_direct_conf1; + else + raw_pci_ops = &pci_direct_conf2; +} + +int __init pci_direct_probe(void) { struct resource *region, *region2; @@ -264,19 +273,16 @@ void __init pci_direct_init(void) if (!region) goto type2; - if (pci_check_type1()) { - printk(KERN_INFO "PCI: Using configuration type 1\n"); - raw_pci_ops = &pci_direct_conf1; - return; - } + if (pci_check_type1()) + return 1; release_resource(region); type2: if ((pci_probe & PCI_PROBE_CONF2) == 0) - return; + return 0; region = request_region(0xCF8, 4, "PCI conf2"); if (!region) - return; + return 0; region2 = request_region(0xC000, 0x1000, "PCI conf2"); if (!region2) goto fail2; @@ -284,10 +290,11 @@ void __init pci_direct_init(void) if (pci_check_type2()) { printk(KERN_INFO "PCI: Using configuration type 2\n"); raw_pci_ops = &pci_direct_conf2; - return; + return 2; } release_resource(region2); fail2: release_resource(region); + return 0; } diff --git a/arch/i386/pci/init.c b/arch/i386/pci/init.c index 51087a9d9172..d028e1b05c36 100644 --- a/arch/i386/pci/init.c +++ b/arch/i386/pci/init.c @@ -6,8 +6,13 @@ in the right sequence from here. */ static __init int pci_access_init(void) { + int type = 0; + +#ifdef CONFIG_PCI_DIRECT + type = pci_direct_probe(); +#endif #ifdef CONFIG_PCI_MMCONFIG - pci_mmcfg_init(); + pci_mmcfg_init(type); #endif if (raw_pci_ops) return 0; @@ -21,7 +26,7 @@ static __init int pci_access_init(void) * fails. 
*/ #ifdef CONFIG_PCI_DIRECT - pci_direct_init(); + pci_direct_init(type); #endif return 0; } diff --git a/arch/i386/pci/mmconfig.c b/arch/i386/pci/mmconfig.c index 972180f738d9..44155c5e85d1 100644 --- a/arch/i386/pci/mmconfig.c +++ b/arch/i386/pci/mmconfig.c @@ -187,7 +187,7 @@ static __init void unreachable_devices(void) } } -void __init pci_mmcfg_init(void) +void __init pci_mmcfg_init(int type) { if ((pci_probe & PCI_PROBE_MMCONF) == 0) return; diff --git a/arch/i386/pci/pci.h b/arch/i386/pci/pci.h index bf4e79335388..8a7cf1f23684 100644 --- a/arch/i386/pci/pci.h +++ b/arch/i386/pci/pci.h @@ -81,7 +81,9 @@ extern int pci_conf1_write(unsigned int seg, unsigned int bus, extern int pci_conf1_read(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 *value); -extern void pci_direct_init(void); +extern int pci_direct_probe(void); +extern void pci_direct_init(int type); extern void pci_pcbios_init(void); -extern void pci_mmcfg_init(void); +extern void pci_mmcfg_init(int type); extern void pcibios_sort(void); + diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index dd78f4d18df1..d6b90d08476b 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -163,7 +163,7 @@ static __init void unreachable_devices(void) } } -void __init pci_mmcfg_init(void) +void __init pci_mmcfg_init(int type) { int i; -- cgit v1.2.3 From 9abd79280bbb9f56049f0168f412c3538cadb6eb Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@suse.de> Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] i386/x86-64: Only do MCFG e820 check when type 1 works Needs earlier patch to split type 1 probing from use. This patch should fix the x86 macs where type 1 PCI config space access doesn't work, but MCFG does. They also don't have a usable e820 table so the e820 sanity check failed. Instead assume now that if type 1 doesn't work then MCFG must work and don't do the e820 check. Signed-off-by: Andi Kleen <ak@suse.de> --- arch/i386/pci/mmconfig.c | 4 +++- arch/x86_64/pci/mmconfig.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86_64') diff --git a/arch/i386/pci/mmconfig.c b/arch/i386/pci/mmconfig.c index 44155c5e85d1..2c4f585510c9 100644 --- a/arch/i386/pci/mmconfig.c +++ b/arch/i386/pci/mmconfig.c @@ -198,7 +198,9 @@ void __init pci_mmcfg_init(int type) (pci_mmcfg_config[0].base_address == 0)) return; - if (!e820_all_mapped(pci_mmcfg_config[0].base_address, + /* Only do this check when type 1 works. If it doesn't work + assume we run on a Mac and always use MCFG */ + if (type == 1 && !e820_all_mapped(pci_mmcfg_config[0].base_address, pci_mmcfg_config[0].base_address + MMCONFIG_APER_MIN, E820_RESERVED)) { printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %x is not E820-reserved\n", diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index d6b90d08476b..8bea51030d66 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -176,7 +176,9 @@ void __init pci_mmcfg_init(int type) (pci_mmcfg_config[0].base_address == 0)) return; - if (!e820_all_mapped(pci_mmcfg_config[0].base_address, + /* Only do this check when type 1 works. 
If it doesn't work + assume we run on a Mac and always use MCFG */ + if (type == 1 && !e820_all_mapped(pci_mmcfg_config[0].base_address, pci_mmcfg_config[0].base_address + MMCONFIG_APER_MIN, E820_RESERVED)) { printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %x is not E820-reserved\n", -- cgit v1.2.3 From 56dd669a138c40ea6cdae487f233430d12372767 Mon Sep 17 00:00:00 2001 From: Aaron Durbin <adurbin@google.com> Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] Insert GART region into resource map Patch inserts the GART region into the iomem resource map. The GART will then be visible within /proc/iomem. It will also allow for other users utilizing the GART to subreserve the region (agp or IOMMU). Signed-off-by: Aaron Durbin <adurbin@google.com> Signed-off-by: Andi Kleen <ak@suse.de> --- arch/x86_64/kernel/aperture.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86_64') diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index 04dbf1662523..f851e1f9c82a 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -17,6 +17,7 @@ #include <linux/pci_ids.h> #include <linux/pci.h> #include <linux/bitops.h> +#include <linux/ioport.h> #include <asm/e820.h> #include <asm/io.h> #include <asm/proto.h> @@ -33,6 +34,18 @@ int fallback_aper_force __initdata = 0; int fix_aperture __initdata = 1; +static struct resource gart_resource = { + .name = "GART", + .flags = IORESOURCE_MEM, +}; + +static void __init insert_aperture_resource(u32 aper_base, u32 aper_size) +{ + gart_resource.start = aper_base; + gart_resource.end = aper_base + aper_size - 1; + insert_resource(&iomem_resource, &gart_resource); +} + /* This code runs before the PCI subsystem is initialized, so just access the northbridge directly. */ @@ -62,6 +75,7 @@ static u32 __init allocate_aperture(void) } printk("Mapping aperture over %d KB of RAM @ %lx\n", aper_size >> 10, __pa(p)); + insert_aperture_resource((u32)__pa(p), aper_size); return (u32)__pa(p); } @@ -233,8 +247,13 @@ void __init iommu_hole_init(void) last_aper_base = aper_base; } - if (!fix && !fallback_aper_force) + if (!fix && !fallback_aper_force) { + if (last_aper_base) { + unsigned long n = (32 * 1024 * 1024) << last_aper_order; + insert_aperture_resource((u32)last_aper_base, n); + } return; + } if (!fallback_aper_force) aper_alloc = search_agp_bridge(&aper_order, &valid_agp); -- cgit v1.2.3 From 4c6e052adfe285ede5884e4e8c4d33af33932c13 Mon Sep 17 00:00:00 2001 From: Aaron Durbin <adurbin@google.com> Date: Tue, 26 Sep 2006 10:52:40 +0200 Subject: [PATCH] MMCONFIG and new Intel motherboards On Sat, Sep 09, 2006 at 04:14:29PM +0200, Andi Kleen wrote: > [patch] Looks reasonable, but probably not for 2.6.18 because this stuff > is already too fragile and it is probably too risky to do any big changes now > since not enough testing time is left. Can you please resubmit > it with proper description and signed-off-by line? I can queue it for .19 then > > -Andi Patch inserts PCI memory mapped config region(s) into the resource map. This will allow for the MMCCONFIG regions to be marked as busy in the iomem address space as well as the regions(s) showing up in /proc/iomem. 
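The visible effect of this patch, together with the APIC and MMCONFIG resource patches above, is a handful of extra lines in /proc/iomem. A small reader like the hedged sketch below will pick them out (plain userspace C; assumes a kernel carrying these patches):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/iomem", "r");	/* the kernel's iomem resource map */

	if (!f) {
		perror("/proc/iomem");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (strstr(line, "GART") || strstr(line, "Local APIC") ||
		    strstr(line, "IOAPIC") || strstr(line, "PCI MMCONFIG"))
			fputs(line, stdout);	/* e.g. "fee00000-fee00fff : Local APIC" */
	fclose(f);
	return 0;
}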
Signed-off-by: Aaron Durbin <adurbin@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/pci/mmconfig.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
(limited to 'arch/x86_64')

diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c
index 8bea51030d66..8c0722b95cdc 100644
--- a/arch/x86_64/pci/mmconfig.c
+++ b/arch/x86_64/pci/mmconfig.c
@@ -163,6 +163,37 @@ static __init void unreachable_devices(void)
 	}
 }
 
+static __init void pci_mmcfg_insert_resources(void)
+{
+#define PCI_MMCFG_RESOURCE_NAME_LEN 19
+	int i;
+	struct resource *res;
+	char *names;
+	unsigned num_buses;
+
+	res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res),
+			pci_mmcfg_config_num, GFP_KERNEL);
+
+	if (!res) {
+		printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n");
+		return;
+	}
+
+	names = (void *)&res[pci_mmcfg_config_num];
+	for (i = 0; i < pci_mmcfg_config_num; i++, res++) {
+		num_buses = pci_mmcfg_config[i].end_bus_number -
+			    pci_mmcfg_config[i].start_bus_number + 1;
+		res->name = names;
+		snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u",
+			 pci_mmcfg_config[i].pci_segment_group_number);
+		res->start = pci_mmcfg_config[i].base_address;
+		res->end = res->start + (num_buses << 20) - 1;
+		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+		insert_resource(&iomem_resource, res);
+		names += PCI_MMCFG_RESOURCE_NAME_LEN;
+	}
+}
+
 void __init pci_mmcfg_init(int type)
 {
 	int i;
@@ -206,6 +237,7 @@ void __init pci_mmcfg_init(int type)
 	}
 
 	unreachable_devices();
+	pci_mmcfg_insert_resources();
 
 	raw_pci_ops = &pci_mmcfg;
 	pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
--
cgit v1.2.3


From 35d534a3ff4905c0a7bddbad0fc9df55967742c7 Mon Sep 17 00:00:00 2001
From: Matthew Garrett <mjg59@srcf.ucam.org>
Date: Tue, 26 Sep 2006 10:52:41 +0200
Subject: [PATCH] x86: restore i8259A eoi status on resume

i8259A_resume calls init_8259A(0) unconditionally, even if auto_eoi has
been set. Keep track of the current status and restore that on resume.
This fixes it for AMD64 and i386.
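The bug and the fix reduce to a one-variable state save, modeled here in userspace C (illustrative only; the variable name comes from the patch):

#include <stdio.h>

static int i8259A_auto_eoi;	/* the remembered PIC mode */

static void init_8259A(int auto_eoi)
{
	i8259A_auto_eoi = auto_eoi;	/* record what the PIC was programmed with */
	printf("PIC programmed, auto_eoi=%d\n", auto_eoi);
}

static void i8259A_resume(void)
{
	/* before the fix this was init_8259A(0), silently dropping auto-EOI mode */
	init_8259A(i8259A_auto_eoi);
}

int main(void)
{
	init_8259A(1);		/* boot chose auto-EOI */
	i8259A_resume();	/* resume now restores the same mode */
	return 0;
}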
Signed-off-by: Matthew Garrett <mjg59@srcf.ucam.org>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/i8259.c   | 6 +++++-
 arch/x86_64/kernel/i8259.c | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)
(limited to 'arch/x86_64')

diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index d4756d154f47..ea5f4e7958d8 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -45,6 +45,8 @@ static void end_8259A_irq (unsigned int irq)
 
 #define shutdown_8259A_irq	disable_8259A_irq
 
+static int i8259A_auto_eoi;
+
 static void mask_and_ack_8259A(unsigned int);
 
 unsigned int startup_8259A_irq(unsigned int irq)
@@ -253,7 +255,7 @@ static void save_ELCR(char *trigger)
 
 static int i8259A_resume(struct sys_device *dev)
 {
-	init_8259A(0);
+	init_8259A(i8259A_auto_eoi);
 	restore_ELCR(irq_trigger);
 	return 0;
 }
@@ -301,6 +303,8 @@ void init_8259A(int auto_eoi)
 {
 	unsigned long flags;
 
+	i8259A_auto_eoi = auto_eoi;
+
 	spin_lock_irqsave(&i8259A_lock, flags);
 
 	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index d2f5acde5749..2dd51f364ea2 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -123,6 +123,8 @@ void (*interrupt[NR_IRQS])(void) = {
 
 DEFINE_SPINLOCK(i8259A_lock);
 
+static int i8259A_auto_eoi;
+
 static void end_8259A_irq (unsigned int irq)
 {
 	if (irq > 256) {
@@ -336,6 +338,8 @@ void init_8259A(int auto_eoi)
 {
 	unsigned long flags;
 
+	i8259A_auto_eoi = auto_eoi;
+
 	spin_lock_irqsave(&i8259A_lock, flags);
 
 	outb(0xff, 0x21);	/* mask all of 8259A-1 */
@@ -394,7 +398,7 @@ static void save_ELCR(char *trigger)
 
 static int i8259A_resume(struct sys_device *dev)
 {
-	init_8259A(0);
+	init_8259A(i8259A_auto_eoi);
 	restore_ELCR(irq_trigger);
 	return 0;
 }
--
cgit v1.2.3


From 27fbe5b28a2ffef171c6005f304ea4f80fcdcc01 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:41 +0200
Subject: [PATCH] Use string instructions for Core2 copy/clear

It is faster than using an unrolled loop for the use cases the kernel
cares about (cached, sizes typically < 4K).

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/setup.c | 2 ++
 1 file changed, 2 insertions(+)
(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index e328e3eb8cbe..085f2e09b9dc 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -843,6 +843,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
 	    (c->x86 == 0x6 && c->x86_model >= 0x0e))
 		set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
+	if (c->x86 == 6)
+		set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
 	set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
 	c->x86_max_cores = intel_num_cpu_cores(c);
--
cgit v1.2.3


From 9ddab42d1e65dfbdb14588346d47fb142090ab40 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:41 +0200
Subject: [PATCH] Remove outdated comment in x86-64 mmconfig code

Cc: gregkh@suse.de
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/pci/mmconfig.c | 1 -
 1 file changed, 1 deletion(-)
(limited to 'arch/x86_64')

diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c
index 8c0722b95cdc..7732f4254d21 100644
--- a/arch/x86_64/pci/mmconfig.c
+++ b/arch/x86_64/pci/mmconfig.c
@@ -218,7 +218,6 @@ void __init pci_mmcfg_init(int type)
 		return;
 	}
 
-	/* RED-PEN i386 doesn't do _nocache right now */
 	pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num,
				 GFP_KERNEL);
 	if (pci_mmcfg_virt == NULL) {
 		printk("PCI: Can not allocate memory for mmconfig structures\n");
--
cgit v1.2.3


From 2049336f60d297c85ac977245b9326ec00396114 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:41 +0200
Subject: [PATCH] Don't synchronize time reading on single core AMD systems

We do some additional CPU synchronization in gettimeofday et al. to
make sure the time stamps are always monotonic over multiple CPUs. But
on single core systems that is not needed. So don't do it.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/setup.c | 6 ++++++
 1 file changed, 6 insertions(+)
(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 085f2e09b9dc..26524ce3b239 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -728,6 +728,12 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 
 	/* Fix cpuid4 emulation for more */
 	num_cache_leaves = 3;
+
+	/* When there is only one core no need to synchronize RDTSC */
+	if (num_possible_cpus() == 1)
+		set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
+	else
+		clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
 }
 
 static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
--
cgit v1.2.3


From dd54a11004b2c9a1f136225f880e021a43b0eadc Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:41 +0200
Subject: [PATCH] Remove all traces of signal number conversion

This was old code that was needed for iBCS, which x86-64 never
supported.

Pointed out by Albert Cahalan

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/ia32/ia32_signal.c | 10 +---------
 arch/x86_64/kernel/signal.c    |  5 -----
 2 files changed, 1 insertion(+), 14 deletions(-)
(limited to 'arch/x86_64')

diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c
index 549de439fb2d..2d20f8304b31 100644
--- a/arch/x86_64/ia32/ia32_signal.c
+++ b/arch/x86_64/ia32/ia32_signal.c
@@ -431,15 +431,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
 	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
 		goto give_sigsegv;
 
-	{
-		struct exec_domain *ed = current_thread_info()->exec_domain;
-		err |= __put_user((ed
-			   && ed->signal_invmap
-			   && sig < 32
-			   ? ed->signal_invmap[sig]
-			   : sig),
-			  &frame->sig);
-	}
+	err |= __put_user(sig, &frame->sig);
 	if (err)
 		goto give_sigsegv;
 
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index 7f58bc9a056d..49ec324cd141 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -277,11 +277,6 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 #endif
 
 	/* Set up registers for signal handler */
-	{
-		struct exec_domain *ed = current_thread_info()->exec_domain;
-		if (unlikely(ed && ed->signal_invmap && sig < 32))
-			sig = ed->signal_invmap[sig];
-	}
 	regs->rdi = sig;
 	/* In case the signal handler was declared without prototypes */
 	regs->rax = 0;
--
cgit v1.2.3


From 536e3ee4fed13d2d4bbf1b775174aba0cadf6aba Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:41 +0200
Subject: [PATCH] Initialize argument registers for 32bit signal handlers.

In case the user space was compiled with -mregparm=3.

Following i386.

Pointed out by Albert Cahalan
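Why seed exactly rax, rdx and rcx: code built with -mregparm=3 expects its first three integer arguments in eax/edx/ecx rather than on the stack. A hedged 32-bit userspace illustration (build with gcc -m32; the regparm attribute only has effect on 32-bit x86, and the names are made up):

#include <stdio.h>

/* regparm(3): sig arrives in %eax, the next two args in %edx and %ecx,
   which is exactly the trio the kernel now initializes before the jump */
static void __attribute__((regparm(3))) handler(int sig, int a, int b)
{
	printf("sig=%d a=%d b=%d\n", sig, a, b);
}

int main(void)
{
	handler(11, 0, 0);	/* mirrors regs->rax = sig, rdx = rcx = 0 */
	return 0;
}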
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/ia32/ia32_signal.c | 5 +++++
 1 file changed, 5 insertions(+)
(limited to 'arch/x86_64')

diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c
index 2d20f8304b31..ced15012a3d7 100644
--- a/arch/x86_64/ia32/ia32_signal.c
+++ b/arch/x86_64/ia32/ia32_signal.c
@@ -478,6 +478,11 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
 	regs->rsp = (unsigned long) frame;
 	regs->rip = (unsigned long) ka->sa.sa_handler;
 
+	/* Make -mregparm=3 work */
+	regs->rax = sig;
+	regs->rdx = 0;
+	regs->rcx = 0;
+
 	asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
 	asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
 
--
cgit v1.2.3


From ab2e0b46cb9a197fab7d98e147cac7cd41a14047 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:41 +0200
Subject: [PATCH] Fix some broken white space in ia32_signal.c

No functional changes

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/ia32/ia32_signal.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)
(limited to 'arch/x86_64')

diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c
index ced15012a3d7..a6ba9951e86c 100644
--- a/arch/x86_64/ia32/ia32_signal.c
+++ b/arch/x86_64/ia32/ia32_signal.c
@@ -490,9 +490,9 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
 	regs->ss = __USER32_DS;
 
 	set_fs(USER_DS);
-    regs->eflags &= ~TF_MASK;
-    if (test_thread_flag(TIF_SINGLESTEP))
-        ptrace_notify(SIGTRAP);
+	regs->eflags &= ~TF_MASK;
+	if (test_thread_flag(TIF_SINGLESTEP))
+		ptrace_notify(SIGTRAP);
 
 #if DEBUG_SIG
 	printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
@@ -587,8 +587,8 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 
 	set_fs(USER_DS);
 	regs->eflags &= ~TF_MASK;
-    if (test_thread_flag(TIF_SINGLESTEP))
-        ptrace_notify(SIGTRAP);
+	if (test_thread_flag(TIF_SINGLESTEP))
+		ptrace_notify(SIGTRAP);
 
 #if DEBUG_SIG
 	printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
--
cgit v1.2.3


From adf1423698f00d00b267f7dca8231340ce7d65ef Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Tue, 26 Sep 2006 10:52:41 +0200
Subject: [PATCH] i386/x86-64: Work around gcc bug with noreturn functions in
 unwinder

Current gcc generates calls, not jumps, to noreturn functions. When
that happens the return address can point to the next function, which
confuses the unwinder.

This patch works around it by marking asynchronous exception frames, in
contrast to normal call frames, in the unwind information, and then
teaches the unwinder to decode this. For normal call frames the
unwinder now subtracts one from the address, which avoids this problem.
The standard libgcc unwinder uses the same trick.

It doesn't include adjustment of the printed address (i.e. for the
original example it'd still be kernel_math_error+0 that gets
displayed), but the unwinder wouldn't get confused anymore.

This only works with binutils 2.17+ and some versions of H.J. Lu's
2.16, unfortunately, because earlier binutils don't support
.cfi_signal_frame.

[AK: added automatic detection of the new binutils and wrote description]

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/Makefile           |  4 ++++
 arch/i386/kernel/entry.S     |  4 ++++
 arch/x86_64/Makefile         |  4 ++++
 arch/x86_64/ia32/ia32entry.S |  4 ++++
 arch/x86_64/kernel/entry.S   |  4 ++++
 include/asm-i386/dwarf2.h    |  7 +++++++
 include/asm-i386/unwind.h    |  5 +++++
 include/asm-x86_64/dwarf2.h  |  6 ++++++
 include/asm-x86_64/unwind.h  |  5 +++++
 kernel/unwind.c              | 35 ++++++++++++++++++++++++++++-------
 scripts/Kbuild.include       |  4 ++--
 11 files changed, 73 insertions(+), 9 deletions(-)
(limited to 'arch/x86_64')

diff --git a/arch/i386/Makefile b/arch/i386/Makefile
index 508cdbeb3a09..7cc0b189b82b 100644
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -50,6 +50,10 @@ CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then echo $(call cc-op
 cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
 AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
 
+# is .cfi_signal_frame supported too?
+cflags-y += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,)
+AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,)
+
 CFLAGS += $(cflags-y)
 
 # Default subarch .c files
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 284f2e908ad0..5a63d6fdb70e 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -183,18 +183,21 @@ VM_MASK		= 0x00020000
 
 #define RING0_INT_FRAME \
 	CFI_STARTPROC simple;\
+	CFI_SIGNAL_FRAME;\
 	CFI_DEF_CFA esp, 3*4;\
 	/*CFI_OFFSET cs, -2*4;*/\
 	CFI_OFFSET eip, -3*4
 
 #define RING0_EC_FRAME \
 	CFI_STARTPROC simple;\
+	CFI_SIGNAL_FRAME;\
 	CFI_DEF_CFA esp, 4*4;\
 	/*CFI_OFFSET cs, -2*4;*/\
 	CFI_OFFSET eip, -3*4
 
 #define RING0_PTREGS_FRAME \
 	CFI_STARTPROC simple;\
+	CFI_SIGNAL_FRAME;\
 	CFI_DEF_CFA esp, OLDESP-EBX;\
 	/*CFI_OFFSET cs, CS-OLDESP;*/\
 	CFI_OFFSET eip, EIP-OLDESP;\
@@ -275,6 +278,7 @@ need_resched:
 # sysenter call handler stub
ENTRY(sysenter_entry)
 	CFI_STARTPROC simple
+	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA esp, 0
 	CFI_REGISTER esp, ebp
 	movl TSS_sysenter_esp0(%esp),%esp
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
index 2b8d07c70106..1c0f18d4f887 100644
--- a/arch/x86_64/Makefile
+++ b/arch/x86_64/Makefile
@@ -58,6 +58,10 @@ cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
 cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
 AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
 
+# is .cfi_signal_frame supported too?
+cflags-y += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,) +AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,) + cflags-$(CONFIG_CC_STACKPROTECTOR) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) -fstack-protector ) cflags-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) -fstack-protector-all ) diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 32fd32bea07c..b4aa875e175b 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -71,6 +71,7 @@ */ ENTRY(ia32_sysenter_target) CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,0 CFI_REGISTER rsp,rbp swapgs @@ -186,6 +187,7 @@ ENDPROC(ia32_sysenter_target) */ ENTRY(ia32_cstar_target) CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,PDA_STACKOFFSET CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ @@ -293,6 +295,7 @@ ia32_badarg: ENTRY(ia32_syscall) CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8-RIP /*CFI_REL_OFFSET ss,SS-RIP*/ CFI_REL_OFFSET rsp,RSP-RIP @@ -370,6 +373,7 @@ ENTRY(ia32_ptregs_common) popq %r11 CFI_ENDPROC CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8-ARGOFFSET CFI_REL_OFFSET rax,RAX-ARGOFFSET CFI_REL_OFFSET rcx,RCX-ARGOFFSET diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index ea32688386fd..4cbc65290ae7 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -123,6 +123,7 @@ .macro CFI_DEFAULT_STACK start=1 .if \start CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8 .else CFI_DEF_CFA_OFFSET SS+8 @@ -207,6 +208,7 @@ END(ret_from_fork) ENTRY(system_call) CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,PDA_STACKOFFSET CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ @@ -324,6 +326,7 @@ END(system_call) */ ENTRY(int_ret_from_sys_call) CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8-ARGOFFSET /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/ CFI_REL_OFFSET rsp,RSP-ARGOFFSET @@ -484,6 +487,7 @@ END(stub_rt_sigreturn) */ .macro _frame ref CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8-\ref /*CFI_REL_OFFSET ss,SS-\ref*/ CFI_REL_OFFSET rsp,RSP-\ref diff --git a/include/asm-i386/dwarf2.h b/include/asm-i386/dwarf2.h index 5d1a8db5a9b0..6d66398a307d 100644 --- a/include/asm-i386/dwarf2.h +++ b/include/asm-i386/dwarf2.h @@ -28,6 +28,12 @@ #define CFI_RESTORE_STATE .cfi_restore_state #define CFI_UNDEFINED .cfi_undefined +#ifdef CONFIG_AS_CFI_SIGNAL_FRAME +#define CFI_SIGNAL_FRAME .cfi_signal_frame +#else +#define CFI_SIGNAL_FRAME +#endif + #else /* Due to the structure of pre-exisiting code, don't use assembler line @@ -48,6 +54,7 @@ #define CFI_REMEMBER_STATE ignore #define CFI_RESTORE_STATE ignore #define CFI_UNDEFINED ignore +#define CFI_SIGNAL_FRAME ignore #endif diff --git a/include/asm-i386/unwind.h b/include/asm-i386/unwind.h index f0ac399bae3c..5031d693b89d 100644 --- a/include/asm-i386/unwind.h +++ b/include/asm-i386/unwind.h @@ -18,6 +18,7 @@ struct unwind_frame_info { struct pt_regs regs; struct task_struct *task; + unsigned call_frame:1; }; #define UNW_PC(frame) (frame)->regs.eip @@ -44,6 +45,10 @@ struct unwind_frame_info PTREGS_INFO(edi), \ PTREGS_INFO(eip) +#define UNW_DEFAULT_RA(raItem, dataAlign) \ + ((raItem).where == Memory && \ + !((raItem).value * (dataAlign) + 4)) + static inline void arch_unw_init_frame_info(struct unwind_frame_info *info, 
/*const*/ struct pt_regs *regs) { diff --git a/include/asm-x86_64/dwarf2.h b/include/asm-x86_64/dwarf2.h index 2b9368365fad..eedc08526b0b 100644 --- a/include/asm-x86_64/dwarf2.h +++ b/include/asm-x86_64/dwarf2.h @@ -28,6 +28,11 @@ #define CFI_REMEMBER_STATE .cfi_remember_state #define CFI_RESTORE_STATE .cfi_restore_state #define CFI_UNDEFINED .cfi_undefined +#ifdef CONFIG_AS_CFI_SIGNAL_FRAME +#define CFI_SIGNAL_FRAME .cfi_signal_frame +#else +#define CFI_SIGNAL_FRAME +#endif #else @@ -45,6 +50,7 @@ #define CFI_REMEMBER_STATE # #define CFI_RESTORE_STATE # #define CFI_UNDEFINED # +#define CFI_SIGNAL_FRAME # #endif diff --git a/include/asm-x86_64/unwind.h b/include/asm-x86_64/unwind.h index 1f6e9bfb569e..b8fa5cb7ff88 100644 --- a/include/asm-x86_64/unwind.h +++ b/include/asm-x86_64/unwind.h @@ -18,6 +18,7 @@ struct unwind_frame_info { struct pt_regs regs; struct task_struct *task; + unsigned call_frame:1; }; #define UNW_PC(frame) (frame)->regs.rip @@ -57,6 +58,10 @@ struct unwind_frame_info PTREGS_INFO(r15), \ PTREGS_INFO(rip) +#define UNW_DEFAULT_RA(raItem, dataAlign) \ + ((raItem).where == Memory && \ + !((raItem).value * (dataAlign) + 8)) + static inline void arch_unw_init_frame_info(struct unwind_frame_info *info, /*const*/ struct pt_regs *regs) { diff --git a/kernel/unwind.c b/kernel/unwind.c index f69c804c8e62..3430475fcd88 100644 --- a/kernel/unwind.c +++ b/kernel/unwind.c @@ -603,6 +603,7 @@ int unwind(struct unwind_frame_info *frame) #define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) const u32 *fde = NULL, *cie = NULL; const u8 *ptr = NULL, *end = NULL; + unsigned long pc = UNW_PC(frame) - frame->call_frame; unsigned long startLoc = 0, endLoc = 0, cfa; unsigned i; signed ptrType = -1; @@ -612,7 +613,7 @@ int unwind(struct unwind_frame_info *frame) if (UNW_PC(frame) == 0) return -EINVAL; - if ((table = find_table(UNW_PC(frame))) != NULL + if ((table = find_table(pc)) != NULL && !(table->size & (sizeof(*fde) - 1))) { unsigned long tableSize = table->size; @@ -647,7 +648,7 @@ int unwind(struct unwind_frame_info *frame) ptrType & DW_EH_PE_indirect ? 
				       ptrType : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed));
-			if (UNW_PC(frame) >= startLoc && UNW_PC(frame) < endLoc)
+			if (pc >= startLoc && pc < endLoc)
 				break;
 			cie = NULL;
 		}
@@ -657,16 +658,28 @@
 		state.cieEnd = ptr;	/* keep here temporarily */
 		ptr = (const u8 *)(cie + 2);
 		end = (const u8 *)(cie + 1) + *cie;
+		frame->call_frame = 1;
 		if ((state.version = *ptr) != 1)
 			cie = NULL;	/* unsupported version */
 		else if (*++ptr) {
 			/* check if augmentation size is first (and thus present) */
 			if (*ptr == 'z') {
-				/* check for ignorable (or already handled)
-				 * nul-terminated augmentation string */
-				while (++ptr < end && *ptr)
-					if (strchr("LPR", *ptr) == NULL)
+				while (++ptr < end && *ptr) {
+					switch(*ptr) {
+					/* check for ignorable (or already handled)
+					 * nul-terminated augmentation string */
+					case 'L':
+					case 'P':
+					case 'R':
+						continue;
+					case 'S':
+						frame->call_frame = 0;
+						continue;
+					default:
 						break;
+					}
+					break;
+				}
 			}
 			if (ptr >= end || *ptr)
 				cie = NULL;
@@ -755,7 +768,7 @@ int unwind(struct unwind_frame_info *frame)
 	state.org = startLoc;
 	memcpy(&state.cfa, &badCFA, sizeof(state.cfa));
 	/* process instructions */
-	if (!processCFI(ptr, end, UNW_PC(frame), ptrType, &state)
+	if (!processCFI(ptr, end, pc, ptrType, &state)
 	    || state.loc > endLoc
 	    || state.regs[retAddrReg].where == Nowhere
 	    || state.cfa.reg >= ARRAY_SIZE(reg_info)
 	    || state.cfa.offs % sizeof(unsigned long))
 		return -EIO;
 	/* update frame */
+#ifndef CONFIG_AS_CFI_SIGNAL_FRAME
+	if(frame->call_frame
+	   && !UNW_DEFAULT_RA(state.regs[retAddrReg], state.dataAlign))
+		frame->call_frame = 0;
+#endif
 	cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs;
 	startLoc = min((unsigned long)UNW_SP(frame), cfa);
 	endLoc = max((unsigned long)UNW_SP(frame), cfa);
@@ -866,6 +884,7 @@ int unwind_init_frame_info(struct unwind_frame_info *info,
                            /*const*/ struct pt_regs *regs)
 {
 	info->task = tsk;
+	info->call_frame = 0;
 	arch_unw_init_frame_info(info, regs);
 
 	return 0;
@@ -879,6 +898,7 @@ int unwind_init_blocked(struct unwind_frame_info *info,
                         struct task_struct *tsk)
 {
 	info->task = tsk;
+	info->call_frame = 0;
 	arch_unw_init_blocked(info);
 
 	return 0;
@@ -894,6 +914,7 @@ int unwind_init_running(struct unwind_frame_info *info,
                         void *arg)
 {
 	info->task = current;
+	info->call_frame = 0;
 
 	return arch_unwind_init_running(info, callback, arg);
 }
diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include
index 7adef12a0c26..4f5ff19b992b 100644
--- a/scripts/Kbuild.include
+++ b/scripts/Kbuild.include
@@ -66,8 +66,8 @@ as-option = $(shell if $(CC) $(CFLAGS) $(1) -Wa,-Z -c -o /dev/null \
 
 # as-instr
 # Usage: cflags-y += $(call as-instr, instr, option1, option2)
-as-instr = $(shell if echo -e "$(1)" | $(AS) -Z -o astest$$$$.out \
-		2>&1 >/dev/null ; then echo "$(2)"; else echo "$(3)"; fi; \
+as-instr = $(shell if echo -e "$(1)" | $(AS) >/dev/null 2>&1 -W -Z -o astest$$$$.out ; \
+	then echo "$(2)"; else echo "$(3)"; fi; \
 	rm -f astest$$$$.out)
 
 # cc-option
--
cgit v1.2.3


From 658fdbef66e5e9be79b457edc2cbbb3add840aa9 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:41 +0200
Subject: [PATCH] Don't leak NT bit into next task

SYSENTER can cause the NT flag to be set, which might cause crashes on
the IRET in the next task.

Following similar i386 patch from Linus.
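For reference, NT is bit 14 of EFLAGS, and the pushf/popf pair added to SAVE_CONTEXT/RESTORE_CONTEXT is the save/restore idiom shown below. This is a userspace illustration for x86 with gcc inline assembly, not kernel code:

#include <stdio.h>

int main(void)
{
	unsigned long flags;

	/* what raw_local_save_flags() / SAVE_CONTEXT's pushf boil down to */
	asm volatile("pushf ; pop %0" : "=r" (flags));
	printf("eflags=%#lx NT=%lu\n", flags, (flags >> 14) & 1);

	/* and RESTORE_CONTEXT's popf: put the saved flags back */
	asm volatile("push %0 ; popf" : : "r" (flags) : "cc", "memory");
	return 0;
}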
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/kernel/entry.S   | 4 ++++
 arch/x86_64/kernel/setup64.c | 4 ++++
 include/asm-x86_64/system.h  | 5 +++--
 3 files changed, 11 insertions(+), 2 deletions(-)
(limited to 'arch/x86_64')

diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 4cbc65290ae7..2802524104f3 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -155,6 +155,10 @@
 /* rdi:	prev */
ENTRY(ret_from_fork)
 	CFI_DEFAULT_STACK
+	push kernel_eflags(%rip)
+	CFI_ADJUST_CFA_OFFSET 4
+	popf			# reset kernel eflags
+	CFI_ADJUST_CFA_OFFSET -4
 	call schedule_tail
 	GET_THREAD_INFO(%rcx)
 	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 9332d2361e08..8c4b80fe71a1 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -180,6 +180,8 @@ void __cpuinit check_efer(void)
         }
 }
 
+unsigned long kernel_eflags;
+
 /*
  * cpu_init() initializes state that is per-CPU. Some data is already
  * initialized (naturally) in the bootstrap process, such as the GDT
@@ -281,4 +283,6 @@ void __cpuinit cpu_init (void)
 	set_debugreg(0UL, 7);
 
 	fpu_init();
+
+	raw_local_save_flags(kernel_eflags);
 }
diff --git a/include/asm-x86_64/system.h b/include/asm-x86_64/system.h
index 6bf170bceae1..bd376bc8c4ab 100644
--- a/include/asm-x86_64/system.h
+++ b/include/asm-x86_64/system.h
@@ -14,12 +14,13 @@
 #define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
 
 /* frame pointer must be last for get_wchan */
-#define SAVE_CONTEXT    "pushq %%rbp ; movq %%rsi,%%rbp\n\t"
-#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\n\t"
+#define SAVE_CONTEXT    "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
+#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
 
 #define __EXTRA_CLOBBER  \
 	,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
 
+/* Save restore flags to clear handle leaking NT */
 #define switch_to(prev,next,last) \
 	asm volatile(SAVE_CONTEXT						    \
 		     "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */	  \
--
cgit v1.2.3


From f157cbb1eb9ce3f33a401ec6d20eb3eb852351a3 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:41 +0200
Subject: [PATCH] i386/x86-64: Make all early PCI scans dependent on CONFIG_PCI

This is useful on systems with a broken PCI bus. Affects various scans
in x86-64 and i386's early ACPI quirk scan.
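An aside on the idiom: this patch guards the call sites and object files with CONFIG_PCI conditionals. The other common kernel pattern is to provide a no-op stub so call sites stay clean; the toy program below sketches that alternative (illustrative names, not from the patch):

#include <stdio.h>

#define CONFIG_PCI 1	/* toggle to model CONFIG_PCI=y / n */

#if CONFIG_PCI
static void early_quirks(void) { puts("scanning for chipset quirks"); }
#else
static inline void early_quirks(void) { }	/* stub: compiles away */
#endif

int main(void)
{
	early_quirks();	/* call site needs no #ifdef of its own */
	return 0;
}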
Cc: gregkh@suse.de
Cc: len.brown@intel.com
Cc: Trammell Hudson <hudson@osresearch.net>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/acpi/Makefile | 2 ++
 arch/i386/kernel/setup.c       | 2 ++
 arch/x86_64/Kconfig            | 3 ++-
 arch/x86_64/kernel/Makefile    | 3 ++-
 arch/x86_64/kernel/setup.c     | 2 ++
 5 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/i386/kernel/acpi/Makefile b/arch/i386/kernel/acpi/Makefile
index 7e9ac99354f4..7f7be01f44e6 100644
--- a/arch/i386/kernel/acpi/Makefile
+++ b/arch/i386/kernel/acpi/Makefile
@@ -1,5 +1,7 @@
 obj-$(CONFIG_ACPI)		+= boot.o
+ifneq ($(CONFIG_PCI),)
 obj-$(CONFIG_X86_IO_APIC)	+= earlyquirk.o
+endif
 obj-$(CONFIG_ACPI_SLEEP)	+= sleep.o wakeup.o
 
 ifneq ($(CONFIG_ACPI_PROCESSOR),)
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index ea17567dbe72..7a99b1369fa2 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -1437,9 +1437,11 @@ void __init setup_arch(char **cmdline_p)
 	acpi_boot_table_init();
 #endif
 
+#ifdef CONFIG_PCI
 #ifdef CONFIG_X86_IO_APIC
 	check_acpi_pci();	/* Checks more than just ACPI actually */
 #endif
+#endif
 
 #ifdef CONFIG_ACPI
 	acpi_boot_init();
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 1f4212605ef8..c2c68b902347 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -105,6 +105,7 @@ config X86_PC
 
 config X86_VSMP
 	bool "Support for ScaleMP vSMP"
+	depends on PCI
 	help
 	  Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is
 	  supposed to run on these EM64T-based machines. Only choose this option
@@ -291,7 +292,7 @@ config NUMA
 
 config K8_NUMA
 	bool "Old style AMD Opteron NUMA detection"
-	depends on NUMA
+	depends on NUMA && PCI
 	default y
 	help
 	  Enable K8 NUMA node topology detection. You should say Y here if
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 000e67e8f028..2466fbd035ee 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y	:= process.o signal.o entry.o traps.o irq.o \
 		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
 		x8664_ksyms.o i387.o syscall.o vsyscall.o \
 		setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \
-		pci-dma.o pci-nommu.o alternative.o early-quirks.o
+		pci-dma.o pci-nommu.o alternative.o
 
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-$(CONFIG_X86_MCE)		+= mce.o
@@ -39,6 +39,7 @@ obj-$(CONFIG_K8_NB)		+= k8.o
 obj-$(CONFIG_AUDIT)		+= audit.o
 
 obj-$(CONFIG_MODULES)		+= module.o
+obj-$(CONFIG_PCI)		+= early-quirks.o
 
 obj-y				+= topology.o
 obj-y				+= intel_cacheinfo.o
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 26524ce3b239..3d8309b1236e 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -493,7 +493,9 @@ void __init setup_arch(char **cmdline_p)
 
 	paging_init();
 
+#ifdef CONFIG_PCI
 	early_quirks();
+#endif
 
 	/*
 	 * set this early, so we dont allocate cpu0
-- 
cgit v1.2.3


From 8f60774a116ced9b73ae3913d511687889efe725 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:41 +0200
Subject: [PATCH] x86: Move direct PCI scanning functions out of line

Saves about 200 bytes of code space.
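With the bodies out of line, early users only see prototypes and all
call sites share one copy of each accessor. A hypothetical caller
(scan_bus0_vendor() is made up for illustration):

	#include <asm/pci-direct.h>	/* extern prototypes from this patch */

	static u16 scan_bus0_vendor(u8 slot)
	{
		/* the dword at config offset 0 is (device ID << 16) | vendor ID */
		return read_pci_config(0, slot, 0, 0) & 0xffff;
	}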
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/pci/Makefile          |  2 +-
 arch/i386/pci/early.c           | 44 +++++++++++++++++++++++++++++++++++
 arch/x86_64/pci/Makefile        |  3 ++-
 include/asm-x86_64/pci-direct.h | 42 ++++-----------------------------
 4 files changed, 51 insertions(+), 40 deletions(-)
 create mode 100644 arch/i386/pci/early.c

(limited to 'arch/x86_64')

diff --git a/arch/i386/pci/Makefile b/arch/i386/pci/Makefile
index 62ad75c57e6a..1594d2f55c8f 100644
--- a/arch/i386/pci/Makefile
+++ b/arch/i386/pci/Makefile
@@ -11,4 +11,4 @@ pci-y				+= legacy.o irq.o
 pci-$(CONFIG_X86_VISWS)		:= visws.o fixup.o
 pci-$(CONFIG_X86_NUMAQ)		:= numa.o irq.o
 
-obj-y				+= $(pci-y) common.o
+obj-y				+= $(pci-y) common.o early.o
diff --git a/arch/i386/pci/early.c b/arch/i386/pci/early.c
new file mode 100644
index 000000000000..b1f7f40d809b
--- /dev/null
+++ b/arch/i386/pci/early.c
@@ -0,0 +1,44 @@
+#include <linux/kernel.h>
+#include <asm/pci-direct.h>
+#include <asm/io.h>
+
+/* Direct PCI access. This is used for PCI accesses in early boot before
+   the PCI subsystem works. */
+
+#define PDprintk(x...)
+
+u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset)
+{
+	u32 v;
+	outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
+	v = inl(0xcfc);
+	if (v != 0xffffffff)
+		PDprintk("%x reading 4 from %x: %x\n", slot, offset, v);
+	return v;
+}
+
+u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset)
+{
+	u8 v;
+	outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
+	v = inb(0xcfc + (offset&3));
+	PDprintk("%x reading 1 from %x: %x\n", slot, offset, v);
+	return v;
+}
+
+u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset)
+{
+	u16 v;
+	outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
+	v = inw(0xcfc + (offset&2));
+	PDprintk("%x reading 2 from %x: %x\n", slot, offset, v);
+	return v;
+}
+
+void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
+		      u32 val)
+{
+	PDprintk("%x writing to %x: %x\n", slot, offset, val);
+	outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
+	outl(val, 0xcfc);
+}
diff --git a/arch/x86_64/pci/Makefile b/arch/x86_64/pci/Makefile
index a3f6ad570179..1eb18f421edf 100644
--- a/arch/x86_64/pci/Makefile
+++ b/arch/x86_64/pci/Makefile
@@ -9,7 +9,7 @@ obj-y		:= i386.o
 obj-$(CONFIG_PCI_DIRECT)+= direct.o
 obj-y		+= fixup.o init.o
 obj-$(CONFIG_ACPI)	+= acpi.o
-obj-y			+= legacy.o irq.o common.o
+obj-y			+= legacy.o irq.o common.o early.o
 # mmconfig has a 64bit special
 obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o
 
@@ -23,3 +23,4 @@ common-y += ../../i386/pci/common.o
 fixup-y  += ../../i386/pci/fixup.o
 i386-y  += ../../i386/pci/i386.o
 init-y += ../../i386/pci/init.o
+early-y += ../../i386/pci/early.o
diff --git a/include/asm-x86_64/pci-direct.h b/include/asm-x86_64/pci-direct.h
index 036b6ca5b53b..9d916cdaa18e 100644
--- a/include/asm-x86_64/pci-direct.h
+++ b/include/asm-x86_64/pci-direct.h
@@ -2,47 +2,13 @@
 #define ASM_PCI_DIRECT_H 1
 
 #include <linux/types.h>
-#include <asm/io.h>
 
 /* Direct PCI access. This is used for PCI accesses in early boot before
    the PCI subsystem works. */
 
-#define PDprintk(x...)
-
-static inline u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset)
-{
-	u32 v;
-	outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
-	v = inl(0xcfc);
-	if (v != 0xffffffff)
-		PDprintk("%x reading 4 from %x: %x\n", slot, offset, v);
-	return v;
-}
-
-static inline u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset)
-{
-	u8 v;
-	outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
-	v = inb(0xcfc + (offset&3));
-	PDprintk("%x reading 1 from %x: %x\n", slot, offset, v);
-	return v;
-}
-
-static inline u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset)
-{
-	u16 v;
-	outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
-	v = inw(0xcfc + (offset&2));
-	PDprintk("%x reading 2 from %x: %x\n", slot, offset, v);
-	return v;
-}
-
-static inline void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
-				    u32 val)
-{
-	PDprintk("%x writing to %x: %x\n", slot, offset, val);
-	outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
-	outl(val, 0xcfc);
-}
+extern u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset);
+extern u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset);
+extern u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset);
+extern void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, u32 val);
 
 #endif
-- 
cgit v1.2.3


From 0637a70a5db98182d9ad3d6ae1ee30acf20afde9 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:41 +0200
Subject: [PATCH] x86: Allow disabling early pci scans with pci=noearly or
 disallowing conf1

Some buggy systems can machine check when config space accesses happen
for some nonexistent devices. i386/x86-64 do some early device scans
that might trigger this. Allow pci=noearly to disable this. Also, when
type 1 accesses are disabled, don't do any early accesses, which are
always type 1.

This moves the pci= configuration parsing to be an early parameter.
I don't think this can break anything because it only changes a single
global that is only used by PCI.

Cc: gregkh@suse.de
Cc: Trammell Hudson <hudson@osresearch.net>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 Documentation/kernel-parameters.txt |  6 +++++-
 arch/i386/kernel/acpi/earlyquirk.c  |  6 +++++-
 arch/i386/pci/common.c              |  4 ++++
 arch/i386/pci/early.c               |  8 ++++++++
 arch/i386/pci/pci.h                 |  1 +
 arch/x86_64/kernel/aperture.c       |  2 +-
 arch/x86_64/kernel/early-quirks.c   |  4 ++++
 arch/x86_64/kernel/pci-calgary.c    |  3 +++
 arch/x86_64/kernel/vsmp.c           |  3 +++
 arch/x86_64/mm/k8topology.c         |  3 +++
 drivers/pci/pci.c                   |  5 ++---
 include/asm-x86_64/pci-direct.h     |  2 ++
 12 files changed, 41 insertions(+), 6 deletions(-)

(limited to 'arch/x86_64')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 71d05f481727..4ae99f6e9938 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1240,7 +1240,11 @@ running once the system is up.
 				bootloader. This is currently used on
 				IXP2000 systems where the bus has to be
 				configured a certain way for adjunct CPUs.
-
+		noearly		[X86] Don't do any early type 1 scanning.
+				This might help on some broken boards which
+				machine check when some devices' config space
+				is read. But various workarounds are disabled
+				and some IOMMU drivers will not work.
 	pcmv=		[HW,PCMCIA] BadgePAD 4
 
 	pd.		[PARIDE]
diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c
index 1649a175a206..fe799b11ac0a 100644
--- a/arch/i386/kernel/acpi/earlyquirk.c
+++ b/arch/i386/kernel/acpi/earlyquirk.c
@@ -48,7 +48,11 @@ void __init check_acpi_pci(void)
 	int num, slot, func;
 
 	/* Assume the machine supports type 1. If not it will
-	   always read ffffffff and should not have any side effect. */
+	   always read ffffffff and should not have any side effect.
+	   Actually a few buggy systems can machine check. Allow the user
+	   to disable it by command line option at least -AK */
+	if (!early_pci_allowed())
+		return;
 
 	/* Poor man's PCI discovery */
 	for (num = 0; num < 32; num++) {
diff --git a/arch/i386/pci/common.c b/arch/i386/pci/common.c
index 0a362e3aeac5..68bce194e688 100644
--- a/arch/i386/pci/common.c
+++ b/arch/i386/pci/common.c
@@ -242,6 +242,10 @@ char * __devinit  pcibios_setup(char *str)
 		acpi_noirq_set();
 		return NULL;
 	}
+	else if (!strcmp(str, "noearly")) {
+		pci_probe |= PCI_PROBE_NOEARLY;
+		return NULL;
+	}
 #ifndef CONFIG_X86_VISWS
 	else if (!strcmp(str, "usepirqmask")) {
 		pci_probe |= PCI_USE_PIRQ_MASK;
diff --git a/arch/i386/pci/early.c b/arch/i386/pci/early.c
index b1f7f40d809b..713d6c866cae 100644
--- a/arch/i386/pci/early.c
+++ b/arch/i386/pci/early.c
@@ -1,6 +1,8 @@
 #include <linux/kernel.h>
+#include <linux/pci.h>
 #include <asm/pci-direct.h>
 #include <asm/io.h>
+#include "pci.h"
 
 /* Direct PCI access. This is used for PCI accesses in early boot before
    the PCI subsystem works. */
@@ -42,3 +44,9 @@ void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
 	outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
 	outl(val, 0xcfc);
 }
+
+int early_pci_allowed(void)
+{
+	return (pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) ==
+			PCI_PROBE_CONF1;
+}
diff --git a/arch/i386/pci/pci.h b/arch/i386/pci/pci.h
index 8a7cf1f23684..1814f74569c6 100644
--- a/arch/i386/pci/pci.h
+++ b/arch/i386/pci/pci.h
@@ -17,6 +17,7 @@
 #define PCI_PROBE_CONF2		0x0004
 #define PCI_PROBE_MMCONF	0x0008
 #define PCI_PROBE_MASK		0x000f
+#define PCI_PROBE_NOEARLY	0x0010
 
 #define PCI_NO_SORT		0x0100
 #define PCI_BIOS_SORT		0x0200
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
index f851e1f9c82a..b487396c4c5b 100644
--- a/arch/x86_64/kernel/aperture.c
+++ b/arch/x86_64/kernel/aperture.c
@@ -212,7 +212,7 @@ void __init iommu_hole_init(void)
 	u64 aper_base, last_aper_base = 0;
 	int valid_agp = 0;
 
-	if (iommu_aperture_disabled || !fix_aperture)
+	if (iommu_aperture_disabled || !fix_aperture || !early_pci_allowed())
 		return;
 
 	printk("Checking aperture...\n");
diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c
index d637cff1c4b1..208e38a372c1 100644
--- a/arch/x86_64/kernel/early-quirks.c
+++ b/arch/x86_64/kernel/early-quirks.c
@@ -82,6 +82,10 @@ static struct chipset early_qrk[] = {
 void __init early_quirks(void)
 {
 	int num, slot, func;
+
+	if (!early_pci_allowed())
+		return;
+
 	/* Poor man's PCI discovery */
 	for (num = 0; num < 32; num++) {
 		for (slot = 0; slot < 32; slot++) {
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 466588f95601..cfb09b07ae99 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -924,6 +924,9 @@ void __init detect_calgary(void)
 	if (swiotlb || no_iommu || iommu_detected)
 		return;
 
+	if (!early_pci_allowed())
+		return;
+
 	specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
 
 	for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
diff --git a/arch/x86_64/kernel/vsmp.c b/arch/x86_64/kernel/vsmp.c
index 92f70c74965f..044e852bd25e 100644
--- a/arch/x86_64/kernel/vsmp.c
+++ b/arch/x86_64/kernel/vsmp.c
@@ -20,6 +20,9 @@ static int __init vsmp_init(void)
 	void *address;
 	unsigned int cap, ctl;
 
+	if (!early_pci_allowed())
+		return 0;
+
 	/* Check if we are running on a ScaleMP vSMP box */
 	if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) ||
 	    (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL))
diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c
index 7c45c2d2b8b2..5cf594f9230d 100644
--- a/arch/x86_64/mm/k8topology.c
+++ b/arch/x86_64/mm/k8topology.c
@@ -54,6 +54,9 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 
 	nodes_clear(nodes_parsed);
 
+	if (!early_pci_allowed())
+		return -1;
+
 	nb = find_northbridge();
 	if (nb < 0)
 		return nb;
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 9f79dd6d51ab..684deb6b03aa 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -953,13 +953,12 @@ static int __devinit pci_setup(char *str)
 		}
 		str = k;
 	}
-	return 1;
+	return 0;
 }
+early_param("pci", pci_setup);
 
 device_initcall(pci_init);
 
-__setup("pci=", pci_setup);
-
 #if defined(CONFIG_ISA) || defined(CONFIG_EISA)
 /* FIXME: Some boxes have multiple ISA bridges! */
 struct pci_dev *isa_bridge;
diff --git a/include/asm-x86_64/pci-direct.h b/include/asm-x86_64/pci-direct.h
index 9d916cdaa18e..eba9cb471df3 100644
--- a/include/asm-x86_64/pci-direct.h
+++ b/include/asm-x86_64/pci-direct.h
@@ -11,4 +11,6 @@ extern u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset);
 extern u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset);
 extern void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, u32 val);
 
+extern int early_pci_allowed(void);
+
 #endif
-- 
cgit v1.2.3


From 15d5f8398311f565682959daaca30e3ca7aea600 Mon Sep 17 00:00:00 2001
From: Dmitriy Zavin <dmitriyz@google.com>
Date: Tue, 26 Sep 2006 10:52:42 +0200
Subject: [PATCH] x86: Refactor thermal throttle processing

Refactor the event processing (syslog messaging and rate limiting)
into a separate file, therm_throt.c. This allows consistent reporting
of CPU thermal throttle events.

After ACK'ing the interrupt, if the event is current, the user
(p4.c/mce_intel.c) calls therm_throt_process to log (and rate limit)
the event. If that function returns 1, the user has the option to log
things further (such as to mce_log in x86_64).
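The resulting calling convention, condensed into one sketch (the
x86-64 mcelog step and the i386 variant from the diff below are shown
together; thermal_interrupt_skeleton() is not a real function):

	static void thermal_interrupt_skeleton(void)
	{
		__u64 msr_val;

		ack_APIC_irq();
		rdmsrl(MSR_IA32_THERM_STATUS, msr_val);

		/* bit 0 = currently throttling; therm_throt_process() rate
		 * limits and prints, and returns 1 when the caller may log
		 * the event further (x86-64 feeds mcelog at this point) */
		if (therm_throt_process(msr_val & 0x1))
			mce_log_therm_throt_event(smp_processor_id(), msr_val);
	}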
AK: minor cleanup

Signed-off-by: Dmitriy Zavin <dmitriyz@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/cpu/mcheck/Makefile      |  2 +-
 arch/i386/kernel/cpu/mcheck/p4.c          | 23 ++++--------
 arch/i386/kernel/cpu/mcheck/therm_throt.c | 58 +++++++++++++++++++++++++++++++
 arch/x86_64/kernel/Makefile               |  4 +--
 arch/x86_64/kernel/mce.c                  | 27 ++++++++++++++
 arch/x86_64/kernel/mce_intel.c            | 27 ++++----------
 include/asm-i386/therm_throt.h            |  6 ++++
 include/asm-x86_64/mce.h                  |  2 ++
 include/asm-x86_64/therm_throt.h          |  1 +
 9 files changed, 109 insertions(+), 41 deletions(-)
 create mode 100644 arch/i386/kernel/cpu/mcheck/therm_throt.c
 create mode 100644 include/asm-i386/therm_throt.h
 create mode 100644 include/asm-x86_64/therm_throt.h

(limited to 'arch/x86_64')

diff --git a/arch/i386/kernel/cpu/mcheck/Makefile b/arch/i386/kernel/cpu/mcheck/Makefile
index 30808f3d6715..f1ebe1c1c17a 100644
--- a/arch/i386/kernel/cpu/mcheck/Makefile
+++ b/arch/i386/kernel/cpu/mcheck/Makefile
@@ -1,2 +1,2 @@
-obj-y	=	mce.o k7.o p4.o p5.o p6.o winchip.o
+obj-y	=	mce.o k7.o p4.o p5.o p6.o winchip.o therm_throt.o
 obj-$(CONFIG_X86_MCE_NONFATAL)	+=	non-fatal.o
diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c
index b95f1b3d53aa..d83a669d376f 100644
--- a/arch/i386/kernel/cpu/mcheck/p4.c
+++ b/arch/i386/kernel/cpu/mcheck/p4.c
@@ -13,6 +13,8 @@
 #include <asm/msr.h>
 #include <asm/apic.h>
 
+#include <asm/therm_throt.h>
+
 #include "mce.h"
 
 /* as supported by the P4/Xeon family */
@@ -44,25 +46,12 @@ static void unexpected_thermal_interrupt(struct pt_regs *regs)
 /* P4/Xeon Thermal transition interrupt handler */
 static void intel_thermal_interrupt(struct pt_regs *regs)
 {
-	u32 l, h;
-	unsigned int cpu = smp_processor_id();
-	static unsigned long next[NR_CPUS];
+	__u64 msr_val;
 
 	ack_APIC_irq();
 
-	if (time_after(next[cpu], jiffies))
-		return;
-
-	next[cpu] = jiffies + HZ*5;
-	rdmsr(MSR_IA32_THERM_STATUS, l, h);
-	if (l & 0x1) {
-		printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu);
-		printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n",
-				cpu);
-		add_taint(TAINT_MACHINE_CHECK);
-	} else {
-		printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
-	}
+	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
+	therm_throt_process(msr_val & 0x1);
 }
 
 /* Thermal interrupt handler for this CPU setup */
@@ -122,7 +111,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
 
 	rdmsr (MSR_IA32_MISC_ENABLE, l, h);
 	wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
-	
+
 	l = apic_read (APIC_LVTTHMR);
 	apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
 	printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c
new file mode 100644
index 000000000000..85eba00d680e
--- /dev/null
+++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c
@@ -0,0 +1,58 @@
+/*
+ * linux/arch/i386/kernel/cpu/mcheck/therm_throt.c
+ *
+ * Thermal throttle event support code.
+ *
+ * Author: Dmitriy Zavin (dmitriyz@google.com)
+ *
+ * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
+ *
+ */
+
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+#include <asm/cpu.h>
+#include <linux/notifier.h>
+#include <asm/therm_throt.h>
+
+/* How long to wait between reporting thermal events */
+#define CHECK_INTERVAL              (300 * HZ)
+
+static DEFINE_PER_CPU(unsigned long, next_check);
+
+/***
+ * therm_throt_process - Process thermal throttling event
+ * @curr: Whether the condition is current or not (boolean), since the
+ *        thermal interrupt normally gets called both when the thermal
+ *        event begins and once the event has ended.
+ *
+ * This function is normally called by the thermal interrupt after the
+ * IRQ has been acknowledged.
+ *
+ * It will take care of rate limiting and printing messages to the syslog.
+ *
+ * Returns: 0 : Event should NOT be further logged, i.e. still in
+ *              "timeout" from previous log message.
+ *          1 : Event should be logged further, and a message has been
+ *              printed to the syslog.
+ */
+int therm_throt_process(int curr)
+{
+	unsigned int cpu = smp_processor_id();
+
+	if (time_before(jiffies, __get_cpu_var(next_check)))
+		return 0;
+
+	__get_cpu_var(next_check) = jiffies + CHECK_INTERVAL;
+
+	/* if we just entered the thermal event */
+	if (curr) {
+		printk(KERN_CRIT "CPU%d: Temperature above threshold, "
+		       "cpu clock throttled\n", cpu);
+		add_taint(TAINT_MACHINE_CHECK);
+	} else {
+		printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu);
+	}
+
+	return 1;
+}
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 2466fbd035ee..3c7cbff04d3d 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -11,7 +11,7 @@ obj-y	:= process.o signal.o entry.o traps.o irq.o \
 		pci-dma.o pci-nommu.o alternative.o
 
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
-obj-$(CONFIG_X86_MCE)		+= mce.o
+obj-$(CONFIG_X86_MCE)		+= mce.o therm_throt.o
 obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel.o
 obj-$(CONFIG_X86_MCE_AMD)	+= mce_amd.o
 obj-$(CONFIG_MTRR)		+= ../../i386/kernel/cpu/mtrr/
@@ -46,6 +46,7 @@ obj-y				+= intel_cacheinfo.o
 
 CFLAGS_vsyscall.o		:= $(PROFILING) -g0
 
+therm_throt-y			+= ../../i386/kernel/cpu/mcheck/therm_throt.o
 bootflag-y			+= ../../i386/kernel/bootflag.o
 cpuid-$(subst m,y,$(CONFIG_X86_CPUID))  += ../../i386/kernel/cpuid.o
 topology-y			+= ../../i386/kernel/topology.o
@@ -55,4 +56,3 @@ quirks-y			+= ../../i386/kernel/quirks.o
 i8237-y				+= ../../i386/kernel/i8237.o
 msr-$(subst m,y,$(CONFIG_X86_MSR))  += ../../i386/kernel/msr.o
 alternative-y			+= ../../i386/kernel/alternative.o
-
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 1a93c3738404..bbea88801d88 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -274,6 +274,33 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	atomic_dec(&mce_entry);
 }
 
+#ifdef CONFIG_X86_MCE_INTEL
+/***
+ * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
+ * @cpu: The CPU on which the event occurred.
+ * @status: Event status information
+ *
+ * This function should be called by the thermal interrupt after the
+ * event has been processed and the decision was made to log the event
+ * further.
+ *
+ * The status parameter will be saved to the 'status' field of 'struct mce'
+ * and historically has been the register value of the
+ * MSR_IA32_THERMAL_STATUS (Intel) msr.
+ */
+void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
+{
+	struct mce m;
+
+	memset(&m, 0, sizeof(m));
+	m.cpu = cpu;
+	m.bank = MCE_THERMAL_BANK;
+	m.status = status;
+	rdtscll(m.tsc);
+	mce_log(&m);
+}
+#endif /* CONFIG_X86_MCE_INTEL */
+
 /*
  * Periodic polling timer for "silent" machine check errors.
  */
diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c
index 8f533d2c40cb..dec11219e278 100644
--- a/arch/x86_64/kernel/mce_intel.c
+++ b/arch/x86_64/kernel/mce_intel.c
@@ -11,36 +11,21 @@
 #include <asm/mce.h>
 #include <asm/hw_irq.h>
 #include <asm/idle.h>
-
-static DEFINE_PER_CPU(unsigned long, next_check);
+#include <asm/therm_throt.h>
 
 asmlinkage void smp_thermal_interrupt(void)
 {
-	struct mce m;
+	__u64 msr_val;
 
 	ack_APIC_irq();
 	exit_idle();
 	irq_enter();
-	if (time_before(jiffies, __get_cpu_var(next_check)))
-		goto done;
-
-	__get_cpu_var(next_check) = jiffies + HZ*300;
-	memset(&m, 0, sizeof(m));
-	m.cpu = smp_processor_id();
-	m.bank = MCE_THERMAL_BANK;
-	rdtscll(m.tsc);
-	rdmsrl(MSR_IA32_THERM_STATUS, m.status);
-	if (m.status & 0x1) {
-		printk(KERN_EMERG
-			"CPU%d: Temperature above threshold, cpu clock throttled\n", m.cpu);
-		add_taint(TAINT_MACHINE_CHECK);
-	} else {
-		printk(KERN_EMERG "CPU%d: Temperature/speed normal\n", m.cpu);
-	}
-	mce_log(&m);
-done:
+
+	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
+	if (therm_throt_process(msr_val & 1))
+		mce_log_therm_throt_event(smp_processor_id(), msr_val);
+
 	irq_exit();
 }
diff --git a/include/asm-i386/therm_throt.h b/include/asm-i386/therm_throt.h
new file mode 100644
index 000000000000..3c9c22cc10c9
--- /dev/null
+++ b/include/asm-i386/therm_throt.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_I386_THERM_THROT_H__
+#define __ASM_I386_THERM_THROT_H__ 1
+
+int therm_throt_process(int curr);
+
+#endif /* __ASM_I386_THERM_THROT_H__ */
diff --git a/include/asm-x86_64/mce.h b/include/asm-x86_64/mce.h
index d13687dfd691..5a11146d6d9c 100644
--- a/include/asm-x86_64/mce.h
+++ b/include/asm-x86_64/mce.h
@@ -99,6 +99,8 @@ static inline void mce_amd_feature_init(struct cpuinfo_x86 *c)
 }
 #endif
 
+void mce_log_therm_throt_event(unsigned int cpu, __u64 status);
+
 extern atomic_t mce_entry;
 
 #endif
diff --git a/include/asm-x86_64/therm_throt.h b/include/asm-x86_64/therm_throt.h
new file mode 100644
index 000000000000..5aac059007ba
--- /dev/null
+++ b/include/asm-x86_64/therm_throt.h
@@ -0,0 +1 @@
+#include <asm-i386/therm_throt.h>
-- 
cgit v1.2.3


From 3222b36f46c22f46697a0a53fa8804153a32669f Mon Sep 17 00:00:00 2001
From: Dmitriy Zavin <dmitriyz@google.com>
Date: Tue, 26 Sep 2006 10:52:42 +0200
Subject: [PATCH] x86: Add a cumulative thermal throttle event counter.

A counter is exported to /sys that keeps track of the number of
thermal events, so that the user knows how bad the thermal problem
might be (since logging to syslog and mcelog is rate limited).
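For reference, a userspace sketch of reading the new counter (the path
follows from the "thermal_throttle" attribute group and its "count"
attribute added below; cpu0 is assumed):

	#include <stdio.h>

	int main(void)
	{
		unsigned long count;
		FILE *f = fopen("/sys/devices/system/cpu/cpu0/thermal_throttle/count", "r");

		if (f && fscanf(f, "%lu", &count) == 1)
			printf("CPU0 thermal throttle events: %lu\n", count);
		if (f)
			fclose(f);
		return 0;
	}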
AK: Fixed cpu hotplug locking

Signed-off-by: Dmitriy Zavin <dmitriyz@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/cpu/mcheck/p4.c          |   3 +
 arch/i386/kernel/cpu/mcheck/therm_throt.c | 133 ++++++++++++++++++++++++++++--
 arch/x86_64/kernel/mce_intel.c            |   3 +
 include/asm-i386/therm_throt.h            |   3 +
 4 files changed, 136 insertions(+), 6 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c
index d83a669d376f..504434a46011 100644
--- a/arch/i386/kernel/cpu/mcheck/p4.c
+++ b/arch/i386/kernel/cpu/mcheck/p4.c
@@ -115,6 +115,9 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
 	l = apic_read (APIC_LVTTHMR);
 	apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
 	printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
+
+	/* enable thermal throttle processing */
+	atomic_set(&therm_throt_en, 1);
 	return;
 }
 #endif /* CONFIG_X86_MCE_P4THERMAL */
diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c
index 101f7ace00ce..4f43047de406 100644
--- a/arch/i386/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c
@@ -1,15 +1,22 @@
 /*
  * linux/arch/i386/kernel/cpu/mcheck/therm_throt.c
  *
- * Thermal throttle event support code.
+ * Thermal throttle event support code (such as syslog messaging and rate
+ * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
+ * This allows consistent reporting of CPU thermal throttle events.
+ *
+ * Maintains a counter in /sys that keeps track of the number of thermal
+ * events, such that the user knows how bad the thermal problem might be
+ * (since the logging to syslog and mcelog is rate limited).
  *
  * Author: Dmitriy Zavin (dmitriyz@google.com)
  *
  * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
- *
+ *          Inspired by Ross Biro's and Al Borchers' counter code.
 */
 
 #include <linux/percpu.h>
+#include <linux/sysdev.h>
 #include <linux/cpu.h>
 #include <asm/cpu.h>
 #include <linux/notifier.h>
@@ -18,15 +25,53 @@
 /* How long to wait between reporting thermal events */
 #define CHECK_INTERVAL              (300 * HZ)
 
-static DEFINE_PER_CPU(__u64, next_check);
+static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
+static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
+atomic_t therm_throt_en = ATOMIC_INIT(0);
+
+#ifdef CONFIG_SYSFS
+#define define_therm_throt_sysdev_one_ro(_name) \
+	static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
+
+#define define_therm_throt_sysdev_show_func(name) \
+static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
+					      char *buf) \
+{ \
+	unsigned int cpu = dev->id; \
+	ssize_t ret; \
+ \
+	preempt_disable();	/* CPU hotplug */ \
+	if (cpu_online(cpu)) \
+		ret = sprintf(buf, "%lu\n", \
+			      per_cpu(thermal_throttle_##name, cpu)); \
+	else \
+		ret = 0; \
+	preempt_enable(); \
+ \
+	return ret; \
+}
+
+define_therm_throt_sysdev_show_func(count);
+define_therm_throt_sysdev_one_ro(count);
+
+static struct attribute *thermal_throttle_attrs[] = {
+	&attr_count.attr,
+	NULL
+};
+
+static struct attribute_group thermal_throttle_attr_group = {
+	.attrs = thermal_throttle_attrs,
+	.name = "thermal_throttle"
+};
+#endif /* CONFIG_SYSFS */
 
 /***
- * therm_throt_process - Process thermal throttling event
+ * therm_throt_process - Process thermal throttling event from interrupt
  * @curr: Whether the condition is current or not (boolean), since the
  *        thermal interrupt normally gets called both when the thermal
  *        event begins and once the event has ended.
  *
- * This function is normally called by the thermal interrupt after the
+ * This function is called by the thermal interrupt after the
  * IRQ has been acknowledged.
  *
  * It will take care of rate limiting and printing messages to the syslog.
@@ -41,6 +86,9 @@ int therm_throt_process(int curr)
 	unsigned int cpu = smp_processor_id();
 	__u64 tmp_jiffs = get_jiffies_64();
 
+	if (curr)
+		__get_cpu_var(thermal_throttle_count)++;
+
 	if (time_before64(tmp_jiffs, __get_cpu_var(next_check)))
 		return 0;
 
@@ -49,7 +97,9 @@ int therm_throt_process(int curr)
 	/* if we just entered the thermal event */
 	if (curr) {
 		printk(KERN_CRIT "CPU%d: Temperature above threshold, "
-		       "cpu clock throttled\n", cpu);
+		       "cpu clock throttled (total events = %lu)\n", cpu,
+		       __get_cpu_var(thermal_throttle_count));
+
 		add_taint(TAINT_MACHINE_CHECK);
 	} else {
 		printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu);
@@ -57,3 +107,74 @@ int therm_throt_process(int curr)
 
 	return 1;
 }
+
+#ifdef CONFIG_SYSFS
+/* Add/Remove thermal_throttle interface for CPU device */
+static __cpuinit int thermal_throttle_add_dev(struct sys_device * sys_dev)
+{
+	sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+	return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static __cpuinit int thermal_throttle_remove_dev(struct sys_device * sys_dev)
+{
+	sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+	return 0;
+}
+
+/* Mutex protecting device creation against CPU hotplug */
+static DEFINE_MUTEX(therm_cpu_lock);
+
+/* Get notified when a cpu comes on/off. Be hotplug friendly. */
+static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb,
+						   unsigned long action,
+						   void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+	struct sys_device *sys_dev;
+
+	sys_dev = get_cpu_sysdev(cpu);
+	mutex_lock(&therm_cpu_lock);
+	switch (action) {
+	case CPU_ONLINE:
+		thermal_throttle_add_dev(sys_dev);
+		break;
+	case CPU_DEAD:
+		thermal_throttle_remove_dev(sys_dev);
+		break;
+	}
+	mutex_unlock(&therm_cpu_lock);
+	return NOTIFY_OK;
+}
+
+static struct notifier_block thermal_throttle_cpu_notifier =
+{
+	.notifier_call = thermal_throttle_cpu_callback,
+};
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static __init int thermal_throttle_init_device(void)
+{
+	unsigned int cpu = 0;
+
+	if (!atomic_read(&therm_throt_en))
+		return 0;
+
+	register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
+
+#ifdef CONFIG_HOTPLUG_CPU
+	mutex_lock(&therm_cpu_lock);
+#endif
+	/* connect live CPUs to sysfs */
+	for_each_online_cpu(cpu)
+		thermal_throttle_add_dev(get_cpu_sysdev(cpu));
+#ifdef CONFIG_HOTPLUG_CPU
+	mutex_unlock(&therm_cpu_lock);
+#endif
+
+	return 0;
+}
+
+device_initcall(thermal_throttle_init_device);
+#endif /* CONFIG_SYSFS */
diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c
index dec11219e278..6551505d8a2c 100644
--- a/arch/x86_64/kernel/mce_intel.c
+++ b/arch/x86_64/kernel/mce_intel.c
@@ -77,6 +77,9 @@ static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
 	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
 	printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
 		cpu, tm2 ? "TM2" : "TM1");
+
+	/* enable thermal throttle processing */
+	atomic_set(&therm_throt_en, 1);
 	return;
 }
diff --git a/include/asm-i386/therm_throt.h b/include/asm-i386/therm_throt.h
index 3c9c22cc10c9..399bf6026b16 100644
--- a/include/asm-i386/therm_throt.h
+++ b/include/asm-i386/therm_throt.h
@@ -1,6 +1,9 @@
 #ifndef __ASM_I386_THERM_THROT_H__
 #define __ASM_I386_THERM_THROT_H__ 1
 
+#include <asm/atomic.h>
+
+extern atomic_t therm_throt_en;
 int therm_throt_process(int curr);
 
 #endif /* __ASM_I386_THERM_THROT_H__ */
-- 
cgit v1.2.3


From dcf10307c3fff2bca4b104ad8fe4e3aa8dcb2ad1 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Tue, 26 Sep 2006 10:52:42 +0200
Subject: [PATCH] i386/x86-64: New Intel feature flags

Add the supplemental SSE3 instructions flag and the Direct Cache
Access flag.
As described in "Intel Processor identification and the CPUID
instruction AP485 Sept 2006".

AK: also added for x86-64

Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/i386/kernel/cpu/proc.c | 4 ++--
 arch/x86_64/kernel/setup.c  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86_64')

diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index f54a15268ed7..76aac088a323 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -46,8 +46,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 
 		/* Intel-defined (#2) */
 		"pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
-		"tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+		"tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
+		NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 
 		/* VIA/Cyrix/Centaur-defined */
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 3d8309b1236e..3c7ad267d8a7 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -1071,8 +1071,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 
 		/* Intel-defined (#2) */
 		"pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
-		"tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
-		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+		"tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
+		NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 
 		/* VIA/Cyrix/Centaur-defined */
-- 
cgit v1.2.3


From 3f75f42d7733e73aca5c78326489efd4189e0111 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Tue, 26 Sep 2006 10:52:42 +0200
Subject: [PATCH] Don't set calgary iommu as default y

Most systems don't need it.

Signed-off-by: Andi Kleen <ak@suse.de>
---
 arch/x86_64/Kconfig | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86_64')

diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index c2c68b902347..294bf77f7222 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -422,7 +422,6 @@ config IOMMU
 
 config CALGARY_IOMMU
 	bool "IBM Calgary IOMMU support"
-	default y
 	select SWIOTLB
 	depends on PCI && EXPERIMENTAL
 	help
-- 
cgit v1.2.3