Diffstat (limited to 'arch/x86'): 120 files changed, 2441 insertions, 923 deletions
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index f35ea2237522..a0ae2e7f6cec 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -27,13 +27,22 @@ #include <asm/segment.h> #include <asm/page.h> #include <asm/boot.h> +#include <asm/asm-offsets.h> .section ".text.head","ax",@progbits .globl startup_32 startup_32: - cld - cli + /* check to see if KEEP_SEGMENTS flag is meaningful */ + cmpw $0x207, BP_version(%esi) + jb 1f + + /* test KEEP_SEGMENTS flag to see if the bootloader is asking + * us to not reload segments */ + testb $(1<<6), BP_loadflags(%esi) + jnz 2f + +1: cli movl $(__BOOT_DS),%eax movl %eax,%ds movl %eax,%es @@ -41,6 +50,8 @@ startup_32: movl %eax,%gs movl %eax,%ss +2: cld + /* Calculate the delta between where we were compiled to run * at and where we were actually loaded at. This can only be done * with a short local call on x86. Nothing else will tell us what diff --git a/arch/x86/boot/compressed/misc_32.c b/arch/x86/boot/compressed/misc_32.c index b28505c544c9..b74d60d1b2fa 100644 --- a/arch/x86/boot/compressed/misc_32.c +++ b/arch/x86/boot/compressed/misc_32.c @@ -25,7 +25,7 @@ /* * Getting to provable safe in place decompression is hard. - * Worst case behaviours need to be analized. + * Worst case behaviours need to be analyzed. * Background information: * * The file layout is: @@ -94,7 +94,7 @@ * Adding 32768 instead of 32767 just makes for round numbers. * Adding the decompressor_size is necessary as it musht live after all * of the data as well. Last I measured the decompressor is about 14K. - * 10K of actuall data and 4K of bss. + * 10K of actual data and 4K of bss. * */ @@ -247,6 +247,9 @@ static void putstr(const char *s) int x,y,pos; char c; + if (RM_SCREEN_INFO.orig_video_mode == 0 && lines == 0 && cols == 0) + return; + x = RM_SCREEN_INFO.orig_x; y = RM_SCREEN_INFO.orig_y; diff --git a/arch/x86/boot/compressed/misc_64.c b/arch/x86/boot/compressed/misc_64.c index f932b0e89096..6ea015aa65e4 100644 --- a/arch/x86/boot/compressed/misc_64.c +++ b/arch/x86/boot/compressed/misc_64.c @@ -25,7 +25,7 @@ /* * Getting to provable safe in place decompression is hard. - * Worst case behaviours need to be analized. + * Worst case behaviours need to be analyzed. * Background information: * * The file layout is: @@ -94,7 +94,7 @@ * Adding 32768 instead of 32767 just makes for round numbers. * Adding the decompressor_size is necessary as it musht live after all * of the data as well. Last I measured the decompressor is about 14K. - * 10K of actuall data and 4K of bss. + * 10K of actual data and 4K of bss. 
* */ diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index f3140e596d40..8353c81c41c0 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -119,7 +119,7 @@ _start: # Part 2 of the header, from the old setup.S .ascii "HdrS" # header signature - .word 0x0206 # header version number (>= 0x0105) + .word 0x0207 # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) .globl realmode_swtch realmode_swtch: .word 0, 0 # default_switch, SETUPSEG @@ -214,6 +214,11 @@ cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, #added with boot protocol #version 2.06 +hardware_subarch: .long 0 # subarchitecture, added with 2.07 + # default to 0 for normal x86 PC + +hardware_subarch_data: .quad 0 + # End of setup header ##################################################### .section ".inittext", "ax" diff --git a/arch/x86/ia32/ia32_binfmt.c b/arch/x86/ia32/ia32_binfmt.c index 118b9f9ff499..55822d2cf053 100644 --- a/arch/x86/ia32/ia32_binfmt.c +++ b/arch/x86/ia32/ia32_binfmt.c @@ -5,10 +5,6 @@ * This tricks binfmt_elf.c into loading 32bit binaries using lots * of ugly preprocessor tricks. Talk about very very poor man's inheritance. */ -#define __ASM_X86_64_ELF_H 1 - -#undef ELF_CLASS -#define ELF_CLASS ELFCLASS32 #include <linux/types.h> #include <linux/stddef.h> @@ -19,6 +15,7 @@ #include <linux/binfmts.h> #include <linux/mm.h> #include <linux/security.h> +#include <linux/elfcore-compat.h> #include <asm/segment.h> #include <asm/ptrace.h> @@ -31,6 +28,20 @@ #include <asm/ia32.h> #include <asm/vsyscall32.h> +#undef ELF_ARCH +#undef ELF_CLASS +#define ELF_CLASS ELFCLASS32 +#define ELF_ARCH EM_386 + +#undef elfhdr +#undef elf_phdr +#undef elf_note +#undef elf_addr_t +#define elfhdr elf32_hdr +#define elf_phdr elf32_phdr +#define elf_note elf32_note +#define elf_addr_t Elf32_Off + #define ELF_NAME "elf/i386" #define AT_SYSINFO 32 @@ -48,74 +59,20 @@ int sysctl_vsyscall32 = 1; } while(0) struct file; -struct elf_phdr; #define IA32_EMULATOR 1 -#define ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) - -#undef ELF_ARCH -#define ELF_ARCH EM_386 - -#define ELF_DATA ELFDATA2LSB +#undef ELF_ET_DYN_BASE -#define USE_ELF_CORE_DUMP 1 - -/* Override elfcore.h */ -#define _LINUX_ELFCORE_H 1 -typedef unsigned int elf_greg_t; - -#define ELF_NGREG (sizeof (struct user_regs_struct32) / sizeof(elf_greg_t)) -typedef elf_greg_t elf_gregset_t[ELF_NGREG]; - -struct elf_siginfo -{ - int si_signo; /* signal number */ - int si_code; /* extra code */ - int si_errno; /* errno */ -}; +#define ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) #define jiffies_to_timeval(a,b) do { (b)->tv_usec = 0; (b)->tv_sec = (a)/HZ; }while(0) -struct elf_prstatus -{ - struct elf_siginfo pr_info; /* Info associated with signal */ - short pr_cursig; /* Current signal */ - unsigned int pr_sigpend; /* Set of pending signals */ - unsigned int pr_sighold; /* Set of held signals */ - pid_t pr_pid; - pid_t pr_ppid; - pid_t pr_pgrp; - pid_t pr_sid; - struct compat_timeval pr_utime; /* User time */ - struct compat_timeval pr_stime; /* System time */ - struct compat_timeval pr_cutime; /* Cumulative user time */ - struct compat_timeval pr_cstime; /* Cumulative system time */ - elf_gregset_t pr_reg; /* GP registers */ - int pr_fpvalid; /* True if math co-processor being used. 
*/ -}; - -#define ELF_PRARGSZ (80) /* Number of chars for args */ - -struct elf_prpsinfo -{ - char pr_state; /* numeric process state */ - char pr_sname; /* char for pr_state */ - char pr_zomb; /* zombie */ - char pr_nice; /* nice val */ - unsigned int pr_flag; /* flags */ - __u16 pr_uid; - __u16 pr_gid; - pid_t pr_pid, pr_ppid, pr_pgrp, pr_sid; - /* Lots missing */ - char pr_fname[16]; /* filename of executable */ - char pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */ -}; - #define _GET_SEG(x) \ ({ __u32 seg; asm("movl %%" __stringify(x) ",%0" : "=r"(seg)); seg; }) /* Assumes current==process to be dumped */ +#undef ELF_CORE_COPY_REGS #define ELF_CORE_COPY_REGS(pr_reg, regs) \ pr_reg[0] = regs->rbx; \ pr_reg[1] = regs->rcx; \ @@ -135,36 +92,41 @@ struct elf_prpsinfo pr_reg[15] = regs->rsp; \ pr_reg[16] = regs->ss; -#define user user32 + +#define elf_prstatus compat_elf_prstatus +#define elf_prpsinfo compat_elf_prpsinfo +#define elf_fpregset_t struct user_i387_ia32_struct +#define elf_fpxregset_t struct user32_fxsr_struct +#define user user32 #undef elf_read_implies_exec #define elf_read_implies_exec(ex, executable_stack) (executable_stack != EXSTACK_DISABLE_X) -//#include <asm/ia32.h> -#include <linux/elf.h> - -typedef struct user_i387_ia32_struct elf_fpregset_t; -typedef struct user32_fxsr_struct elf_fpxregset_t; - -static inline void elf_core_copy_regs(elf_gregset_t *elfregs, struct pt_regs *regs) +#define elf_core_copy_regs elf32_core_copy_regs +static inline void elf32_core_copy_regs(compat_elf_gregset_t *elfregs, + struct pt_regs *regs) { - ELF_CORE_COPY_REGS((*elfregs), regs) + ELF_CORE_COPY_REGS((&elfregs->ebx), regs) } -static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* elfregs) +#define elf_core_copy_task_regs elf32_core_copy_task_regs +static inline int elf32_core_copy_task_regs(struct task_struct *t, + compat_elf_gregset_t* elfregs) { struct pt_regs *pp = task_pt_regs(t); - ELF_CORE_COPY_REGS((*elfregs), pp); + ELF_CORE_COPY_REGS((&elfregs->ebx), pp); /* fix wrong segments */ - (*elfregs)[7] = t->thread.ds; - (*elfregs)[9] = t->thread.fsindex; - (*elfregs)[10] = t->thread.gsindex; - (*elfregs)[8] = t->thread.es; + elfregs->ds = t->thread.ds; + elfregs->fs = t->thread.fsindex; + elfregs->gs = t->thread.gsindex; + elfregs->es = t->thread.es; return 1; } +#define elf_core_copy_task_fpregs elf32_core_copy_task_fpregs static inline int -elf_core_copy_task_fpregs(struct task_struct *tsk, struct pt_regs *regs, elf_fpregset_t *fpu) +elf32_core_copy_task_fpregs(struct task_struct *tsk, struct pt_regs *regs, + elf_fpregset_t *fpu) { struct _fpstate_ia32 *fpstate = (void*)fpu; mm_segment_t oldfs = get_fs(); @@ -186,8 +148,9 @@ elf_core_copy_task_fpregs(struct task_struct *tsk, struct pt_regs *regs, elf_fpr #define ELF_CORE_COPY_XFPREGS 1 #define ELF_CORE_XFPREG_TYPE NT_PRXFPREG +#define elf_core_copy_task_xfpregs elf32_core_copy_task_xfpregs static inline int -elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu) +elf32_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu) { struct pt_regs *regs = task_pt_regs(t); if (!tsk_used_math(t)) @@ -206,6 +169,10 @@ elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu) extern int force_personality32; +#undef ELF_EXEC_PAGESIZE +#undef ELF_HWCAP +#undef ELF_PLATFORM +#undef SET_PERSONALITY #define ELF_EXEC_PAGESIZE PAGE_SIZE #define ELF_HWCAP (boot_cpu_data.x86_capability[0]) #define ELF_PLATFORM ("i686") @@ -231,6 +198,7 @@ do { \ #define load_elf_binary 
load_elf32_binary +#undef ELF_PLAT_INIT #define ELF_PLAT_INIT(r, load_addr) elf32_init(r) #undef start_thread @@ -289,7 +257,6 @@ static void elf32_init(struct pt_regs *regs) static ctl_table abi_table2[] = { { - .ctl_name = 99, .procname = "vsyscall32", .data = &sysctl_vsyscall32, .maxlen = sizeof(int), diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32 index a3fa11f8f460..ccea590bbb92 100644 --- a/arch/x86/kernel/Makefile_32 +++ b/arch/x86/kernel/Makefile_32 @@ -2,7 +2,7 @@ # Makefile for the linux kernel. # -extra-y := head_32.o init_task_32.o vmlinux.lds +extra-y := head_32.o init_task.o vmlinux.lds obj-y := process_32.o signal_32.o entry_32.o traps_32.o irq_32.o \ ptrace_32.o time_32.o ioport_32.o ldt_32.o setup_32.o i8259_32.o sys_i386_32.o \ @@ -17,6 +17,7 @@ obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_PCI) += early-quirks.o obj-$(CONFIG_APM) += apm_32.o obj-$(CONFIG_X86_SMP) += smp_32.o smpboot_32.o tsc_sync.o obj-$(CONFIG_SMP) += smpcommon_32.o diff --git a/arch/x86/kernel/Makefile_64 b/arch/x86/kernel/Makefile_64 index 43da66213a47..dec06e769281 100644 --- a/arch/x86/kernel/Makefile_64 +++ b/arch/x86/kernel/Makefile_64 @@ -2,7 +2,7 @@ # Makefile for the linux kernel. # -extra-y := head_64.o head64.o init_task_64.o vmlinux.lds +extra-y := head_64.o head64.o init_task.o vmlinux.lds EXTRA_AFLAGS := -traditional obj-y := process_64.o signal_64.o entry_64.o traps_64.o irq_64.o \ ptrace_64.o time_64.o ioport_64.o ldt_64.o setup_64.o i8259_64.o sys_x86_64.o \ @@ -39,7 +39,7 @@ obj-$(CONFIG_K8_NB) += k8.o obj-$(CONFIG_AUDIT) += audit_64.o obj-$(CONFIG_MODULES) += module_64.o -obj-$(CONFIG_PCI) += early-quirks_64.o +obj-$(CONFIG_PCI) += early-quirks.o obj-y += topology.o obj-y += intel_cacheinfo.o diff --git a/arch/x86/kernel/acpi/Makefile_32 b/arch/x86/kernel/acpi/Makefile_32 index a4852a2e9190..045dd54b33e0 100644 --- a/arch/x86/kernel/acpi/Makefile_32 +++ b/arch/x86/kernel/acpi/Makefile_32 @@ -1,7 +1,4 @@ obj-$(CONFIG_ACPI) += boot.o -ifneq ($(CONFIG_PCI),) -obj-$(CONFIG_X86_IO_APIC) += earlyquirk_32.o -endif obj-$(CONFIG_ACPI_SLEEP) += sleep_32.o wakeup_32.o ifneq ($(CONFIG_ACPI_PROCESSOR),) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index afd2afe9102d..289247d974c6 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -99,7 +99,7 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; /* * The default interrupt routing model is PIC (8259). This gets - * overriden if IOAPICs are enumerated (below). + * overridden if IOAPICs are enumerated (below). */ enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC; @@ -414,8 +414,8 @@ acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end * * Port 0x4d0-4d1 are ECLR1 and ECLR2, the Edge/Level Control Registers * for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge. 
- * ECLR1 is IRQ's 0-7 (IRQ 0, 1, 2 must be 0) - * ECLR2 is IRQ's 8-15 (IRQ 8, 13 must be 0) + * ECLR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0) + * ECLR2 is IRQs 8-15 (IRQ 8, 13 must be 0) */ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) @@ -427,7 +427,7 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) old = inb(0x4d0) | (inb(0x4d1) << 8); /* - * If we use ACPI to set PCI irq's, then we should clear ELCR + * If we use ACPI to set PCI IRQs, then we should clear ELCR * since we will set it correctly as we enable the PCI irq * routing. */ @@ -555,7 +555,7 @@ EXPORT_SYMBOL(acpi_map_lsapic); int acpi_unmap_lsapic(int cpu) { - x86_cpu_to_apicid[cpu] = -1; + per_cpu(x86_cpu_to_apicid, cpu) = -1; cpu_clear(cpu, cpu_present_map); num_processors--; diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 2d39f55d29a8..10b67170b133 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -29,7 +29,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, unsigned int cpu) { - struct cpuinfo_x86 *c = cpu_data + cpu; + struct cpuinfo_x86 *c = &cpu_data(cpu); flags->bm_check = 0; if (num_online_cpus() == 1) @@ -72,7 +72,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, struct acpi_processor_cx *cx, struct acpi_power_register *reg) { struct cstate_entry *percpu_entry; - struct cpuinfo_x86 *c = cpu_data + cpu; + struct cpuinfo_x86 *c = &cpu_data(cpu); cpumask_t saved_mask; int retval; diff --git a/arch/x86/kernel/acpi/earlyquirk_32.c b/arch/x86/kernel/acpi/earlyquirk_32.c deleted file mode 100644 index 23f78efc577d..000000000000 --- a/arch/x86/kernel/acpi/earlyquirk_32.c +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Do early PCI probing for bug detection when the main PCI subsystem is - * not up yet. - */ -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/pci.h> -#include <linux/acpi.h> - -#include <asm/pci-direct.h> -#include <asm/acpi.h> -#include <asm/apic.h> - -#ifdef CONFIG_ACPI - -static int __init nvidia_hpet_check(struct acpi_table_header *header) -{ - return 0; -} -#endif - -static int __init check_bridge(int vendor, int device) -{ -#ifdef CONFIG_ACPI - static int warned; - /* According to Nvidia all timer overrides are bogus unless HPET - is enabled. */ - if (!acpi_use_timer_override && vendor == PCI_VENDOR_ID_NVIDIA) { - if (!warned && acpi_table_parse(ACPI_SIG_HPET, - nvidia_hpet_check)) { - warned = 1; - acpi_skip_timer_override = 1; - printk(KERN_INFO "Nvidia board " - "detected. Ignoring ACPI " - "timer override.\n"); - printk(KERN_INFO "If you got timer trouble " - "try acpi_use_timer_override\n"); - - } - } -#endif - if (vendor == PCI_VENDOR_ID_ATI && timer_over_8254 == 1) { - timer_over_8254 = 0; - printk(KERN_INFO "ATI board detected. Disabling timer routing " - "over 8254.\n"); - } - return 0; -} - -void __init check_acpi_pci(void) -{ - int num, slot, func; - - /* Assume the machine supports type 1. If not it will - always read ffffffff and should not have any side effect. - Actually a few buggy systems can machine check. 
Allow the user - to disable it by command line option at least -AK */ - if (!early_pci_allowed()) - return; - - /* Poor man's PCI discovery */ - for (num = 0; num < 32; num++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - u32 class; - u32 vendor; - class = read_pci_config(num, slot, func, - PCI_CLASS_REVISION); - if (class == 0xffffffff) - break; - - if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) - continue; - - vendor = read_pci_config(num, slot, func, - PCI_VENDOR_ID); - - if (check_bridge(vendor & 0xffff, vendor >> 16)) - return; - } - - } - } -} diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index b54fded49834..2ed0a4ce62f0 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -63,7 +63,7 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) void arch_acpi_processor_init_pdc(struct acpi_processor *pr) { unsigned int cpu = pr->id; - struct cpuinfo_x86 *c = cpu_data + cpu; + struct cpuinfo_x86 *c = &cpu_data(cpu); pr->pdc = NULL; if (c->x86_vendor == X86_VENDOR_INTEL) diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S index f22ba8534d26..a97313b1270e 100644 --- a/arch/x86/kernel/acpi/wakeup_32.S +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -11,7 +11,7 @@ # # If physical address of wakeup_code is 0x12345, BIOS should call us with # cs = 0x1234, eip = 0x05 -# +# #define BEEP \ inb $97, %al; \ @@ -52,7 +52,6 @@ wakeup_code: BEEP 1: mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board - movw $0x0e00 + 'S', %fs:(0x12) pushl $0 # Kill any dangerous flags popfl @@ -90,9 +89,6 @@ wakeup_code: # make sure %cr4 is set correctly (features, etc) movl real_save_cr4 - wakeup_code, %eax movl %eax, %cr4 - movw $0xb800, %ax - movw %ax,%fs - movw $0x0e00 + 'i', %fs:(0x12) # need a gdt -- use lgdtl to force 32-bit operands, in case # the GDT is located past 16 megabytes. @@ -102,8 +98,6 @@ wakeup_code: movl %eax, %cr0 jmp 1f 1: - movw $0x0e00 + 'n', %fs:(0x14) - movl real_magic - wakeup_code, %eax cmpl $0x12345678, %eax jne bogus_real_magic @@ -122,13 +116,11 @@ real_save_cr4: .long 0 real_magic: .long 0 video_mode: .long 0 realmode_flags: .long 0 -beep_flags: .long 0 real_efer_save_restore: .long 0 real_save_efer_edx: .long 0 real_save_efer_eax: .long 0 bogus_real_magic: - movw $0x0e00 + 'B', %fs:(0x12) jmp bogus_real_magic /* This code uses an extended set of video mode numbers. These include: @@ -194,7 +186,6 @@ wakeup_pmode_return: movw %ax, %es movw %ax, %fs movw %ax, %gs - movw $0x0e00 + 'u', 0xb8016 # reload the gdt, as we need the full 32 bit address lgdt saved_gdt @@ -218,7 +209,6 @@ wakeup_pmode_return: jmp *%eax bogus_magic: - movw $0x0e00 + 'B', 0xb8018 jmp bogus_magic diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 8b4357e1efe0..55608ec2ed72 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -41,7 +41,6 @@ wakeup_code: # Running in *copy* of this code, somewhere in low 1MB. 
- movb $0xa1, %al ; outb %al, $0x80 cli cld # setup data segment @@ -65,11 +64,6 @@ wakeup_code: cmpl $0x12345678, %eax jne bogus_real_magic - call verify_cpu # Verify the cpu supports long - # mode - testl %eax, %eax - jnz no_longmode - testl $1, realmode_flags - wakeup_code jz 1f lcall $0xc000,$3 @@ -84,12 +78,6 @@ wakeup_code: call mode_set 1: - movw $0xb800, %ax - movw %ax,%fs - movw $0x0e00 + 'L', %fs:(0x10) - - movb $0xa2, %al ; outb %al, $0x80 - mov %ds, %ax # Find 32bit wakeup_code addr movzx %ax, %esi # (Convert %ds:gdt to a liner ptr) shll $4, %esi @@ -117,14 +105,10 @@ wakeup_32_vector: .code32 wakeup_32: # Running in this code, but at low address; paging is not yet turned on. - movb $0xa5, %al ; outb %al, $0x80 movl $__KERNEL_DS, %eax movl %eax, %ds - movw $0x0e00 + 'i', %ds:(0xb8012) - movb $0xa8, %al ; outb %al, $0x80; - /* * Prepare for entering 64bits mode */ @@ -200,16 +184,11 @@ wakeup_long64: */ lgdt cpu_gdt_descr - movw $0x0e00 + 'n', %ds:(0xb8014) - movb $0xa9, %al ; outb %al, $0x80 - movq saved_magic, %rax movq $0x123456789abcdef0, %rdx cmpq %rdx, %rax jne bogus_64_magic - movw $0x0e00 + 'u', %ds:(0xb8016) - nop nop movw $__KERNEL_DS, %ax @@ -220,13 +199,11 @@ wakeup_long64: movw %ax, %gs movq saved_rsp, %rsp - movw $0x0e00 + 'x', %ds:(0xb8018) movq saved_rbx, %rbx movq saved_rdi, %rdi movq saved_rsi, %rsi movq saved_rbp, %rbp - movw $0x0e00 + '!', %ds:(0xb801a) movq saved_rip, %rax jmp *%rax @@ -256,21 +233,12 @@ realmode_flags: .quad 0 .code16 bogus_real_magic: - movb $0xba,%al ; outb %al,$0x80 jmp bogus_real_magic .code64 bogus_64_magic: - movb $0xb3,%al ; outb %al,$0x80 jmp bogus_64_magic -.code16 -no_longmode: - movb $0xbc,%al ; outb %al,$0x80 - jmp no_longmode - -#include "../verify_cpu_64.S" - /* This code uses an extended set of video mode numbers. These include: * Aliases for standard modes * NORMAL_VGA (-1) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 3bd2688bd443..d6405e0842b5 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -357,14 +357,14 @@ void alternatives_smp_switch(int smp) if (smp) { printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); - clear_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); + clear_bit(X86_FEATURE_UP, cpu_data(0).x86_capability); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_lock(mod->locks, mod->locks_end, mod->text, mod->text_end); } else { printk(KERN_INFO "SMP alternatives: switching to UP code\n"); set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); - set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); + set_bit(X86_FEATURE_UP, cpu_data(0).x86_capability); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_unlock(mod->locks, mod->locks_end, mod->text, mod->text_end); @@ -432,7 +432,7 @@ void __init alternative_instructions(void) if (1 == num_possible_cpus()) { printk(KERN_INFO "SMP alternatives: switching to UP code\n"); set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); - set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); + set_bit(X86_FEATURE_UP, cpu_data(0).x86_capability); alternatives_smp_unlock(__smp_locks, __smp_locks_end, _text, _etext); } diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 793341fffc81..08b07c176962 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -947,7 +947,7 @@ void __devinit setup_local_APIC(void) * Set up LVT0, LVT1: * * set up through-local-APIC on the BP's LINT0. 
This is not - * strictly necessery in pure symmetric-IO mode, but sometimes + * strictly necessary in pure symmetric-IO mode, but sometimes * we delegate interrupts to the 8259A. */ /* @@ -998,7 +998,7 @@ void __devinit setup_local_APIC(void) } else { if (esr_disable) /* - * Something untraceble is creating bad interrupts on + * Something untraceable is creating bad interrupts on * secondary quads ... for the moment, just leave the * ESR disabled - we can't do anything useful with the * errors anyway - mbligh diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 32f2365c26ed..17089a041028 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -57,7 +57,7 @@ * screen-blanking and gpm (Stephen Rothwell); Linux 1.99.4 * 1.2a:Simple change to stop mysterious bug reports with SMP also added * levels to the printk calls. APM is not defined for SMP machines. - * The new replacment for it is, but Linux doesn't yet support this. + * The new replacement for it is, but Linux doesn't yet support this. * Alan Cox Linux 2.1.55 * 1.3: Set up a valid data descriptor 0x40 for buggy BIOS's * 1.4: Upgraded to support APM 1.2. Integrated ThinkPad suspend patch by diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index f1b7cdda82b3..0e45981b2dd7 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -15,6 +15,7 @@ #include <asm/fixmap.h> #include <asm/processor.h> #include <asm/thread_info.h> +#include <asm/bootparam.h> #include <asm/elf.h> #include <xen/interface/xen.h> @@ -135,6 +136,7 @@ void foo(void) #ifdef CONFIG_LGUEST_GUEST BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); + OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3); @@ -146,4 +148,10 @@ void foo(void) OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); #endif + + BLANK(); + OFFSET(BP_scratch, boot_params, scratch); + OFFSET(BP_loadflags, boot_params, hdr.loadflags); + OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); + OFFSET(BP_version, boot_params, hdr.version); } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 5f8af875f457..1ff88c7f45cf 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -266,7 +266,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_HT /* * On a AMD multi core setup the lower bits of the APIC id - * distingush the cores. + * distinguish the cores. */ if (c->x86_max_cores > 1) { int cpu = smp_processor_id(); diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 473eac883c7b..9681fa15ddf0 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -53,7 +53,7 @@ static u32 __cpuinit ramtop(void) /* 16388 */ continue; /* * Don't MCR over reserved space. Ignore the ISA hole - * we frob around that catastrophy already + * we frob around that catastrophe already */ if (e820.map[i].type == E820_RESERVED) @@ -287,7 +287,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c) c->x86_capability[5] = cpuid_edx(0xC0000001); } - /* Cyrix III family needs CX8 & PGE explicity enabled. */ + /* Cyrix III family needs CX8 & PGE explicitly enabled. 
*/ if (c->x86_model >=6 && c->x86_model <= 9) { rdmsr (MSR_VIA_FCR, lo, hi); lo |= (1<<1 | 1<<7); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d506201d397c..e2fcf2051bdb 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -207,7 +207,7 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) static int __init x86_fxsr_setup(char * s) { - /* Tell all the other CPU's to not use it... */ + /* Tell all the other CPUs to not use it... */ disable_x86_fxsr = 1; /* diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig_32 index d8c6f132dc7a..d8c6f132dc7a 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig_32 diff --git a/arch/x86/kernel/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig_64 index a3fd51926cbd..9c9699fdcf52 100644 --- a/arch/x86/kernel/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig_64 @@ -19,7 +19,7 @@ config X86_POWERNOW_K8 To compile this driver as a module, choose M here: the module will be called powernow-k8. - For details, take a look at <file:Documentation/cpu-freq/>. + For details, take a look at <file:Documentation/cpu-freq/>. If in doubt, say N. diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 2ca43ba32bc0..fea0af0476b9 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -77,7 +77,7 @@ static unsigned int acpi_pstate_strict; static int check_est_cpu(unsigned int cpuid) { - struct cpuinfo_x86 *cpu = &cpu_data[cpuid]; + struct cpuinfo_x86 *cpu = &cpu_data(cpuid); if (cpu->x86_vendor != X86_VENDOR_INTEL || !cpu_has(cpu, X86_FEATURE_EST)) @@ -560,7 +560,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) unsigned int cpu = policy->cpu; struct acpi_cpufreq_data *data; unsigned int result = 0; - struct cpuinfo_x86 *c = &cpu_data[policy->cpu]; + struct cpuinfo_x86 *c = &cpu_data(policy->cpu); struct acpi_processor_performance *perf; dprintk("acpi_cpufreq_cpu_init\n"); diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c index 32f0bda3fc95..f03e9153618e 100644 --- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c +++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c @@ -260,7 +260,7 @@ static int nforce2_target(struct cpufreq_policy *policy, freqs.old = nforce2_get(policy->cpu); freqs.new = target_fsb * fid * 100; - freqs.cpu = 0; /* Only one CPU on nForce2 plattforms */ + freqs.cpu = 0; /* Only one CPU on nForce2 platforms */ if (freqs.old == freqs.new) return 0; diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c index c11baaf9f2b4..326a4c81f684 100644 --- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c +++ b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c @@ -305,7 +305,7 @@ static struct cpufreq_driver eps_driver = { static int __init eps_init(void) { - struct cpuinfo_x86 *c = cpu_data; + struct cpuinfo_x86 *c = &cpu_data(0); /* This driver will work only on Centaur C7 processors with * Enhanced SpeedStep/PowerSaver registers */ diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c index 1e7ae7dafcf6..94619c22f563 100644 --- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c +++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c @@ -199,7 +199,7 @@ static int elanfreq_target (struct cpufreq_policy *policy, static int elanfreq_cpu_init(struct cpufreq_policy *policy) { - struct 
cpuinfo_x86 *c = cpu_data; + struct cpuinfo_x86 *c = &cpu_data(0); unsigned int i; int result; @@ -280,7 +280,7 @@ static struct cpufreq_driver elanfreq_driver = { static int __init elanfreq_init(void) { - struct cpuinfo_x86 *c = cpu_data; + struct cpuinfo_x86 *c = &cpu_data(0); /* Test if we have the right hardware */ if ((c->x86_vendor != X86_VENDOR_AMD) || diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c index ed2bda127c44..2ed7db2fd257 100644 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c @@ -12,12 +12,12 @@ * of any nature resulting due to the use of this software. This * software is provided AS-IS with no warranties. * - * Theoritical note: + * Theoretical note: * * (see Geode(tm) CS5530 manual (rev.4.1) page.56) * * CPU frequency control on NatSemi Geode GX1/GXLV processor and CS55x0 - * are based on Suspend Moduration. + * are based on Suspend Modulation. * * Suspend Modulation works by asserting and de-asserting the SUSP# pin * to CPU(GX1/GXLV) for configurable durations. When asserting SUSP# @@ -101,11 +101,11 @@ /* SUSCFG bits */ #define SUSMOD (1<<0) /* enable/disable suspend modulation */ -/* the belows support only with cs5530 (after rev.1.2)/cs5530A */ +/* the below is supported only with cs5530 (after rev.1.2)/cs5530A */ #define SMISPDUP (1<<1) /* select how SMI re-enable suspend modulation: */ /* IRQTC timer or read SMI speedup disable reg.(F1BAR[08-09h]) */ #define SUSCFG (1<<2) /* enable powering down a GXLV processor. "Special 3Volt Suspend" mode */ -/* the belows support only with cs5530A */ +/* the below is supported only with cs5530A */ #define PWRSVE_ISA (1<<3) /* stop ISA clock */ #define PWRSVE (1<<4) /* active idle */ diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index 5045f5d583c8..749d00cb2ebd 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -780,7 +780,7 @@ static int longhaul_setup_southbridge(void) static int __init longhaul_cpu_init(struct cpufreq_policy *policy) { - struct cpuinfo_x86 *c = cpu_data; + struct cpuinfo_x86 *c = &cpu_data(0); char *cpuname=NULL; int ret; u32 lo, hi; @@ -959,7 +959,7 @@ static struct cpufreq_driver longhaul_driver = { static int __init longhaul_init(void) { - struct cpuinfo_x86 *c = cpu_data; + struct cpuinfo_x86 *c = &cpu_data(0); if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6) return -ENODEV; diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index b2689514295a..af4a867a097c 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c @@ -172,7 +172,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, u32 save_lo, save_hi; u32 eax, ebx, ecx, edx; u32 try_hi; - struct cpuinfo_x86 *c = cpu_data; + struct cpuinfo_x86 *c = &cpu_data(0); if (!low_freq || !high_freq) return -EINVAL; @@ -298,7 +298,7 @@ static struct cpufreq_driver longrun_driver = { */ static int __init longrun_init(void) { - struct cpuinfo_x86 *c = cpu_data; + struct cpuinfo_x86 *c = &cpu_data(0); if (c->x86_vendor != X86_VENDOR_TRANSMETA || !cpu_has(c, X86_FEATURE_LONGRUN)) diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 793eae854f4f..14791ec55cfd 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -195,7 +195,7 @@ static unsigned int 
cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy) { - struct cpuinfo_x86 *c = &cpu_data[policy->cpu]; + struct cpuinfo_x86 *c = &cpu_data(policy->cpu); int cpuid = 0; unsigned int i; @@ -279,7 +279,7 @@ static struct cpufreq_driver p4clockmod_driver = { static int __init cpufreq_p4_init(void) { - struct cpuinfo_x86 *c = cpu_data; + struct cpuinfo_x86 *c = &cpu_data(0); int ret; /* diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c index 6d0285339317..eb9b62b0830c 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c @@ -1,6 +1,6 @@ /* * This file was based upon code in Powertweak Linux (http://powertweak.sf.net) - * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä, Dominik Brodowski. + * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä, Dominik Brodowski. * * Licensed under the terms of the GNU GPL License version 2. * @@ -215,7 +215,7 @@ static struct cpufreq_driver powernow_k6_driver = { */ static int __init powernow_k6_init(void) { - struct cpuinfo_x86 *c = cpu_data; + struct cpuinfo_x86 *c = &cpu_data(0); if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 5) || ((c->x86_model != 12) && (c->x86_model != 13))) diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index f3686a5f2308..b5a9863d6cdc 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -114,7 +114,7 @@ static int check_fsb(unsigned int fsbspeed) static int check_powernow(void) { - struct cpuinfo_x86 *c = cpu_data; + struct cpuinfo_x86 *c = &cpu_data(0); unsigned int maxei, eax, ebx, ecx, edx; if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 !=6)) { diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index c06ac680c9ca..9c36a53676b7 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -168,7 +168,7 @@ static void count_off_irt(struct powernow_k8_data *data) return; } -/* the voltage stabalization time */ +/* the voltage stabilization time */ static void count_off_vst(struct powernow_k8_data *data) { udelay(data->vstable * VST_UNITS_20US); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index b06c812208ca..7c4f6e0faed4 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h @@ -148,10 +148,10 @@ struct powernow_k8_data { #define PLL_LOCK_CONVERSION (1000/5) /* ms to ns, then divide by clock period */ #define MAXIMUM_VID_STEPS 1 /* Current cpus only allow a single step of 25mV */ -#define VST_UNITS_20US 20 /* Voltage Stabalization Time is in units of 20us */ +#define VST_UNITS_20US 20 /* Voltage Stabilization Time is in units of 20us */ /* - * Most values of interest are enocoded in a single field of the _PSS + * Most values of interest are encoded in a single field of the _PSS * entries: the "control" value. 
*/ diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c index d9f3e90a7ae0..42da9bd677d6 100644 --- a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c +++ b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c @@ -102,7 +102,7 @@ static int sc520_freq_target (struct cpufreq_policy *policy, static int sc520_freq_cpu_init(struct cpufreq_policy *policy) { - struct cpuinfo_x86 *c = cpu_data; + struct cpuinfo_x86 *c = &cpu_data(0); int result; /* capability check */ @@ -151,7 +151,7 @@ static struct cpufreq_driver sc520_freq_driver = { static int __init sc520_freq_init(void) { - struct cpuinfo_x86 *c = cpu_data; + struct cpuinfo_x86 *c = &cpu_data(0); int err; /* Test if we have the right hardware */ diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 811d47438546..3031f1196192 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -230,7 +230,7 @@ static struct cpu_model models[] = static int centrino_cpu_init_table(struct cpufreq_policy *policy) { - struct cpuinfo_x86 *cpu = &cpu_data[policy->cpu]; + struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu); struct cpu_model *model; for(model = models; model->cpu_id != NULL; model++) @@ -340,7 +340,7 @@ static unsigned int get_cur_freq(unsigned int cpu) static int centrino_cpu_init(struct cpufreq_policy *policy) { - struct cpuinfo_x86 *cpu = &cpu_data[policy->cpu]; + struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu); unsigned freq; unsigned l, h; int ret; @@ -612,7 +612,7 @@ static struct cpufreq_driver centrino_driver = { */ static int __init centrino_init(void) { - struct cpuinfo_x86 *cpu = cpu_data; + struct cpuinfo_x86 *cpu = &cpu_data(0); if (!cpu_has(cpu, X86_FEATURE_EST)) return -ENODEV; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c index b1acc8ce3167..76c3ab0da468 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c @@ -228,7 +228,7 @@ EXPORT_SYMBOL_GPL(speedstep_get_processor_frequency); unsigned int speedstep_detect_processor (void) { - struct cpuinfo_x86 *c = cpu_data; + struct cpuinfo_x86 *c = &cpu_data(0); u32 ebx, msr_lo, msr_hi; dprintk("x86: %x, model: %x\n", c->x86, c->x86_model); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 122d2d75aa9f..88d66fb8411d 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -93,7 +93,7 @@ static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c) local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); - setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ ccr5 = getCx86(CX86_CCR5); if (ccr5 & 2) setCx86(CX86_CCR5, ccr5 & 0xfd); /* reset SLOP */ @@ -115,9 +115,9 @@ static void __cpuinit set_cx86_reorder(void) printk(KERN_INFO "Enable Memory access reorder on Cyrix/NSC processor.\n"); ccr3 = getCx86(CX86_CCR3); - setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - /* Load/Store Serialize to mem access disable (=reorder it) */ + /* Load/Store Serialize to mem access disable (=reorder it) */ setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80); /* set load/store serialize from 1GB to 4GB */ ccr3 |= 0xe0; @@ -146,7 +146,7 @@ static void __cpuinit set_cx86_inc(void) printk(KERN_INFO "Enable Incrementor on Cyrix/NSC processor.\n"); ccr3 = getCx86(CX86_CCR3); - 
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ /* PCR1 -- Performance Control */ /* Incrementor on, whatever that is */ setCx86(CX86_PCR1, getCx86(CX86_PCR1) | 0x02); @@ -256,7 +256,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) u32 vendor, device; /* It isn't really a PCI quirk directly, but the cure is the same. The MediaGX has deep magic SMM stuff that handles the - SB emulation. It thows away the fifo on disable_dma() which + SB emulation. It throws away the fifo on disable_dma() which is wrong and ruins the audio. Bug2: VSA1 has a wrap bug so that using maximum sized DMA diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 1826395ebeeb..9921b01fe199 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -295,7 +295,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; #ifdef CONFIG_X86_HT - unsigned int cpu = (c == &boot_cpu_data) ? 0 : (c - cpu_data); + unsigned int cpu = c->cpu_index; #endif if (c->cpuid_level > 3) { @@ -417,14 +417,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) if (new_l2) { l2 = new_l2; #ifdef CONFIG_X86_HT - cpu_llc_id[cpu] = l2_id; + per_cpu(cpu_llc_id, cpu) = l2_id; #endif } if (new_l3) { l3 = new_l3; #ifdef CONFIG_X86_HT - cpu_llc_id[cpu] = l3_id; + per_cpu(cpu_llc_id, cpu) = l3_id; #endif } @@ -459,7 +459,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) struct _cpuid4_info *this_leaf, *sibling_leaf; unsigned long num_threads_sharing; int index_msb, i; - struct cpuinfo_x86 *c = cpu_data; + struct cpuinfo_x86 *c = &cpu_data(cpu); this_leaf = CPUID4_INFO_IDX(cpu, index); num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; @@ -470,8 +470,8 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) index_msb = get_count_order(num_threads_sharing); for_each_online_cpu(i) { - if (c[i].apicid >> index_msb == - c[cpu].apicid >> index_msb) { + if (cpu_data(i).apicid >> index_msb == + c->apicid >> index_msb) { cpu_set(i, this_leaf->shared_cpu_map); if (i != cpu && cpuid4_info[i]) { sibling_leaf = CPUID4_INFO_IDX(i, index); @@ -499,6 +499,11 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) { static void free_cache_attributes(unsigned int cpu) { + int i; + + for (i = 0; i < num_cache_leaves; i++) + cache_remove_shared_cpu_map(cpu, i); + kfree(cpuid4_info[cpu]); cpuid4_info[cpu] = NULL; } @@ -506,8 +511,8 @@ static void free_cache_attributes(unsigned int cpu) static int __cpuinit detect_cache_attributes(unsigned int cpu) { struct _cpuid4_info *this_leaf; - unsigned long j; - int retval; + unsigned long j; + int retval; cpumask_t oldmask; if (num_cache_leaves == 0) @@ -524,19 +529,26 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) goto out; /* Do cpuid and store the results */ - retval = 0; for (j = 0; j < num_cache_leaves; j++) { this_leaf = CPUID4_INFO_IDX(cpu, j); retval = cpuid4_cache_lookup(j, this_leaf); - if (unlikely(retval < 0)) + if (unlikely(retval < 0)) { + int i; + + for (i = 0; i < j; i++) + cache_remove_shared_cpu_map(cpu, i); break; + } cache_shared_cpu_map_setup(cpu, j); } set_cpus_allowed(current, oldmask); out: - if (retval) - free_cache_attributes(cpu); + if (retval) { + 
kfree(cpuid4_info[cpu]); + cpuid4_info[cpu] = NULL; + } + return retval; } @@ -669,7 +681,7 @@ static struct kobj_type ktype_percpu_entry = { .sysfs_ops = &sysfs_ops, }; -static void cpuid4_cache_sysfs_exit(unsigned int cpu) +static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu) { kfree(cache_kobject[cpu]); kfree(index_kobject[cpu]); @@ -680,13 +692,14 @@ static void cpuid4_cache_sysfs_exit(unsigned int cpu) static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu) { + int err; if (num_cache_leaves == 0) return -ENOENT; - detect_cache_attributes(cpu); - if (cpuid4_info[cpu] == NULL) - return -ENOENT; + err = detect_cache_attributes(cpu); + if (err) + return err; /* Allocate all required memory */ cache_kobject[cpu] = kzalloc(sizeof(struct kobject), GFP_KERNEL); @@ -705,13 +718,15 @@ err_out: return -ENOMEM; } +static cpumask_t cache_dev_map = CPU_MASK_NONE; + /* Add/Remove cache interface for CPU device */ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) { unsigned int cpu = sys_dev->id; unsigned long i, j; struct _index_kobject *this_object; - int retval = 0; + int retval; retval = cpuid4_cache_sysfs_init(cpu); if (unlikely(retval < 0)) @@ -721,6 +736,10 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) kobject_set_name(cache_kobject[cpu], "%s", "cache"); cache_kobject[cpu]->ktype = &ktype_percpu_entry; retval = kobject_register(cache_kobject[cpu]); + if (retval < 0) { + cpuid4_cache_sysfs_exit(cpu); + return retval; + } for (i = 0; i < num_cache_leaves; i++) { this_object = INDEX_KOBJECT_PTR(cpu,i); @@ -740,6 +759,9 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) break; } } + if (!retval) + cpu_set(cpu, cache_dev_map); + return retval; } @@ -750,13 +772,14 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) if (cpuid4_info[cpu] == NULL) return; - for (i = 0; i < num_cache_leaves; i++) { - cache_remove_shared_cpu_map(cpu, i); + if (!cpu_isset(cpu, cache_dev_map)) + return; + cpu_clear(cpu, cache_dev_map); + + for (i = 0; i < num_cache_leaves; i++) kobject_unregister(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); - } kobject_unregister(cache_kobject[cpu]); cpuid4_cache_sysfs_exit(cpu); - return; } static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb, @@ -781,7 +804,7 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb, static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = { - .notifier_call = cacheinfo_cpu_callback, + .notifier_call = cacheinfo_cpu_callback, }; static int __cpuinit cache_sysfs_init(void) @@ -791,14 +814,15 @@ static int __cpuinit cache_sysfs_init(void) if (num_cache_leaves == 0) return 0; - register_hotcpu_notifier(&cacheinfo_cpu_notifier); - for_each_online_cpu(i) { - struct sys_device *sys_dev = get_cpu_sysdev((unsigned int)i); + int err; + struct sys_device *sys_dev = get_cpu_sysdev(i); - cache_add_dev(sys_dev); + err = cache_add_dev(sys_dev); + if (err) + return err; } - + register_hotcpu_notifier(&cacheinfo_cpu_notifier); return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 494d320d909b..24885be5c48c 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -131,17 +131,19 @@ static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, { unsigned int cpu = (unsigned long)hcpu; struct sys_device *sys_dev; - int err; + int err = 0; sys_dev = get_cpu_sysdev(cpu); switch (action) { - case CPU_ONLINE: - case 
CPU_ONLINE_FROZEN: + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: mutex_lock(&therm_cpu_lock); err = thermal_throttle_add_dev(sys_dev); mutex_unlock(&therm_cpu_lock); WARN_ON(err); break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: mutex_lock(&therm_cpu_lock); @@ -149,7 +151,7 @@ static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, mutex_unlock(&therm_cpu_lock); break; } - return NOTIFY_OK; + return err ? NOTIFY_BAD : NOTIFY_OK; } static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata = diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 2287d4863a8a..9964be3de2b7 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -147,10 +147,10 @@ static void prepare_set(void) write_cr0(cr0); wbinvd(); - /* Cyrix ARRs - everything else were excluded at the top */ + /* Cyrix ARRs - everything else was excluded at the top */ ccr3 = getCx86(CX86_CCR3); - /* Cyrix ARRs - everything else were excluded at the top */ + /* Cyrix ARRs - everything else was excluded at the top */ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); } diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 56f64e34829f..992f08dfbb6c 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -182,7 +182,7 @@ static inline void k8_enable_fixed_iorrs(void) /** * Checks and updates an fixed-range MTRR if it differs from the value it - * should have. If K8 extenstions are wanted, update the K8 SYSCFG MSR also. + * should have. If K8 extentions are wanted, update the K8 SYSCFG MSR also. * see AMD publication no. 24593, chapter 7.8.1, page 233 for more information * \param msr MSR address of the MTTR which should be checked and updated * \param changed pointer which indicates whether the MTRR needed to be changed diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 5e4be30ff903..9abbdf7562c5 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -748,7 +748,7 @@ static int __init mtrr_init_finialize(void) if (use_intel()) mtrr_state_warn(); else { - /* The CPUs haven't MTRR and seemes not support SMP. They have + /* The CPUs haven't MTRR and seem to not support SMP. They have * specific drivers, we use a tricky method to support * suspend/resume for them. * TBD: is there any system with such CPU which supports diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 54cdbf1a40f1..c02541e6e653 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -120,7 +120,9 @@ int reserve_perfctr_nmi(unsigned int msr) unsigned int counter; counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); + /* register not managed by the allocator? */ + if (counter > NMI_MAX_COUNTER_BITS) + return 1; if (!test_and_set_bit(counter, perfctr_nmi_owner)) return 1; @@ -132,7 +134,9 @@ void release_perfctr_nmi(unsigned int msr) unsigned int counter; counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); + /* register not managed by the allocator? */ + if (counter > NMI_MAX_COUNTER_BITS) + return; clear_bit(counter, perfctr_nmi_owner); } @@ -142,7 +146,9 @@ int reserve_evntsel_nmi(unsigned int msr) unsigned int counter; counter = nmi_evntsel_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); + /* register not managed by the allocator? 
*/ + if (counter > NMI_MAX_COUNTER_BITS) + return 1; if (!test_and_set_bit(counter, evntsel_nmi_owner)) return 1; @@ -154,7 +160,9 @@ void release_evntsel_nmi(unsigned int msr) unsigned int counter; counter = nmi_evntsel_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); + /* register not managed by the allocator? */ + if (counter > NMI_MAX_COUNTER_BITS) + return; clear_bit(counter, evntsel_nmi_owner); } diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 879a0f789b1e..2d42b414b777 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -85,12 +85,13 @@ static int show_cpuinfo(struct seq_file *m, void *v) /* nothing */ }; struct cpuinfo_x86 *c = v; - int i, n = c - cpu_data; + int i, n = 0; int fpu_exception; #ifdef CONFIG_SMP if (!cpu_online(n)) return 0; + n = c->cpu_index; #endif seq_printf(m, "processor\t: %d\n" "vendor_id\t: %s\n" @@ -175,11 +176,15 @@ static int show_cpuinfo(struct seq_file *m, void *v) static void *c_start(struct seq_file *m, loff_t *pos) { - return *pos < NR_CPUS ? cpu_data + *pos : NULL; + if (*pos == 0) /* just in case, cpu 0 is not the first */ + *pos = first_cpu(cpu_possible_map); + if ((*pos) < NR_CPUS && cpu_possible(*pos)) + return &cpu_data(*pos); + return NULL; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) { - ++*pos; + *pos = next_cpu(*pos, cpu_possible_map); return c_start(m, pos); } static void c_stop(struct seq_file *m, void *v) diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 70dcf912d9fb..05c9936a16cc 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -114,7 +114,7 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, static int cpuid_open(struct inode *inode, struct file *file) { unsigned int cpu = iminor(file->f_path.dentry->d_inode); - struct cpuinfo_x86 *c = &(cpu_data)[cpu]; + struct cpuinfo_x86 *c = &cpu_data(cpu); if (cpu >= NR_CPUS || !cpu_online(cpu)) return -ENXIO; /* No such CPU */ @@ -134,15 +134,18 @@ static const struct file_operations cpuid_fops = { .open = cpuid_open, }; -static int __cpuinit cpuid_device_create(int i) +static __cpuinit int cpuid_device_create(int cpu) { - int err = 0; struct device *dev; - dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, i), "cpu%d",i); - if (IS_ERR(dev)) - err = PTR_ERR(dev); - return err; + dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), + "cpu%d", cpu); + return IS_ERR(dev) ? PTR_ERR(dev) : 0; +} + +static void cpuid_device_destroy(int cpu) +{ + device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); } static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb, @@ -150,18 +153,21 @@ static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; + int err = 0; switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - cpuid_device_create(cpu); + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + err = cpuid_device_create(cpu); break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); + cpuid_device_destroy(cpu); break; } - return NOTIFY_OK; + return err ? 
NOTIFY_BAD : NOTIFY_OK; } static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier = @@ -198,7 +204,7 @@ static int __init cpuid_init(void) out_class: i = 0; for_each_online_cpu(i) { - device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, i)); + cpuid_device_destroy(i); } class_destroy(cpuid_class); out_chrdev: @@ -212,7 +218,7 @@ static void __exit cpuid_exit(void) int cpu = 0; for_each_online_cpu(cpu) - device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); + cpuid_device_destroy(cpu); class_destroy(cpuid_class); unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 32e75d0731a9..72d0c56c1b48 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -47,6 +47,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!kdump_buf_page) { printk(KERN_WARNING "Kdump: Kdump buffer page not" " allocated\n"); + kunmap_atomic(vaddr, KM_PTE0); return -EFAULT; } copy_page(kdump_buf_page, vaddr); diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index 3c86b979a40a..18f500d185a2 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -51,6 +51,13 @@ struct resource code_resource = { .flags = IORESOURCE_BUSY | IORESOURCE_MEM }; +struct resource bss_resource = { + .name = "Kernel bss", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + static struct resource system_rom_resource = { .name = "System ROM", .start = 0xf0000, @@ -254,7 +261,9 @@ static void __init probe_roms(void) * and also for regions reported as reserved by the e820. */ static void __init -legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource) +legacy_init_iomem_resources(struct resource *code_resource, + struct resource *data_resource, + struct resource *bss_resource) { int i; @@ -287,8 +296,10 @@ legacy_init_iomem_resources(struct resource *code_resource, struct resource *dat */ request_resource(res, code_resource); request_resource(res, data_resource); + request_resource(res, bss_resource); #ifdef CONFIG_KEXEC - request_resource(res, &crashk_res); + if (crashk_res.start != crashk_res.end) + request_resource(res, &crashk_res); #endif } } @@ -306,9 +317,11 @@ static int __init request_standard_resources(void) printk("Setting up standard PCI resources\n"); if (efi_enabled) - efi_initialize_iomem_resources(&code_resource, &data_resource); + efi_initialize_iomem_resources(&code_resource, + &data_resource, &bss_resource); else - legacy_init_iomem_resources(&code_resource, &data_resource); + legacy_init_iomem_resources(&code_resource, + &data_resource, &bss_resource); /* EFI systems may still have VGA */ request_resource(&iomem_resource, &video_ram_resource); @@ -705,7 +718,7 @@ void __init e820_register_memory(void) int i; /* - * Search for the bigest gap in the low 32 bits of the e820 + * Search for the biggest gap in the low 32 bits of the e820 * memory space. 
*/ last = 0x100000000ull; diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index e422b8159f69..04698e0b056c 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -47,7 +47,7 @@ unsigned long end_pfn_map; */ static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; -extern struct resource code_resource, data_resource; +extern struct resource code_resource, data_resource, bss_resource; /* Check for some hardcoded bad areas that early boot is not allowed to touch */ static inline int bad_addr(unsigned long *addrp, unsigned long size) @@ -225,8 +225,10 @@ void __init e820_reserve_resources(void) */ request_resource(res, &code_resource); request_resource(res, &data_resource); + request_resource(res, &bss_resource); #ifdef CONFIG_KEXEC - request_resource(res, &crashk_res); + if (crashk_res.start != crashk_res.end) + request_resource(res, &crashk_res); #endif } } @@ -728,3 +730,22 @@ __init void e820_setup_gap(void) printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", pci_mem_start, gapstart, gapsize); } + +int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) +{ + int i; + + if (slot < 0 || slot >= e820.nr_map) + return -1; + for (i = slot; i < e820.nr_map; i++) { + if (e820.map[i].type != E820_RAM) + continue; + break; + } + if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT)) + return -1; + *addr = e820.map[i].addr; + *size = min_t(u64, e820.map[i].size + e820.map[i].addr, + max_pfn << PAGE_SHIFT) - *addr; + return i + 1; +} diff --git a/arch/x86/kernel/early-quirks_64.c b/arch/x86/kernel/early-quirks.c index 13aa4fd728f3..dc34acbd54aa 100644 --- a/arch/x86/kernel/early-quirks_64.c +++ b/arch/x86/kernel/early-quirks.c @@ -13,9 +13,13 @@ #include <linux/acpi.h> #include <linux/pci_ids.h> #include <asm/pci-direct.h> -#include <asm/proto.h> -#include <asm/iommu.h> #include <asm/dma.h> +#include <asm/io_apic.h> +#include <asm/apic.h> + +#ifdef CONFIG_IOMMU +#include <asm/iommu.h> +#endif static void __init via_bugs(void) { @@ -23,7 +27,8 @@ static void __init via_bugs(void) if ((end_pfn > MAX_DMA32_PFN || force_iommu) && !iommu_aperture_allowed) { printk(KERN_INFO - "Looks like a VIA chipset. Disabling IOMMU. Override with iommu=allowed\n"); + "Looks like a VIA chipset. Disabling IOMMU." + " Override with iommu=allowed\n"); iommu_aperture_disabled = 1; } #endif @@ -40,6 +45,7 @@ static int __init nvidia_hpet_check(struct acpi_table_header *header) static void __init nvidia_bugs(void) { #ifdef CONFIG_ACPI +#ifdef CONFIG_X86_IO_APIC /* * All timer overrides on Nvidia are * wrong unless HPET is enabled. @@ -59,17 +65,20 @@ static void __init nvidia_bugs(void) "try acpi_use_timer_override\n"); } #endif +#endif /* RED-PEN skip them on mptables too? */ } static void __init ati_bugs(void) { +#ifdef CONFIG_X86_IO_APIC if (timer_over_8254 == 1) { timer_over_8254 = 0; printk(KERN_INFO - "ATI board detected. Disabling timer routing over 8254.\n"); + "ATI board detected. 
Disabling timer routing over 8254.\n"); } +#endif } struct chipset { @@ -104,7 +113,7 @@ void __init early_quirks(void) if (class == 0xffffffff) break; - if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) + if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) continue; vendor = read_pci_config(num, slot, func, diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c index b42558c48e9d..e2be78f49399 100644 --- a/arch/x86/kernel/efi_32.c +++ b/arch/x86/kernel/efi_32.c @@ -603,7 +603,8 @@ void __init efi_enter_virtual_mode(void) void __init efi_initialize_iomem_resources(struct resource *code_resource, - struct resource *data_resource) + struct resource *data_resource, + struct resource *bss_resource) { struct resource *res; efi_memory_desc_t *md; @@ -675,6 +676,7 @@ efi_initialize_iomem_resources(struct resource *code_resource, if (md->type == EFI_CONVENTIONAL_MEMORY) { request_resource(res, code_resource); request_resource(res, data_resource); + request_resource(res, bss_resource); #ifdef CONFIG_KEXEC request_resource(res, &crashk_res); #endif diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index 4ae03e3e8294..ce703e21c912 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -24,10 +24,19 @@ #include <acpi/acpi_bus.h> #endif -/* which logical CPU number maps to which CPU (physical APIC ID) */ -u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly +/* + * which logical CPU number maps to which CPU (physical APIC ID) + * + * The following static array is used during kernel startup + * and the x86_cpu_to_apicid_ptr contains the address of the + * array during this time. It is zeroed when the per_cpu + * data area is removed. + */ +u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata = { [0 ... NR_CPUS-1] = BAD_APICID }; -EXPORT_SYMBOL(x86_cpu_to_apicid); +void *x86_cpu_to_apicid_ptr; +DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID; +EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); struct genapic __read_mostly *genapic = &apic_flat; diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index 91c7526768ee..07352b74bda6 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -172,7 +172,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) */ cpu = first_cpu(cpumask); if ((unsigned)cpu < NR_CPUS) - return x86_cpu_to_apicid[cpu]; + return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; } diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index a7eee0a4751d..6b3469311e42 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -58,7 +58,7 @@ void __init x86_64_start_kernel(char * real_mode_data) for (i = 0; i < IDT_ENTRIES; i++) set_intr_gate(i, early_idt_handler); - asm volatile("lidt %0" :: "m" (idt_descr)); + load_idt((const struct desc_ptr *)&idt_descr); early_printk("Kernel alive\n"); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 39677965e161..00b1c2c56454 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -79,22 +79,30 @@ INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_ */ .section .text.head,"ax",@progbits ENTRY(startup_32) + /* check to see if KEEP_SEGMENTS flag is meaningful */ + cmpw $0x207, BP_version(%esi) + jb 1f + + /* test KEEP_SEGMENTS flag to see if the bootloader is asking + us to not reload segments */ + testb $(1<<6), BP_loadflags(%esi) + jnz 2f /* * Set segments to known values.
*/ - cld - lgdt boot_gdt_descr - __PAGE_OFFSET +1: lgdt boot_gdt_descr - __PAGE_OFFSET movl $(__BOOT_DS),%eax movl %eax,%ds movl %eax,%es movl %eax,%fs movl %eax,%gs +2: /* * Clear BSS first so that there are no surprises... - * No need to cld as DF is already clear from cld above... */ + cld xorl %eax,%eax movl $__bss_start - __PAGE_OFFSET,%edi movl $__bss_stop - __PAGE_OFFSET,%ecx @@ -128,6 +136,35 @@ ENTRY(startup_32) movsl 1: +#ifdef CONFIG_PARAVIRT + cmpw $0x207, (boot_params + BP_version - __PAGE_OFFSET) + jb default_entry + + /* Paravirt-compatible boot parameters. Look to see what architecture + we're booting under. */ + movl (boot_params + BP_hardware_subarch - __PAGE_OFFSET), %eax + cmpl $num_subarch_entries, %eax + jae bad_subarch + + movl subarch_entries - __PAGE_OFFSET(,%eax,4), %eax + subl $__PAGE_OFFSET, %eax + jmp *%eax + +bad_subarch: +WEAK(lguest_entry) +WEAK(xen_entry) + /* Unknown implementation; there's really + nothing we can do at this point. */ + ud2a +.data +subarch_entries: + .long default_entry /* normal x86/PC */ + .long lguest_entry /* lguest hypervisor */ + .long xen_entry /* Xen hypervisor */ +num_subarch_entries = (. - subarch_entries) / 4 +.previous +#endif /* CONFIG_PARAVIRT */ + /* * Initialize page tables. This creates a PDE and a set of page * tables, which are located immediately beyond _end. The variable @@ -140,6 +177,7 @@ ENTRY(startup_32) */ page_pde_offset = (__PAGE_OFFSET >> 20); +default_entry: movl $(pg0 - __PAGE_OFFSET), %edi movl $(swapper_pg_dir - __PAGE_OFFSET), %edx movl $0x007, %eax /* 0x007 = PRESENT+RW+USER */ diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index f8367074da0d..53303f2e5475 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -69,12 +69,15 @@ static inline void hpet_clear_mapping(void) * HPET command line enable / disable */ static int boot_hpet_disable; +int hpet_force_user; static int __init hpet_setup(char* str) { if (str) { if (!strncmp("disable", str, 7)) boot_hpet_disable = 1; + if (!strncmp("force", str, 5)) + hpet_force_user = 1; } return 1; } @@ -350,7 +353,7 @@ static int hpet_clocksource_register(void) * * hpet period is in femto seconds per cycle * so we need to convert this to ns/cyc units - * aproximated by mult/2^shift + * approximated by mult/2^shift * * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift * fsec/cyc * 1ns/1000000fsec * 2^shift = mult diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 5cc8841ca2c6..a42c80745325 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -86,7 +86,7 @@ static int pit_next_event(unsigned long delta, struct clock_event_device *evt) * On UP the PIT can serve all of the possible timer functions. On SMP systems * it can be solely used for the global tick. * - * The profiling and update capabilites are switched off once the local apic is + * The profiling and update capabilities are switched off once the local apic is * registered. This mechanism replaces the previous #ifdef LOCAL_APIC - * !using_apic_timer decisions in do_timer_interrupt_hook() */ diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c index d34a10cc13a7..f634fc715c99 100644 --- a/arch/x86/kernel/i8259_32.c +++ b/arch/x86/kernel/i8259_32.c @@ -403,7 +403,8 @@ void __init native_init_IRQ(void) int vector = FIRST_EXTERNAL_VECTOR + i; if (i >= NR_IRQS) break; - if (vector != SYSCALL_VECTOR) + /* SYSCALL_VECTOR was reserved in trap_init. 
*/ + if (!test_bit(vector, used_vectors)) set_intr_gate(vector, interrupt[i]); } diff --git a/arch/x86/kernel/init_task_32.c b/arch/x86/kernel/init_task.c index d26fc063a760..468c9c437842 100644 --- a/arch/x86/kernel/init_task_32.c +++ b/arch/x86/kernel/init_task.c @@ -15,7 +15,6 @@ static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); - EXPORT_SYMBOL(init_mm); /* @@ -25,7 +24,7 @@ EXPORT_SYMBOL(init_mm); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union +union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = { INIT_THREAD_INFO(init_task) }; @@ -35,12 +34,14 @@ union thread_union init_thread_union * All other task structs will be allocated on slabs in fork.c */ struct task_struct init_task = INIT_TASK(init_task); - EXPORT_SYMBOL(init_task); /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, - * no more per-task TSS's. - */ + * no more per-task TSS's. The TSS size is kept cacheline-aligned + * so they are allowed to end up in the .data.cacheline_aligned + * section. Since TSS's are completely CPU-local, we want them + * on exact cacheline boundaries, to eliminate cacheline ping-pong. + */ DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; diff --git a/arch/x86/kernel/init_task_64.c b/arch/x86/kernel/init_task_64.c deleted file mode 100644 index 4ff33d4f8551..000000000000 --- a/arch/x86/kernel/init_task_64.c +++ /dev/null @@ -1,54 +0,0 @@ -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/init_task.h> -#include <linux/fs.h> -#include <linux/mqueue.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/desc.h> - -static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; -static struct signal_struct init_signals = INIT_SIGNALS(init_signals); -static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - -/* - * Initial task structure. - * - * We need to make sure that this is 8192-byte aligned due to the - * way process stacks are handled. This is done by having a special - * "init_task" linker map entry.. - */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; - -/* - * Initial task structure. - * - * All other task structs will be allocated on slabs in fork.c - */ -struct task_struct init_task = INIT_TASK(init_task); - -EXPORT_SYMBOL(init_task); -/* - * per-CPU TSS segments. Threads are completely 'soft' on Linux, - * no more per-task TSS's. The TSS size is kept cacheline-aligned - * so they are allowed to end up in the .data.cacheline_aligned - * section. Since TSS's are completely CPU-local, we want them - * on exact cacheline boundaries, to eliminate cacheline ping-pong. - */ -DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; - -/* Copies of the original ist values from the tss are only accessed during - * debugging, no special alignment required. 
- */ -DEFINE_PER_CPU(struct orig_ist, orig_ist); - -#define ALIGN_TO_4K __attribute__((section(".data.init_task"))) diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 5f10c7189534..f35c6eb33da9 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -584,7 +584,7 @@ tryanotherirq: imbalance = move_this_load; - /* For physical_balance case, we accumlated both load + /* For physical_balance case, we accumulated both load * values in the one of the siblings cpu_irq[], * to use the same code for physical and logical processors * as much as possible. @@ -1198,7 +1198,7 @@ static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 } static int __assign_irq_vector(int irq) { static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; - int vector, offset, i; + int vector, offset; BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); @@ -1215,11 +1215,8 @@ next: } if (vector == current_vector) return -ENOSPC; - if (vector == SYSCALL_VECTOR) + if (test_and_set_bit(vector, used_vectors)) goto next; - for (i = 0; i < NR_IRQ_VECTORS; i++) - if (irq_vector[i] == vector) - goto next; current_vector = vector; current_offset = offset; @@ -2295,6 +2292,12 @@ static inline void __init check_timer(void) void __init setup_IO_APIC(void) { + int i; + + /* Reserve all the system vectors. */ + for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++) + set_bit(i, used_vectors); + enable_IO_APIC(); if (acpi_ioapic) @@ -2472,7 +2475,7 @@ void destroy_irq(unsigned int irq) } /* - * MSI mesage composition + * MSI message composition */ #ifdef CONFIG_PCI_MSI static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 1c2c7bf6a9d3..953328b55a30 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -31,6 +31,7 @@ #include <linux/sysdev.h> #include <linux/msi.h> #include <linux/htirq.h> +#include <linux/dmar.h> #ifdef CONFIG_ACPI #include <acpi/acpi_bus.h> #endif @@ -1770,7 +1771,7 @@ __setup("no_timer_check", notimercheck); /* * - * IRQ's that are handled by the PIC in the MPS IOAPIC case. + * IRQs that are handled by the PIC in the MPS IOAPIC case. * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. * Linux doesn't really care, as it's not actually used * for any interrupt handling anyway. 
@@ -1921,7 +1922,7 @@ void destroy_irq(unsigned int irq) } /* - * MSI mesage composition + * MSI message composition */ #ifdef CONFIG_PCI_MSI static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) @@ -2031,8 +2032,64 @@ void arch_teardown_msi_irq(unsigned int irq) destroy_irq(irq); } -#endif /* CONFIG_PCI_MSI */ +#ifdef CONFIG_DMAR +#ifdef CONFIG_SMP +static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg = irq_cfg + irq; + struct msi_msg msg; + unsigned int dest; + cpumask_t tmp; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + dmar_msi_read(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + dmar_msi_write(irq, &msg); + irq_desc[irq].affinity = mask; +} +#endif /* CONFIG_SMP */ + +struct irq_chip dmar_msi_type = { + .name = "DMAR_MSI", + .unmask = dmar_msi_unmask, + .mask = dmar_msi_mask, + .ack = ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = dmar_msi_set_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_dmar_msi(unsigned int irq) +{ + int ret; + struct msi_msg msg; + + ret = msi_compose_msg(NULL, irq, &msg); + if (ret < 0) + return ret; + dmar_msi_write(irq, &msg); + set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, + "edge"); + return 0; +} +#endif +#endif /* CONFIG_PCI_MSI */ /* * Hypertransport interrupt support */ diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 8459ca64bc2f..11b935f4f886 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -149,28 +149,6 @@ NORET_TYPE void machine_kexec(struct kimage *image) image->start, cpu_has_pae); } -/* crashkernel=size@addr specifies the location to reserve for - * a crash kernel. By reserving this memory we guarantee - * that linux never sets it up as a DMA target. - * Useful for holding code to do something appropriate - * after a kernel panic. - */ -static int __init parse_crashkernel(char *arg) -{ - unsigned long size, base; - size = memparse(arg, &arg); - if (*arg == '@') { - base = memparse(arg+1, &arg); - /* FIXME: Do I want a sanity check - * to validate the memory range? - */ - crashk_res.start = base; - crashk_res.end = base + size - 1; - } - return 0; -} -early_param("crashkernel", parse_crashkernel); - void arch_crash_save_vmcoreinfo(void) { #ifdef CONFIG_ARCH_DISCONTIGMEM_ENABLE diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 7450b69710b5..0d8577f05422 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -231,33 +231,6 @@ NORET_TYPE void machine_kexec(struct kimage *image) image->start); } -/* crashkernel=size@addr specifies the location to reserve for - * a crash kernel. By reserving this memory we guarantee - * that linux never set's it up as a DMA target. - * Useful for holding code to do something appropriate - * after a kernel panic. - */ -static int __init setup_crashkernel(char *arg) -{ - unsigned long size, base; - char *p; - if (!arg) - return -EINVAL; - size = memparse(arg, &p); - if (arg == p) - return -EINVAL; - if (*p == '@') { - base = memparse(p+1, &p); - /* FIXME: Do I want a sanity check to validate the - * memory range? 
Yes you do, but it's too early for - * e820 -AK */ - crashk_res.start = base; - crashk_res.end = base + size - 1; - } - return 0; -} -early_param("crashkernel", setup_crashkernel); - void arch_crash_save_vmcoreinfo(void) { #ifdef CONFIG_ARCH_DISCONTIGMEM_ENABLE diff --git a/arch/x86/kernel/mce_64.c b/arch/x86/kernel/mce_64.c index 8ca8f8648969..07bbfe7aa7f7 100644 --- a/arch/x86/kernel/mce_64.c +++ b/arch/x86/kernel/mce_64.c @@ -320,7 +320,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) #ifdef CONFIG_X86_MCE_INTEL /*** * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog - * @cpu: The CPU on which the event occured. + * @cpu: The CPU on which the event occurred. * @status: Event status information * * This function should be called by the thermal interrupt after the @@ -688,7 +688,7 @@ static int __init mcheck_disable(char *str) return 1; } -/* mce=off disables machine check. Note you can reenable it later +/* mce=off disables machine check. Note you can re-enable it later using sysfs. mce=TOLERANCELEVEL (number, see above) mce=bootlog Log MCEs from before booting. Disabled by default on AMD. @@ -799,19 +799,33 @@ static __cpuinit int mce_create_device(unsigned int cpu) { int err; int i; - if (!mce_available(&cpu_data[cpu])) + + if (!mce_available(&cpu_data(cpu))) return -EIO; + memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); per_cpu(device_mce,cpu).id = cpu; per_cpu(device_mce,cpu).cls = &mce_sysclass; err = sysdev_register(&per_cpu(device_mce,cpu)); + if (err) + return err; + + for (i = 0; mce_attributes[i]; i++) { + err = sysdev_create_file(&per_cpu(device_mce,cpu), + mce_attributes[i]); + if (err) + goto error; + } - if (!err) { - for (i = 0; mce_attributes[i]; i++) - sysdev_create_file(&per_cpu(device_mce,cpu), - mce_attributes[i]); + return 0; +error: + while (i--) { + sysdev_remove_file(&per_cpu(device_mce,cpu), + mce_attributes[i]); } + sysdev_unregister(&per_cpu(device_mce,cpu)); + return err; } @@ -823,7 +837,6 @@ static void mce_remove_device(unsigned int cpu) sysdev_remove_file(&per_cpu(device_mce,cpu), mce_attributes[i]); sysdev_unregister(&per_cpu(device_mce,cpu)); - memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); } /* Get notified when a cpu comes on/off. Be hotplug friendly. */ @@ -831,18 +844,21 @@ static int mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; + int err = 0; switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - mce_create_device(cpu); + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + err = mce_create_device(cpu); break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: mce_remove_device(cpu); break; } - return NOTIFY_OK; + return err ? 
NOTIFY_BAD : NOTIFY_OK; } static struct notifier_block mce_cpu_notifier = { @@ -857,9 +873,13 @@ static __init int mce_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; err = sysdev_class_register(&mce_sysclass); + if (err) + return err; for_each_online_cpu(i) { - mce_create_device(i); + err = mce_create_device(i); + if (err) + return err; } register_hotcpu_notifier(&mce_cpu_notifier); diff --git a/arch/x86/kernel/mce_amd_64.c b/arch/x86/kernel/mce_amd_64.c index 0d2afd96aca4..752fb16a817d 100644 --- a/arch/x86/kernel/mce_amd_64.c +++ b/arch/x86/kernel/mce_amd_64.c @@ -472,11 +472,11 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) sprintf(name, "threshold_bank%i", bank); #ifdef CONFIG_SMP - if (cpu_data[cpu].cpu_core_id && shared_bank[bank]) { /* symlink */ + if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ i = first_cpu(per_cpu(cpu_core_map, cpu)); /* first core not up yet */ - if (cpu_data[i].cpu_core_id) + if (cpu_data(i).cpu_core_id) goto out; /* already linked */ diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 09cf78110358..09c315214a5e 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -132,7 +132,7 @@ static struct ucode_cpu_info { static void collect_cpu_info(int cpu_num) { - struct cpuinfo_x86 *c = cpu_data + cpu_num; + struct cpuinfo_x86 *c = &cpu_data(cpu_num); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; unsigned int val[2]; @@ -522,7 +522,7 @@ static struct platform_device *microcode_pdev; static int cpu_request_microcode(int cpu) { char name[30]; - struct cpuinfo_x86 *c = cpu_data + cpu; + struct cpuinfo_x86 *c = &cpu_data(cpu); const struct firmware *firmware; void *buf; unsigned long size; @@ -570,7 +570,7 @@ static int cpu_request_microcode(int cpu) static int apply_microcode_check_cpu(int cpu) { - struct cpuinfo_x86 *c = cpu_data + cpu; + struct cpuinfo_x86 *c = &cpu_data(cpu); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; cpumask_t old; unsigned int val[2]; diff --git a/arch/x86/kernel/mpparse_32.c b/arch/x86/kernel/mpparse_32.c index 13abb4ebfb79..7a05a7f6099a 100644 --- a/arch/x86/kernel/mpparse_32.c +++ b/arch/x86/kernel/mpparse_32.c @@ -1001,7 +1001,7 @@ void __init mp_config_acpi_legacy_irqs (void) /* * Use the default configuration for the IRQs 0-15. Unless - * overriden by (MADT) interrupt source override entries. + * overridden by (MADT) interrupt source override entries. 
*/ for (i = 0; i < 16; i++) { int idx; diff --git a/arch/x86/kernel/mpparse_64.c b/arch/x86/kernel/mpparse_64.c index 8bf0ca03ac8e..ef4aab123581 100644 --- a/arch/x86/kernel/mpparse_64.c +++ b/arch/x86/kernel/mpparse_64.c @@ -57,6 +57,8 @@ unsigned long mp_lapic_addr = 0; /* Processor that is doing the boot up */ unsigned int boot_cpu_id = -1U; +EXPORT_SYMBOL(boot_cpu_id); + /* Internal processor count */ unsigned int num_processors __cpuinitdata = 0; @@ -86,7 +88,7 @@ static int __init mpf_checksum(unsigned char *mp, int len) return sum & 0xFF; } -static void __cpuinit MP_processor_info (struct mpc_config_processor *m) +static void __cpuinit MP_processor_info(struct mpc_config_processor *m) { int cpu; cpumask_t tmp_map; @@ -123,7 +125,18 @@ static void __cpuinit MP_processor_info (struct mpc_config_processor *m) cpu = 0; } bios_cpu_apicid[cpu] = m->mpc_apicid; - x86_cpu_to_apicid[cpu] = m->mpc_apicid; + /* + * We get called early in the start_kernel initialization + * process when the per_cpu data area is not yet setup, so we + * use a static array that is removed after the per_cpu data + * area is created. + */ + if (x86_cpu_to_apicid_ptr) { + u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr; + x86_cpu_to_apicid[cpu] = m->mpc_apicid; + } else { + per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid; + } cpu_set(cpu, cpu_possible_map); cpu_set(cpu, cpu_present_map); diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index df85c9c13601..ee6eba4ecfea 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -112,7 +112,7 @@ static ssize_t msr_write(struct file *file, const char __user *buf, static int msr_open(struct inode *inode, struct file *file) { unsigned int cpu = iminor(file->f_path.dentry->d_inode); - struct cpuinfo_x86 *c = &(cpu_data)[cpu]; + struct cpuinfo_x86 *c = &cpu_data(cpu); if (cpu >= NR_CPUS || !cpu_online(cpu)) return -ENXIO; /* No such CPU */ @@ -133,37 +133,42 @@ static const struct file_operations msr_fops = { .open = msr_open, }; -static int __cpuinit msr_device_create(int i) +static int __cpuinit msr_device_create(int cpu) { - int err = 0; struct device *dev; - dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, i), "msr%d",i); - if (IS_ERR(dev)) - err = PTR_ERR(dev); - return err; + dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), + "msr%d", cpu); + return IS_ERR(dev) ? PTR_ERR(dev) : 0; +} + +static void msr_device_destroy(int cpu) +{ + device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); } static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; + int err = 0; switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - msr_device_create(cpu); + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + err = msr_device_create(cpu); break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); + msr_device_destroy(cpu); break; } - return NOTIFY_OK; + return err ?
NOTIFY_BAD : NOTIFY_OK; } -static struct notifier_block __cpuinitdata msr_class_cpu_notifier = -{ +static struct notifier_block __cpuinitdata msr_class_cpu_notifier = { .notifier_call = msr_class_cpu_callback, }; @@ -196,7 +201,7 @@ static int __init msr_init(void) out_class: i = 0; for_each_online_cpu(i) - device_destroy(msr_class, MKDEV(MSR_MAJOR, i)); + msr_device_destroy(i); class_destroy(msr_class); out_chrdev: unregister_chrdev(MSR_MAJOR, "cpu/msr"); @@ -208,7 +213,7 @@ static void __exit msr_exit(void) { int cpu = 0; for_each_online_cpu(cpu) - device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); + msr_device_destroy(cpu); class_destroy(msr_class); unregister_chrdev(MSR_MAJOR, "cpu/msr"); unregister_hotcpu_notifier(&msr_class_cpu_notifier); diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 5098f58063a5..1a20fe31338b 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -411,8 +411,10 @@ static int calgary_nontranslate_map_sg(struct device* dev, int i; for_each_sg(sg, s, nelems, i) { - BUG_ON(!s->page); - s->dma_address = virt_to_bus(page_address(s->page) +s->offset); + struct page *p = sg_page(s); + + BUG_ON(!p); + s->dma_address = virt_to_bus(sg_virt(s)); s->dma_length = s->length; } return nelems; @@ -432,9 +434,9 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, return calgary_nontranslate_map_sg(dev, sg, nelems, direction); for_each_sg(sg, s, nelems, i) { - BUG_ON(!s->page); + BUG_ON(!sg_page(s)); - vaddr = (unsigned long)page_address(s->page) + s->offset; + vaddr = (unsigned long) sg_virt(s); npages = num_dma_pages(vaddr, s->length); entry = iommu_range_alloc(tbl, npages); diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c index b2b42bdb0a15..393e2725a6e3 100644 --- a/arch/x86/kernel/pci-dma_64.c +++ b/arch/x86/kernel/pci-dma_64.c @@ -7,11 +7,12 @@ #include <linux/string.h> #include <linux/pci.h> #include <linux/module.h> +#include <linux/dmar.h> #include <asm/io.h> #include <asm/iommu.h> #include <asm/calgary.h> -int iommu_merge __read_mostly = 0; +int iommu_merge __read_mostly = 1; EXPORT_SYMBOL(iommu_merge); dma_addr_t bad_dma_address __read_mostly; @@ -305,6 +306,8 @@ void __init pci_iommu_alloc(void) detect_calgary(); #endif + detect_intel_iommu(); + #ifdef CONFIG_SWIOTLB pci_swiotlb_init(); #endif @@ -316,6 +319,8 @@ static int __init pci_iommu_init(void) calgary_iommu_init(); #endif + intel_iommu_init(); + #ifdef CONFIG_IOMMU gart_iommu_init(); #endif diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 5cdfab65e93f..c56e9ee64964 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -302,7 +302,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, #endif for_each_sg(sg, s, nents, i) { - unsigned long addr = page_to_phys(s->page) + s->offset; + unsigned long addr = sg_phys(s); if (nonforced_iommu(dev, addr, s->length)) { addr = dma_map_area(dev, addr, s->length, dir); if (addr == bad_dma_address) { @@ -397,7 +397,7 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, start_sg = sgmap = sg; ps = NULL; /* shut up gcc */ for_each_sg(sg, s, nents, i) { - dma_addr_t addr = page_to_phys(s->page) + s->offset; + dma_addr_t addr = sg_phys(s); s->dma_address = addr; BUG_ON(s->length == 0); diff --git a/arch/x86/kernel/pci-nommu_64.c b/arch/x86/kernel/pci-nommu_64.c index e85d4360360c..faf70bdca335 100644 --- a/arch/x86/kernel/pci-nommu_64.c +++ 
b/arch/x86/kernel/pci-nommu_64.c @@ -62,8 +62,8 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, int i; for_each_sg(sg, s, nents, i) { - BUG_ON(!s->page); - s->dma_address = virt_to_bus(page_address(s->page) +s->offset); + BUG_ON(!sg_page(s)); + s->dma_address = virt_to_bus(sg_virt(s)); if (!check_addr("map_sg", hwdev, s->dma_address, s->length)) return 0; s->dma_length = s->length; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 097aeafce5ff..7b899584d290 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -295,34 +295,52 @@ static int __init idle_setup(char *str) } early_param("idle", idle_setup); -void show_regs(struct pt_regs * regs) +void __show_registers(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; + unsigned long esp; + unsigned short ss, gs; + + if (user_mode_vm(regs)) { + esp = regs->esp; + ss = regs->xss & 0xffff; + savesegment(gs, gs); + } else { + esp = (unsigned long) (&regs->esp); + savesegment(ss, ss); + savesegment(gs, gs); + } printk("\n"); - printk("Pid: %d, comm: %20s\n", current->pid, current->comm); - printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); + printk("Pid: %d, comm: %s %s (%s %.*s)\n", + task_pid_nr(current), current->comm, + print_tainted(), init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + + printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", + 0xffff & regs->xcs, regs->eip, regs->eflags, + smp_processor_id()); print_symbol("EIP is at %s\n", regs->eip); - if (user_mode_vm(regs)) - printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); - printk(" EFLAGS: %08lx %s (%s %.*s)\n", - regs->eflags, print_tainted(), init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", - regs->eax,regs->ebx,regs->ecx,regs->edx); - printk("ESI: %08lx EDI: %08lx EBP: %08lx", - regs->esi, regs->edi, regs->ebp); - printk(" DS: %04x ES: %04x FS: %04x\n", - 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xfs); + regs->eax, regs->ebx, regs->ecx, regs->edx); + printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", + regs->esi, regs->edi, regs->ebp, esp); + printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", + regs->xds & 0xffff, regs->xes & 0xffff, + regs->xfs & 0xffff, gs, ss); + + if (!all) + return; cr0 = read_cr0(); cr2 = read_cr2(); cr3 = read_cr3(); cr4 = read_cr4_safe(); - printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); + printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", + cr0, cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); @@ -330,10 +348,16 @@ void show_regs(struct pt_regs * regs) get_debugreg(d3, 3); printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", d0, d1, d2, d3); + get_debugreg(d6, 6); get_debugreg(d7, 7); - printk("DR6: %08lx DR7: %08lx\n", d6, d7); + printk("DR6: %08lx DR7: %08lx\n", + d6, d7); +} +void show_regs(struct pt_regs *regs) +{ + __show_registers(regs, 1); show_trace(NULL, regs, &regs->esp); } diff --git a/arch/x86/kernel/ptrace_32.c b/arch/x86/kernel/ptrace_32.c index 99102ec5fade..ff5431cc03ee 100644 --- a/arch/x86/kernel/ptrace_32.c +++ b/arch/x86/kernel/ptrace_32.c @@ -632,7 +632,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) /* User-mode eip? */ info.si_addr = user_mode_vm(regs) ?
(void __user *) regs->eip : NULL; - /* Send us the fakey SIGTRAP */ + /* Send us the fake SIGTRAP */ force_sig_info(SIGTRAP, &info, tsk); } diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index d769e204f942..a4ce1911efdf 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -45,9 +45,12 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) if (!(config & 0x2)) pci_write_config_byte(dev, 0xf4, config); } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, + quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, + quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, + quirk_intel_irqbalance); #endif #if defined(CONFIG_HPET_TIMER) @@ -56,7 +59,8 @@ unsigned long force_hpet_address; static enum { NONE_FORCE_HPET_RESUME, OLD_ICH_FORCE_HPET_RESUME, - ICH_FORCE_HPET_RESUME + ICH_FORCE_HPET_RESUME, + VT8237_FORCE_HPET_RESUME } force_hpet_resume_type; static void __iomem *rcba_base; @@ -146,17 +150,17 @@ static void ich_force_enable_hpet(struct pci_dev *dev) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0, - ich_force_enable_hpet); + ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1, - ich_force_enable_hpet); + ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0, - ich_force_enable_hpet); + ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_1, - ich_force_enable_hpet); + ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31, - ich_force_enable_hpet); + ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, - ich_force_enable_hpet); + ich_force_enable_hpet); static struct pci_dev *cached_dev; @@ -232,10 +236,91 @@ static void old_ich_force_enable_hpet(struct pci_dev *dev) printk(KERN_DEBUG "Failed to force enable HPET\n"); } +/* + * Undocumented chipset features. Make sure that the user enforced + * this. 
+ */ +static void old_ich_force_enable_hpet_user(struct pci_dev *dev) +{ + if (hpet_force_user) + old_ich_force_enable_hpet(dev); +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_0, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_12, + old_ich_force_enable_hpet_user); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_0, - old_ich_force_enable_hpet); + old_ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_12, - old_ich_force_enable_hpet); + old_ich_force_enable_hpet); + + +static void vt8237_force_hpet_resume(void) +{ + u32 val; + + if (!force_hpet_address || !cached_dev) + return; + + val = 0xfed00000 | 0x80; + pci_write_config_dword(cached_dev, 0x68, val); + + pci_read_config_dword(cached_dev, 0x68, &val); + if (val & 0x80) + printk(KERN_DEBUG "Force enabled HPET at resume\n"); + else + BUG(); +} + +static void vt8237_force_enable_hpet(struct pci_dev *dev) +{ + u32 uninitialized_var(val); + + if (!hpet_force_user || hpet_address || force_hpet_address) + return; + + pci_read_config_dword(dev, 0x68, &val); + /* + * Bit 7 is HPET enable bit. + * Bit 31:10 is HPET base address (contrary to what datasheet claims) + */ + if (val & 0x80) { + force_hpet_address = (val & ~0x3ff); + printk(KERN_DEBUG "HPET at base address 0x%lx\n", + force_hpet_address); + return; + } + + /* + * HPET is disabled. Trying enabling at FED00000 and check + * whether it sticks + */ + val = 0xfed00000 | 0x80; + pci_write_config_dword(dev, 0x68, val); + + pci_read_config_dword(dev, 0x68, &val); + if (val & 0x80) { + force_hpet_address = (val & ~0x3ff); + printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", + force_hpet_address); + cached_dev = dev; + force_hpet_resume_type = VT8237_FORCE_HPET_RESUME; + return; + } + + printk(KERN_DEBUG "Failed to force enable HPET\n"); +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, + vt8237_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, + vt8237_force_enable_hpet); + void force_hpet_resume(void) { @@ -246,6 +331,9 @@ void force_hpet_resume(void) case OLD_ICH_FORCE_HPET_RESUME: return old_ich_force_hpet_resume(); + case VT8237_FORCE_HPET_RESUME: + return vt8237_force_hpet_resume(); + default: break; } diff --git a/arch/x86/kernel/reboot_64.c b/arch/x86/kernel/reboot_64.c index 368db2b9c5ac..776eb06b6512 100644 --- a/arch/x86/kernel/reboot_64.c +++ b/arch/x86/kernel/reboot_64.c @@ -11,6 +11,7 @@ #include <linux/sched.h> #include <asm/io.h> #include <asm/delay.h> +#include <asm/desc.h> #include <asm/hw_irq.h> #include <asm/system.h> #include <asm/pgtable.h> @@ -136,7 +137,7 @@ void machine_emergency_restart(void) } case BOOT_TRIPLE: - __asm__ __volatile__("lidt (%0)": :"r" (&no_idt)); + load_idt((const struct desc_ptr *)&no_idt); __asm__ __volatile__("int3"); reboot_type = BOOT_KBD; diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c index 8b30b26ad069..1a07bbea7be3 100644 --- a/arch/x86/kernel/reboot_fixups_32.c +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -12,6 +12,7 @@ #include <linux/interrupt.h> #include <asm/reboot_fixups.h> #include <asm/msr.h> +#include <asm/geode.h> static void 
cs5530a_warm_reset(struct pci_dev *dev) { @@ -24,11 +25,8 @@ static void cs5530a_warm_reset(struct pci_dev *dev) static void cs5536_warm_reset(struct pci_dev *dev) { - /* - * 6.6.2.12 Soft Reset (DIVIL_SOFT_RESET) - * writing 1 to the LSB of this MSR causes a hard reset. - */ - wrmsrl(0x51400017, 1ULL); + /* writing 1 to the LSB of this MSR causes a hard reset */ + wrmsrl(MSR_DIVIL_SOFT_RESET, 1ULL); udelay(50); /* shouldn't get here but be safe and spin a while */ } diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c index ba9188235057..3558ac78c926 100644 --- a/arch/x86/kernel/setup64.c +++ b/arch/x86/kernel/setup64.c @@ -185,6 +185,12 @@ void __cpuinit check_efer(void) unsigned long kernel_eflags; /* + * Copies of the original ist values from the tss are only accessed during + * debugging, no special alignment required. + */ +DEFINE_PER_CPU(struct orig_ist, orig_ist); + +/* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT * and IDT. We reload them nevertheless, this function acts as a @@ -224,8 +230,8 @@ void __cpuinit cpu_init (void) memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE); cpu_gdt_descr[cpu].size = GDT_SIZE; - asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu])); - asm volatile("lidt %0" :: "m" (idt_descr)); + load_gdt((const struct desc_ptr *)&cpu_gdt_descr[cpu]); + load_idt((const struct desc_ptr *)&idt_descr); memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); syscall_init(); diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index b87a6fd5ba48..cc0e91447b76 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -60,6 +60,7 @@ #include <asm/vmi.h> #include <setup_arch.h> #include <bios_ebda.h> +#include <asm/cacheflush.h> /* This value is set up by the early boot code to point to the value immediately after the boot time page tables. 
It contains a *physical* @@ -73,6 +74,7 @@ int disable_pse __devinitdata = 0; */ extern struct resource code_resource; extern struct resource data_resource; +extern struct resource bss_resource; /* cpu data as detected by the assembly code in head.S */ struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; @@ -378,6 +380,49 @@ extern unsigned long __init setup_memory(void); extern void zone_sizes_init(void); #endif /* !CONFIG_NEED_MULTIPLE_NODES */ +static inline unsigned long long get_total_mem(void) +{ + unsigned long long total; + + total = max_low_pfn - min_low_pfn; +#ifdef CONFIG_HIGHMEM + total += highend_pfn - highstart_pfn; +#endif + + return total << PAGE_SHIFT; +} + +#ifdef CONFIG_KEXEC +static void __init reserve_crashkernel(void) +{ + unsigned long long total_mem; + unsigned long long crash_size, crash_base; + int ret; + + total_mem = get_total_mem(); + + ret = parse_crashkernel(boot_command_line, total_mem, + &crash_size, &crash_base); + if (ret == 0 && crash_size > 0) { + if (crash_base > 0) { + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " + "for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crash_base >> 20), + (unsigned long)(total_mem >> 20)); + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; + reserve_bootmem(crash_base, crash_size); + } else + printk(KERN_INFO "crashkernel reservation failed - " + "you have to specify a base address\n"); + } +} +#else +static inline void __init reserve_crashkernel(void) +{} +#endif + void __init setup_bootmem_allocator(void) { unsigned long bootmap_size; @@ -453,11 +498,7 @@ void __init setup_bootmem_allocator(void) } } #endif -#ifdef CONFIG_KEXEC - if (crashk_res.start != crashk_res.end) - reserve_bootmem(crashk_res.start, - crashk_res.end - crashk_res.start + 1); -#endif + reserve_crashkernel(); } /* @@ -561,6 +602,8 @@ void __init setup_arch(char **cmdline_p) code_resource.end = virt_to_phys(_etext)-1; data_resource.start = virt_to_phys(_etext); data_resource.end = virt_to_phys(_edata)-1; + bss_resource.start = virt_to_phys(&__bss_start); + bss_resource.end = virt_to_phys(&__bss_stop)-1; parse_early_param(); @@ -585,7 +628,7 @@ void __init setup_arch(char **cmdline_p) /* * NOTE: before this point _nobody_ is allowed to allocate * any memory using the bootmem allocator. Although the - * alloctor is now initialised only the first 8Mb of the kernel + * allocator is now initialised only the first 8Mb of the kernel * virtual address space has been mapped. All allocations before * paging_init() has completed must use the alloc_bootmem_low_pages() * variant (which allocates DMA'able memory) and care must be taken @@ -622,9 +665,7 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_PCI -#ifdef CONFIG_X86_IO_APIC - check_acpi_pci(); /* Checks more than just ACPI actually */ -#endif + early_quirks(); #endif #ifdef CONFIG_ACPI diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 5a19f0cc5b67..e7a9e36bd52d 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -58,6 +58,7 @@ #include <asm/numa.h> #include <asm/sections.h> #include <asm/dmi.h> +#include <asm/cacheflush.h> /* * Machine setup.. 
@@ -133,6 +134,12 @@ struct resource code_resource = { .end = 0, .flags = IORESOURCE_RAM, }; +struct resource bss_resource = { + .name = "Kernel bss", + .start = 0, + .end = 0, + .flags = IORESOURCE_RAM, +}; #ifdef CONFIG_PROC_VMCORE /* elfcorehdr= specifies the location of elf core header @@ -191,6 +198,37 @@ static inline void copy_edd(void) } #endif +#ifdef CONFIG_KEXEC +static void __init reserve_crashkernel(void) +{ + unsigned long long free_mem; + unsigned long long crash_size, crash_base; + int ret; + + free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; + + ret = parse_crashkernel(boot_command_line, free_mem, + &crash_size, &crash_base); + if (ret == 0 && crash_size) { + if (crash_base > 0) { + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " + "for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crash_base >> 20), + (unsigned long)(free_mem >> 20)); + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; + reserve_bootmem(crash_base, crash_size); + } else + printk(KERN_INFO "crashkernel reservation failed - " + "you have to specify a base address\n"); + } +} +#else +static inline void __init reserve_crashkernel(void) +{} +#endif + #define EBDA_ADDR_POINTER 0x40E unsigned __initdata ebda_addr; @@ -245,6 +283,8 @@ void __init setup_arch(char **cmdline_p) code_resource.end = virt_to_phys(&_etext)-1; data_resource.start = virt_to_phys(&_etext); data_resource.end = virt_to_phys(&_edata)-1; + bss_resource.start = virt_to_phys(&__bss_start); + bss_resource.end = virt_to_phys(&__bss_stop)-1; early_identify_cpu(&boot_cpu_data); @@ -271,6 +311,11 @@ void __init setup_arch(char **cmdline_p) dmi_scan_machine(); +#ifdef CONFIG_SMP + /* setup to use the static apicid table during kernel startup */ + x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init; +#endif + #ifdef CONFIG_ACPI /* * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). @@ -357,13 +402,7 @@ void __init setup_arch(char **cmdline_p) } } #endif -#ifdef CONFIG_KEXEC - if (crashk_res.start != crashk_res.end) { - reserve_bootmem_generic(crashk_res.start, - crashk_res.end - crashk_res.start + 1); - } -#endif - + reserve_crashkernel(); paging_init(); #ifdef CONFIG_PCI @@ -529,7 +568,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) but in the same order as the HT nodeids. If that doesn't result in a usable node fall back to the path for the previous case. */ - int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits); + int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits); if (ht_nodeid >= 0 && apicid_to_node[ht_nodeid] != NUMA_NO_NODE) node = apicid_to_node[ht_nodeid]; @@ -853,6 +892,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_SMP c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; + c->cpu_index = 0; #endif } @@ -959,6 +999,7 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) static int show_cpuinfo(struct seq_file *m, void *v) { struct cpuinfo_x86 *c = v; + int cpu = 0; /* * These flag bits must match the definitions in <asm/cpufeature.h>. @@ -1037,8 +1078,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) #ifdef CONFIG_SMP - if (!cpu_online(c-cpu_data)) + if (!cpu_online(c->cpu_index)) return 0; + cpu = c->cpu_index; #endif seq_printf(m,"processor\t: %u\n" @@ -1046,7 +1088,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) "cpu family\t: %d\n" "model\t\t: %d\n" "model name\t: %s\n", - (unsigned)(c-cpu_data), + (unsigned)cpu, c->x86_vendor_id[0] ? 
c->x86_vendor_id : "unknown", c->x86, (int)c->x86_model, @@ -1058,7 +1100,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "stepping\t: unknown\n"); if (cpu_has(c,X86_FEATURE_TSC)) { - unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data)); + unsigned int freq = cpufreq_quick_get((unsigned)cpu); if (!freq) freq = cpu_khz; seq_printf(m, "cpu MHz\t\t: %u.%03u\n", @@ -1071,7 +1113,6 @@ static int show_cpuinfo(struct seq_file *m, void *v) #ifdef CONFIG_SMP if (smp_num_siblings * c->x86_max_cores > 1) { - int cpu = c - cpu_data; seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); seq_printf(m, "siblings\t: %d\n", cpus_weight(per_cpu(cpu_core_map, cpu))); @@ -1129,12 +1170,16 @@ static int show_cpuinfo(struct seq_file *m, void *v) static void *c_start(struct seq_file *m, loff_t *pos) { - return *pos < NR_CPUS ? cpu_data + *pos : NULL; + if (*pos == 0) /* just in case, cpu 0 is not the first */ + *pos = first_cpu(cpu_possible_map); + if ((*pos) < NR_CPUS && cpu_possible(*pos)) + return &cpu_data(*pos); + return NULL; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) { - ++*pos; + *pos = next_cpu(*pos, cpu_possible_map); return c_start(m, pos); } diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 0d79df3c5631..9bdd83022f5f 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -200,8 +200,8 @@ badframe: if (show_unhandled_signals && printk_ratelimit()) printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx" " esp:%lx oeax:%lx\n", - current->pid > 1 ? KERN_INFO : KERN_EMERG, - current->comm, current->pid, frame, regs->eip, + task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, + current->comm, task_pid_nr(current), frame, regs->eip, regs->esp, regs->orig_eax); force_sig(SIGSEGV, current); @@ -594,7 +594,7 @@ static void fastcall do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - /* Reenable any watchpoints before delivering the + /* Re-enable any watchpoints before delivering the * signal to user space. The processor register will * have been cleared if the watchpoint triggered * inside the kernel. diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 683802bec419..ab086b0357fc 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -410,7 +410,7 @@ static void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - /* Reenable any watchpoints before delivering the + /* Re-enable any watchpoints before delivering the * signal to user space. The processor register will * have been cleared if the watchpoint triggered * inside the kernel. diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c index 791d9f8036ae..f32115308399 100644 --- a/arch/x86/kernel/smp_32.c +++ b/arch/x86/kernel/smp_32.c @@ -69,7 +69,7 @@ * * B stepping CPUs may hang. There are hardware work arounds * for this. We warn about it in case your board doesn't have the work - * arounds. Basically thats so I can tell anyone with a B stepping + * arounds. Basically that's so I can tell anyone with a B stepping * CPU and SMP problems "tough". * * Specific items [From Pentium Processor Specification Update] @@ -273,7 +273,7 @@ void leave_mm(unsigned long cpu) * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); * Stop ipi delivery for the old mm. 
This is not synchronized with * the other cpus, but smp_invalidate_interrupt ignore flush ipis - * for the wrong mm, and in the worst case we perform a superflous + * for the wrong mm, and in the worst case we perform a superfluous * tlb flush. * 1a2) set cpu_tlbstate to TLBSTATE_OK * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 @@ -610,7 +610,7 @@ static void stop_this_cpu (void * dummy) */ cpu_clear(smp_processor_id(), cpu_online_map); disable_local_APIC(); - if (cpu_data[smp_processor_id()].hlt_works_ok) + if (cpu_data(smp_processor_id()).hlt_works_ok) for(;;) halt(); for (;;); } @@ -676,7 +676,7 @@ static int convert_apicid_to_cpu(int apic_id) int i; for (i = 0; i < NR_CPUS; i++) { - if (x86_cpu_to_apicid[i] == apic_id) + if (per_cpu(x86_cpu_to_apicid, i) == apic_id) return i; } return -1; diff --git a/arch/x86/kernel/smp_64.c b/arch/x86/kernel/smp_64.c index 5c2964727d19..03fa6ed559c6 100644 --- a/arch/x86/kernel/smp_64.c +++ b/arch/x86/kernel/smp_64.c @@ -322,17 +322,27 @@ void unlock_ipi_call_lock(void) } /* - * this function sends a 'generic call function' IPI to one other CPU - * in the system. - * - * cpu is a standard Linux logical CPU number. + * this function sends a 'generic call function' IPI to all other CPU + * of the system defined in the mask. */ -static void -__smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) + +static int +__smp_call_function_mask(cpumask_t mask, + void (*func)(void *), void *info, + int wait) { struct call_data_struct data; - int cpus = 1; + cpumask_t allbutself; + int cpus; + + allbutself = cpu_online_map; + cpu_clear(smp_processor_id(), allbutself); + + cpus_and(mask, mask, allbutself); + cpus = cpus_weight(mask); + + if (!cpus) + return 0; data.func = func; data.info = info; @@ -343,19 +353,55 @@ __smp_call_function_single(int cpu, void (*func) (void *info), void *info, call_data = &data; wmb(); - /* Send a message to all other CPUs and wait for them to respond */ - send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR); + + /* Send a message to other CPUs */ + if (cpus_equal(mask, allbutself)) + send_IPI_allbutself(CALL_FUNCTION_VECTOR); + else + send_IPI_mask(mask, CALL_FUNCTION_VECTOR); /* Wait for response */ while (atomic_read(&data.started) != cpus) cpu_relax(); if (!wait) - return; + return 0; while (atomic_read(&data.finished) != cpus) cpu_relax(); + + return 0; +} +/** + * smp_call_function_mask(): Run a function on a set of other CPUs. + * @mask: The set of cpus to run on. Must not include the current cpu. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. 
+ */ +int smp_call_function_mask(cpumask_t mask, + void (*func)(void *), void *info, + int wait) +{ + int ret; + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + spin_lock(&call_lock); + ret = __smp_call_function_mask(mask, func, info, wait); + spin_unlock(&call_lock); + return ret; } +EXPORT_SYMBOL(smp_call_function_mask); /* * smp_call_function_single - Run a function on a specific CPU @@ -374,6 +420,7 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info, int nonatomic, int wait) { /* prevent preemption and reschedule on another processor */ + int ret; int me = get_cpu(); /* Can deadlock when called with interrupts disabled */ @@ -387,51 +434,14 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info, return 0; } - spin_lock(&call_lock); - __smp_call_function_single(cpu, func, info, nonatomic, wait); - spin_unlock(&call_lock); + ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait); + put_cpu(); - return 0; + return ret; } EXPORT_SYMBOL(smp_call_function_single); /* - * this function sends a 'generic call function' IPI to all other CPUs - * in the system. - */ -static void __smp_call_function (void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - struct call_data_struct data; - int cpus = num_online_cpus()-1; - - if (!cpus) - return; - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - wmb(); - /* Send a message to all other CPUs and wait for them to respond */ - send_IPI_allbutself(CALL_FUNCTION_VECTOR); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus) - cpu_relax(); - - if (!wait) - return; - - while (atomic_read(&data.finished) != cpus) - cpu_relax(); -} - -/* * smp_call_function - run a function on all other CPUs. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. @@ -449,10 +459,7 @@ static void __smp_call_function (void (*func) (void *info), void *info, int smp_call_function (void (*func) (void *info), void *info, int nonatomic, int wait) { - spin_lock(&call_lock); - __smp_call_function(func,info,nonatomic,wait); - spin_unlock(&call_lock); - return 0; + return smp_call_function_mask(cpu_online_map, func, info, wait); } EXPORT_SYMBOL(smp_call_function); @@ -479,7 +486,7 @@ void smp_send_stop(void) /* Don't deadlock on the call lock in panic */ nolock = !spin_trylock(&call_lock); local_irq_save(flags); - __smp_call_function(stop_this_cpu, NULL, 0, 0); + __smp_call_function_mask(cpu_online_map, stop_this_cpu, NULL, 0); if (!nolock) spin_unlock(&call_lock); disable_local_APIC(); diff --git a/arch/x86/kernel/smpboot_32.c b/arch/x86/kernel/smpboot_32.c index be3faac04719..ef0f34ede1ab 100644 --- a/arch/x86/kernel/smpboot_32.c +++ b/arch/x86/kernel/smpboot_32.c @@ -67,7 +67,7 @@ int smp_num_siblings = 1; EXPORT_SYMBOL(smp_num_siblings); /* Last level cache ID of each logical CPU */ -int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... 
NR_CPUS-1] = BAD_APICID}; +DEFINE_PER_CPU(u8, cpu_llc_id) = BAD_APICID; /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU(cpumask_t, cpu_sibling_map); @@ -89,12 +89,20 @@ EXPORT_SYMBOL(cpu_possible_map); static cpumask_t smp_commenced_mask; /* Per CPU bogomips and other parameters */ -struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; -EXPORT_SYMBOL(cpu_data); +DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); +EXPORT_PER_CPU_SYMBOL(cpu_info); -u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = - { [0 ... NR_CPUS-1] = 0xff }; -EXPORT_SYMBOL(x86_cpu_to_apicid); +/* + * The following static array is used during kernel startup + * and the x86_cpu_to_apicid_ptr contains the address of the + * array during this time. It is zeroed when the per_cpu + * data area is removed. + */ +u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata = + { [0 ... NR_CPUS-1] = BAD_APICID }; +void *x86_cpu_to_apicid_ptr; +DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID; +EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); u8 apicid_2_node[MAX_APICID]; @@ -150,9 +158,10 @@ void __init smp_alloc_memory(void) void __cpuinit smp_store_cpu_info(int id) { - struct cpuinfo_x86 *c = cpu_data + id; + struct cpuinfo_x86 *c = &cpu_data(id); *c = boot_cpu_data; + c->cpu_index = id; if (id!=0) identify_secondary_cpu(c); /* @@ -294,7 +303,7 @@ static int cpucount; /* maps the cpu to the sched domain representing multi-core */ cpumask_t cpu_coregroup_map(int cpu) { - struct cpuinfo_x86 *c = cpu_data + cpu; + struct cpuinfo_x86 *c = &cpu_data(cpu); /* * For perf, we return last level cache shared map. * And for power savings, we return cpu_core_map @@ -311,41 +320,41 @@ static cpumask_t cpu_sibling_setup_map; void __cpuinit set_cpu_sibling_map(int cpu) { int i; - struct cpuinfo_x86 *c = cpu_data; + struct cpuinfo_x86 *c = &cpu_data(cpu); cpu_set(cpu, cpu_sibling_setup_map); if (smp_num_siblings > 1) { for_each_cpu_mask(i, cpu_sibling_setup_map) { - if (c[cpu].phys_proc_id == c[i].phys_proc_id && - c[cpu].cpu_core_id == c[i].cpu_core_id) { + if (c->phys_proc_id == cpu_data(i).phys_proc_id && + c->cpu_core_id == cpu_data(i).cpu_core_id) { cpu_set(i, per_cpu(cpu_sibling_map, cpu)); cpu_set(cpu, per_cpu(cpu_sibling_map, i)); cpu_set(i, per_cpu(cpu_core_map, cpu)); cpu_set(cpu, per_cpu(cpu_core_map, i)); - cpu_set(i, c[cpu].llc_shared_map); - cpu_set(cpu, c[i].llc_shared_map); + cpu_set(i, c->llc_shared_map); + cpu_set(cpu, cpu_data(i).llc_shared_map); } } } else { cpu_set(cpu, per_cpu(cpu_sibling_map, cpu)); } - cpu_set(cpu, c[cpu].llc_shared_map); + cpu_set(cpu, c->llc_shared_map); if (current_cpu_data.x86_max_cores == 1) { per_cpu(cpu_core_map, cpu) = per_cpu(cpu_sibling_map, cpu); - c[cpu].booted_cores = 1; + c->booted_cores = 1; return; } for_each_cpu_mask(i, cpu_sibling_setup_map) { - if (cpu_llc_id[cpu] != BAD_APICID && - cpu_llc_id[cpu] == cpu_llc_id[i]) { - cpu_set(i, c[cpu].llc_shared_map); - cpu_set(cpu, c[i].llc_shared_map); + if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && + per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { + cpu_set(i, c->llc_shared_map); + cpu_set(cpu, cpu_data(i).llc_shared_map); } - if (c[cpu].phys_proc_id == c[i].phys_proc_id) { + if (c->phys_proc_id == cpu_data(i).phys_proc_id) { cpu_set(i, per_cpu(cpu_core_map, cpu)); cpu_set(cpu, per_cpu(cpu_core_map, i)); /* @@ -357,15 +366,15 @@ void __cpuinit set_cpu_sibling_map(int cpu) * the booted_cores for this new cpu */ if (first_cpu(per_cpu(cpu_sibling_map, i)) == i) - c[cpu].booted_cores++; /* * increment the core
count for all * the other cpus in this package */ if (i != cpu) - c[i].booted_cores++; - } else if (i != cpu && !c[cpu].booted_cores) - c[cpu].booted_cores = c[i].booted_cores; + cpu_data(i).booted_cores++; + } else if (i != cpu && !c->booted_cores) + c->booted_cores = cpu_data(i).booted_cores; } } } @@ -412,7 +421,7 @@ static void __cpuinit start_secondary(void *unused) /* * We need to hold call_lock, so there is no inconsistency * between the time smp_call_function() determines number of - * IPI receipients, and the time when the determination is made + * IPI recipients, and the time when the determination is made * for which cpus receive the IPI. Holding this * lock helps us to not include this cpu in a currently in progress * smp_call_function(). @@ -804,7 +813,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) irq_ctx_init(cpu); - x86_cpu_to_apicid[cpu] = apicid; + per_cpu(x86_cpu_to_apicid, cpu) = apicid; /* * This grunge runs the startup process for * the targeted processor. @@ -844,7 +853,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) /* number CPUs logically, starting from 1 (BSP is 0) */ Dprintk("OK.\n"); printk("CPU%d: ", cpu); - print_cpu_info(&cpu_data[cpu]); + print_cpu_info(&cpu_data(cpu)); Dprintk("CPU has booted.\n"); } else { boot_error= 1; @@ -866,7 +875,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ cpucount--; } else { - x86_cpu_to_apicid[cpu] = apicid; + per_cpu(x86_cpu_to_apicid, cpu) = apicid; cpu_set(cpu, cpu_present_map); } @@ -915,7 +924,7 @@ static int __cpuinit __smp_prepare_cpu(int cpu) struct warm_boot_cpu_info info; int apicid, ret; - apicid = x86_cpu_to_apicid[cpu]; + apicid = per_cpu(x86_cpu_to_apicid, cpu); if (apicid == BAD_APICID) { ret = -ENODEV; goto exit; @@ -961,11 +970,11 @@ static void __init smp_boot_cpus(unsigned int max_cpus) */ smp_store_cpu_info(0); /* Final full version of the data */ printk("CPU%d: ", 0); - print_cpu_info(&cpu_data[0]); + print_cpu_info(&cpu_data(0)); boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); boot_cpu_logical_apicid = logical_smp_processor_id(); - x86_cpu_to_apicid[0] = boot_cpu_physical_apicid; + per_cpu(x86_cpu_to_apicid, 0) = boot_cpu_physical_apicid; current_thread_info()->cpu = 0; @@ -1008,6 +1017,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) printk(KERN_ERR "... forcing use of dummy APIC emulation. 
(tell your hw vendor)\n"); smpboot_clear_io_apic_irqs(); phys_cpu_present_map = physid_mask_of_physid(0); + map_cpu_to_logical_apicid(); cpu_set(0, per_cpu(cpu_sibling_map, 0)); cpu_set(0, per_cpu(cpu_core_map, 0)); return; @@ -1029,6 +1039,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) } smpboot_clear_io_apic_irqs(); phys_cpu_present_map = physid_mask_of_physid(0); + map_cpu_to_logical_apicid(); cpu_set(0, per_cpu(cpu_sibling_map, 0)); cpu_set(0, per_cpu(cpu_core_map, 0)); return; @@ -1082,7 +1093,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) Dprintk("Before bogomips.\n"); for (cpu = 0; cpu < NR_CPUS; cpu++) if (cpu_isset(cpu, cpu_callout_map)) - bogosum += cpu_data[cpu].loops_per_jiffy; + bogosum += cpu_data(cpu).loops_per_jiffy; printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", cpucount+1, @@ -1152,7 +1163,7 @@ void __init native_smp_prepare_boot_cpu(void) void remove_siblinginfo(int cpu) { int sibling; - struct cpuinfo_x86 *c = cpu_data; + struct cpuinfo_x86 *c = &cpu_data(cpu); for_each_cpu_mask(sibling, per_cpu(cpu_core_map, cpu)) { cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); @@ -1160,15 +1171,15 @@ void remove_siblinginfo(int cpu) * last thread sibling in this cpu core going down */ if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) - c[sibling].booted_cores--; + cpu_data(sibling).booted_cores--; } for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu)) cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); cpus_clear(per_cpu(cpu_sibling_map, cpu)); cpus_clear(per_cpu(cpu_core_map, cpu)); - c[cpu].phys_proc_id = 0; - c[cpu].cpu_core_id = 0; + c->phys_proc_id = 0; + c->cpu_core_id = 0; cpu_clear(cpu, cpu_sibling_setup_map); } diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c index e351ac4ab5b1..b7e768dd87c9 100644 --- a/arch/x86/kernel/smpboot_64.c +++ b/arch/x86/kernel/smpboot_64.c @@ -65,7 +65,7 @@ int smp_num_siblings = 1; EXPORT_SYMBOL(smp_num_siblings); /* Last level cache ID of each logical CPU */ -u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; +DEFINE_PER_CPU(u8, cpu_llc_id) = BAD_APICID; /* Bitmask of currently online CPUs */ cpumask_t cpu_online_map __read_mostly; @@ -84,8 +84,8 @@ cpumask_t cpu_possible_map; EXPORT_SYMBOL(cpu_possible_map); /* Per CPU bogomips and other parameters */ -struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; -EXPORT_SYMBOL(cpu_data); +DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); +EXPORT_PER_CPU_SYMBOL(cpu_info); /* Set when the idlers are all forked */ int smp_threads_ready; @@ -138,9 +138,10 @@ static unsigned long __cpuinit setup_trampoline(void) static void __cpuinit smp_store_cpu_info(int id) { - struct cpuinfo_x86 *c = cpu_data + id; + struct cpuinfo_x86 *c = &cpu_data(id); *c = boot_cpu_data; + c->cpu_index = id; identify_cpu(c); print_cpu_info(c); } @@ -237,7 +238,7 @@ void __cpuinit smp_callin(void) /* maps the cpu to the sched domain representing multi-core */ cpumask_t cpu_coregroup_map(int cpu) { - struct cpuinfo_x86 *c = cpu_data + cpu; + struct cpuinfo_x86 *c = &cpu_data(cpu); /* * For perf, we return last level cache shared map. 
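The churn through smpboot_32.c and smpboot_64.c above follows a single pattern: the NR_CPUS-sized arrays (cpu_data[], cpu_llc_id[], x86_cpu_to_apicid[]) become per-CPU variables, and array indexing becomes a per_cpu() lookup. As a minimal read-side sketch, assuming the cpu_data(cpu) accessor is a thin wrapper around per_cpu(cpu_info, cpu) (the wrapper itself is defined in a header outside these hunks):

/*
 * Illustrative sketch only, not part of the patch.  Assumes cpu_data(cpu)
 * expands to per_cpu(cpu_info, cpu), which is how the converted call
 * sites above read.
 */
static void example_per_cpu_read(int cpu)
{
        /* was: cpu_data[cpu].loops_per_jiffy */
        unsigned long lpj = cpu_data(cpu).loops_per_jiffy;

        /* was: cpu_llc_id[cpu] */
        u8 llc = per_cpu(cpu_llc_id, cpu);

        (void)lpj;
        (void)llc;
}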
* And for power savings, we return cpu_core_map @@ -254,41 +255,41 @@ static cpumask_t cpu_sibling_setup_map; static inline void set_cpu_sibling_map(int cpu) { int i; - struct cpuinfo_x86 *c = cpu_data; + struct cpuinfo_x86 *c = &cpu_data(cpu); cpu_set(cpu, cpu_sibling_setup_map); if (smp_num_siblings > 1) { for_each_cpu_mask(i, cpu_sibling_setup_map) { - if (c[cpu].phys_proc_id == c[i].phys_proc_id && - c[cpu].cpu_core_id == c[i].cpu_core_id) { + if (c->phys_proc_id == cpu_data(i).phys_proc_id && + c->cpu_core_id == cpu_data(i).cpu_core_id) { cpu_set(i, per_cpu(cpu_sibling_map, cpu)); cpu_set(cpu, per_cpu(cpu_sibling_map, i)); cpu_set(i, per_cpu(cpu_core_map, cpu)); cpu_set(cpu, per_cpu(cpu_core_map, i)); - cpu_set(i, c[cpu].llc_shared_map); - cpu_set(cpu, c[i].llc_shared_map); + cpu_set(i, c->llc_shared_map); + cpu_set(cpu, cpu_data(i).llc_shared_map); } } } else { cpu_set(cpu, per_cpu(cpu_sibling_map, cpu)); } - cpu_set(cpu, c[cpu].llc_shared_map); + cpu_set(cpu, c->llc_shared_map); if (current_cpu_data.x86_max_cores == 1) { per_cpu(cpu_core_map, cpu) = per_cpu(cpu_sibling_map, cpu); - c[cpu].booted_cores = 1; + c->booted_cores = 1; return; } for_each_cpu_mask(i, cpu_sibling_setup_map) { - if (cpu_llc_id[cpu] != BAD_APICID && - cpu_llc_id[cpu] == cpu_llc_id[i]) { - cpu_set(i, c[cpu].llc_shared_map); - cpu_set(cpu, c[i].llc_shared_map); + if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && + per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { + cpu_set(i, c->llc_shared_map); + cpu_set(cpu, cpu_data(i).llc_shared_map); } - if (c[cpu].phys_proc_id == c[i].phys_proc_id) { + if (c->phys_proc_id == cpu_data(i).phys_proc_id) { cpu_set(i, per_cpu(cpu_core_map, cpu)); cpu_set(cpu, per_cpu(cpu_core_map, i)); /* @@ -300,15 +301,15 @@ static inline void set_cpu_sibling_map(int cpu) * the booted_cores for this new cpu */ if (first_cpu(per_cpu(cpu_sibling_map, i)) == i) - c[cpu].booted_cores++; + c->booted_cores++; /* * increment the core count for all * the other cpus in this package */ if (i != cpu) - c[i].booted_cores++; - } else if (i != cpu && !c[cpu].booted_cores) - c[cpu].booted_cores = c[i].booted_cores; + cpu_data(i).booted_cores++; + } else if (i != cpu && !c->booted_cores) + c->booted_cores = cpu_data(i).booted_cores; } } } @@ -350,7 +351,7 @@ void __cpuinit start_secondary(void) /* * We need to hold call_lock, so there is no inconsistency * between the time smp_call_function() determines number of - * IPI receipients, and the time when the determination is made + * IPI recipients, and the time when the determination is made * for which cpus receive the IPI in genapic_flat.c. Holding this * lock helps us to not include this cpu in a currently in progress * smp_call_function(). @@ -694,7 +695,7 @@ do_rest: clear_node_cpumask(cpu); /* was set by numa_add_cpu */ cpu_clear(cpu, cpu_present_map); cpu_clear(cpu, cpu_possible_map); - x86_cpu_to_apicid[cpu] = BAD_APICID; + per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID; return -EIO; } @@ -841,6 +842,26 @@ static int __init smp_sanity_check(unsigned max_cpus) } /* + * Copy apicid's found by MP_processor_info from initial array to the per cpu + * data area. The x86_cpu_to_apicid_init array is then expendable and the + * x86_cpu_to_apicid_ptr is zeroed indicating that the static array is no + * longer available. 
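Looking back at the smp_64.c hunks, the newly exported smp_call_function_mask() is now the primitive that both smp_call_function() and smp_call_function_single() are built on. A hedged caller-side sketch (the callback and wrapper are invented for illustration; per the WARN_ON in the implementation it must not be called with interrupts disabled, and as with smp_call_function() the IPI targets the other CPUs in the mask):

/* Illustrative sketch only, not part of the patch. */
static void poke_cpu(void *info)
{
        /* runs on each targeted CPU in IPI context: keep it fast and
         * non-blocking */
}

static int poke_cpus(cpumask_t mask)
{
        /* wait == 1: return only once every targeted CPU has run poke_cpu() */
        return smp_call_function_mask(mask, poke_cpu, NULL, 1);
}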
+ */ +void __init smp_set_apicids(void) +{ + int cpu; + + for_each_cpu_mask(cpu, cpu_possible_map) { + if (per_cpu_offset(cpu)) + per_cpu(x86_cpu_to_apicid, cpu) = + x86_cpu_to_apicid_init[cpu]; + } + + /* indicate the static array will be going away soon */ + x86_cpu_to_apicid_ptr = NULL; +} + +/* * Prepare for SMP bootup. The MP table or ACPI has been read * earlier. Just do some sanity checking here and enable APIC mode. */ @@ -849,6 +870,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) nmi_watchdog_default(); current_cpu_data = boot_cpu_data; current_thread_info()->cpu = 0; /* needed? */ + smp_set_apicids(); set_cpu_sibling_map(0); if (smp_sanity_check(max_cpus) < 0) { @@ -968,7 +990,7 @@ void __init smp_cpus_done(unsigned int max_cpus) static void remove_siblinginfo(int cpu) { int sibling; - struct cpuinfo_x86 *c = cpu_data; + struct cpuinfo_x86 *c = &cpu_data(cpu); for_each_cpu_mask(sibling, per_cpu(cpu_core_map, cpu)) { cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); @@ -976,15 +998,15 @@ static void remove_siblinginfo(int cpu) * last thread sibling in this cpu core going down */ if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) - c[sibling].booted_cores--; + cpu_data(sibling).booted_cores--; } for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu)) cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); cpus_clear(per_cpu(cpu_sibling_map, cpu)); cpus_clear(per_cpu(cpu_core_map, cpu)); - c[cpu].phys_proc_id = 0; - c[cpu].cpu_core_id = 0; + c->phys_proc_id = 0; + c->cpu_core_id = 0; cpu_clear(cpu, cpu_sibling_setup_map); } diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c index 91c7acc8d999..72f463401592 100644 --- a/arch/x86/kernel/summit_32.c +++ b/arch/x86/kernel/summit_32.c @@ -64,7 +64,7 @@ static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) switch (rio_devs[wpeg_num]->type){ case CompatWPEG: - /* The Compatability Winnipeg controls the 2 legacy buses, + /* The Compatibility Winnipeg controls the 2 legacy buses, * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case * a PCI-PCI bridge card is used in either slot: total 5 buses. */ diff --git a/arch/x86/kernel/suspend_64.c b/arch/x86/kernel/suspend_64.c index 573c0a6e0ac6..bc9f59c246fd 100644 --- a/arch/x86/kernel/suspend_64.c +++ b/arch/x86/kernel/suspend_64.c @@ -32,9 +32,9 @@ void __save_processor_state(struct saved_context *ctxt) /* * descriptor tables */ - asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit)); - asm volatile ("sidt %0" : "=m" (ctxt->idt_limit)); - asm volatile ("str %0" : "=m" (ctxt->tr)); + store_gdt((struct desc_ptr *)&ctxt->gdt_limit); + store_idt((struct desc_ptr *)&ctxt->idt_limit); + store_tr(ctxt->tr); /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ /* @@ -91,8 +91,9 @@ void __restore_processor_state(struct saved_context *ctxt) * now restore the descriptor tables to their proper values * ltr is done i fix_processor_context(). */ - asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit)); - asm volatile ("lidt %0" :: "m" (ctxt->idt_limit)); + load_gdt((const struct desc_ptr *)&ctxt->gdt_limit); + load_idt((const struct desc_ptr *)&ctxt->idt_limit); + /* * segment registers @@ -123,7 +124,7 @@ void fix_processor_context(void) int cpu = smp_processor_id(); struct tss_struct *t = &per_cpu(init_tss, cpu); - set_tss_desc(cpu,t); /* This just modifies memory; should not be neccessary. But... This is neccessary, because 386 hardware has concept of busy TSS or some similar stupidity. 
*/ + set_tss_desc(cpu,t); /* This just modifies memory; should not be necessary. But... This is necessary, because 386 hardware has concept of busy TSS or some similar stupidity. */ cpu_gdt(cpu)[GDT_ENTRY_TSS].type = 9; @@ -150,8 +151,22 @@ void fix_processor_context(void) /* Defined in arch/x86_64/kernel/suspend_asm.S */ extern int restore_image(void); +/* + * Address to jump to in the last phase of restore in order to get to the image + * kernel's text (this value is passed in the image header). + */ +unsigned long restore_jump_address; + +/* + * Value of the cr3 register from before the hibernation (this value is passed + * in the image header). + */ +unsigned long restore_cr3; + pgd_t *temp_level4_pgt; +void *relocated_restore_code; + static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) { long i, j; @@ -175,7 +190,7 @@ static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long en if (paddr >= end) break; - pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | paddr; + pe = __PAGE_KERNEL_LARGE_EXEC | paddr; pe &= __supported_pte_mask; set_pmd(pmd, __pmd(pe)); } @@ -183,25 +198,42 @@ static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long en return 0; } +static int res_kernel_text_pud_init(pud_t *pud, unsigned long start) +{ + pmd_t *pmd; + unsigned long paddr; + + pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); + if (!pmd) + return -ENOMEM; + set_pud(pud + pud_index(start), __pud(__pa(pmd) | _KERNPG_TABLE)); + for (paddr = 0; paddr < KERNEL_TEXT_SIZE; pmd++, paddr += PMD_SIZE) { + unsigned long pe; + + pe = __PAGE_KERNEL_LARGE_EXEC | _PAGE_GLOBAL | paddr; + pe &= __supported_pte_mask; + set_pmd(pmd, __pmd(pe)); + } + + return 0; +} + static int set_up_temporary_mappings(void) { unsigned long start, end, next; + pud_t *pud; int error; temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC); if (!temp_level4_pgt) return -ENOMEM; - /* It is safe to reuse the original kernel mapping */ - set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map), - init_level4_pgt[pgd_index(__START_KERNEL_map)]); - /* Set up the direct mapping from scratch */ start = (unsigned long)pfn_to_kaddr(0); end = (unsigned long)pfn_to_kaddr(end_pfn); for (; start < end; start = next) { - pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC); + pud = (pud_t *)get_safe_page(GFP_ATOMIC); if (!pud) return -ENOMEM; next = start + PGDIR_SIZE; @@ -212,7 +244,17 @@ static int set_up_temporary_mappings(void) set_pgd(temp_level4_pgt + pgd_index(start), mk_kernel_pgd(__pa(pud))); } - return 0; + + /* Set up the kernel text mapping from scratch */ + pud = (pud_t *)get_safe_page(GFP_ATOMIC); + if (!pud) + return -ENOMEM; + error = res_kernel_text_pud_init(pud, __START_KERNEL_map); + if (!error) + set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map), + __pgd(__pa(pud) | _PAGE_TABLE)); + + return error; } int swsusp_arch_resume(void) @@ -222,6 +264,13 @@ int swsusp_arch_resume(void) /* We have got enough memory and from now on we cannot recover */ if ((error = set_up_temporary_mappings())) return error; + + relocated_restore_code = (void *)get_safe_page(GFP_ATOMIC); + if (!relocated_restore_code) + return -ENOMEM; + memcpy(relocated_restore_code, &core_restore_code, + &restore_registers - &core_restore_code); + restore_image(); return 0; } @@ -236,4 +285,43 @@ int pfn_is_nosave(unsigned long pfn) unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); } + +struct restore_data_record { + unsigned 
long jump_address; + unsigned long cr3; + unsigned long magic; +}; + +#define RESTORE_MAGIC 0x0123456789ABCDEFUL + +/** + * arch_hibernation_header_save - populate the architecture specific part + * of a hibernation image header + * @addr: address to save the data at + */ +int arch_hibernation_header_save(void *addr, unsigned int max_size) +{ + struct restore_data_record *rdr = addr; + + if (max_size < sizeof(struct restore_data_record)) + return -EOVERFLOW; + rdr->jump_address = restore_jump_address; + rdr->cr3 = restore_cr3; + rdr->magic = RESTORE_MAGIC; + return 0; +} + +/** + * arch_hibernation_header_restore - read the architecture specific data + * from the hibernation image header + * @addr: address to read the data from + */ +int arch_hibernation_header_restore(void *addr) +{ + struct restore_data_record *rdr = addr; + + restore_jump_address = rdr->jump_address; + restore_cr3 = rdr->cr3; + return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL; +} #endif /* CONFIG_HIBERNATION */ diff --git a/arch/x86/kernel/suspend_asm_64.S b/arch/x86/kernel/suspend_asm_64.S index 16d183f67bc1..48344b666d2c 100644 --- a/arch/x86/kernel/suspend_asm_64.S +++ b/arch/x86/kernel/suspend_asm_64.S @@ -2,8 +2,8 @@ * * Distribute under GPLv2. * - * swsusp_arch_resume may not use any stack, nor any variable that is - * not "NoSave" during copying pages: + * swsusp_arch_resume must not use any stack or any nonlocal variables while + * copying pages: * * Its rewriting one kernel image with another. What is stack in "old" * image could very well be data page in "new" image, and overwriting @@ -36,6 +36,13 @@ ENTRY(swsusp_arch_suspend) movq %r15, saved_context_r15(%rip) pushfq ; popq saved_context_eflags(%rip) + /* save the address of restore_registers */ + movq $restore_registers, %rax + movq %rax, restore_jump_address(%rip) + /* save cr3 */ + movq %cr3, %rax + movq %rax, restore_cr3(%rip) + call swsusp_save ret @@ -54,7 +61,17 @@ ENTRY(restore_image) movq %rcx, %cr3; movq %rax, %cr4; # turn PGE back on + /* prepare to jump to the image kernel */ + movq restore_jump_address(%rip), %rax + movq restore_cr3(%rip), %rbx + + /* prepare to copy image data to their original locations */ movq restore_pblist(%rip), %rdx + movq relocated_restore_code(%rip), %rcx + jmpq *%rcx + + /* code below has been relocated to a safe page */ +ENTRY(core_restore_code) loop: testq %rdx, %rdx jz done @@ -62,7 +79,7 @@ loop: /* get addresses from the pbe and copy the page */ movq pbe_address(%rdx), %rsi movq pbe_orig_address(%rdx), %rdi - movq $512, %rcx + movq $(PAGE_SIZE >> 3), %rcx rep movsq @@ -70,10 +87,22 @@ loop: movq pbe_next(%rdx), %rdx jmp loop done: + /* jump to the restore_registers address from the image header */ + jmpq *%rax + /* + * NOTE: This assumes that the boot kernel's text mapping covers the + * image kernel's page containing restore_registers and the address of + * this page is the same as in the image kernel's text mapping (it + * should always be true, because the text mapping is linear, starting + * from 0, and is supposed to cover the entire kernel text for every + * kernel). 
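Taken together, the suspend_64.c and suspend_asm_64.S changes implement a hand-off from the boot kernel to the image kernel. A commented summary of the resume path, restating the hunks above rather than adding new code:

/*
 * Resume flow (summary of the changes above):
 *
 * swsusp_arch_resume():
 *      set_up_temporary_mappings();    direct map plus a fresh kernel-text map
 *      relocated_restore_code = get_safe_page(GFP_ATOMIC);
 *      memcpy(relocated_restore_code, &core_restore_code,
 *             &restore_registers - &core_restore_code);   move the copy loop
 *      restore_image();                the assembly below
 *
 * restore_image (still the boot kernel):
 *      %rax = restore_jump_address;    both values come from the image header
 *      %rbx = restore_cr3;
 *      jmpq *relocated_restore_code;   run the copy loop from the safe page
 *
 * core_restore_code (running from the safe page):
 *      copy every pbe page back to its original location, then jmpq *%rax
 *
 * restore_registers (now in the image kernel):
 *      load %rbx into %cr3, flush the TLB, restore the saved registers,
 *      clear in_suspend and return
 */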
+ * + * code below belongs to the image kernel + */ + +ENTRY(restore_registers) /* go back to the original page tables */ - movq $(init_level4_pgt - __START_KERNEL_map), %rax - addq phys_base(%rip), %rax - movq %rax, %cr3 + movq %rbx, %cr3 /* Flush TLB, including "global" things (vmalloc) */ movq mmu_cr4_features(%rip), %rax @@ -84,12 +113,9 @@ done: movq %rcx, %cr3 movq %rax, %cr4; # turn PGE back on - movl $24, %eax - movl %eax, %ds - movq saved_context_esp(%rip), %rsp movq saved_context_ebp(%rip), %rbp - /* Don't restore %rax, it must be 0 anyway */ + /* restore GPRs (we don't restore %rax, it must be 0 anyway) */ movq saved_context_ebx(%rip), %rbx movq saved_context_ecx(%rip), %rcx movq saved_context_edx(%rip), %rdx @@ -107,4 +133,7 @@ done: xorq %rax, %rax + /* tell the hibernation core that we've just restored the memory */ + movq %rax, in_suspend(%rip) + ret diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index b132d3957dfc..cc9acace7e23 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -63,6 +63,9 @@ int panic_on_unrecovered_nmi; +DECLARE_BITMAP(used_vectors, NR_VECTORS); +EXPORT_SYMBOL_GPL(used_vectors); + asmlinkage int system_call(void); /* Do we ignore FPU interrupts ? */ @@ -288,48 +291,24 @@ EXPORT_SYMBOL(dump_stack); void show_registers(struct pt_regs *regs) { int i; - int in_kernel = 1; - unsigned long esp; - unsigned short ss, gs; - - esp = (unsigned long) (®s->esp); - savesegment(ss, ss); - savesegment(gs, gs); - if (user_mode_vm(regs)) { - in_kernel = 0; - esp = regs->esp; - ss = regs->xss & 0xffff; - } + print_modules(); - printk(KERN_EMERG "CPU: %d\n" - KERN_EMERG "EIP: %04x:[<%08lx>] %s VLI\n" - KERN_EMERG "EFLAGS: %08lx (%s %.*s)\n", - smp_processor_id(), 0xffff & regs->xcs, regs->eip, - print_tainted(), regs->eflags, init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip); - printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", - regs->eax, regs->ebx, regs->ecx, regs->edx); - printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", - regs->esi, regs->edi, regs->ebp, esp); - printk(KERN_EMERG "ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n", - regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss); + __show_registers(regs, 0); printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", - TASK_COMM_LEN, current->comm, current->pid, + TASK_COMM_LEN, current->comm, task_pid_nr(current), current_thread_info(), current, task_thread_info(current)); /* * When in-kernel, we also print out the stack and code at the * time of the fault.. 
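The used_vectors bitmap declared in the traps_32.c hunk gives vector allocators one place to see which IDT vectors are already spoken for; trap_init() further down marks the CPU trap vectors (0 .. FIRST_EXTERNAL_VECTOR-1) and SYSCALL_VECTOR as used. A hedged sketch of how a consumer could claim a free vector (the helper is invented; test_and_set_bit() is the generic bitmap API):

/* Illustrative sketch only, not part of the patch. */
static int claim_free_vector(void)
{
        int vector;

        /* bits for the trap vectors and the syscall vector are already set,
         * so test_and_set_bit() naturally skips them */
        for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++)
                if (!test_and_set_bit(vector, used_vectors))
                        return vector;

        return -ENOSPC;
}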
*/ - if (in_kernel) { + if (!user_mode_vm(regs)) { u8 *eip; unsigned int code_prologue = code_bytes * 43 / 64; unsigned int code_len = code_bytes; unsigned char c; printk("\n" KERN_EMERG "Stack: "); - show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG); + show_stack_log_lvl(NULL, regs, ®s->esp, KERN_EMERG); printk(KERN_EMERG "Code: "); @@ -374,11 +353,11 @@ int is_valid_bugaddr(unsigned long eip) void die(const char * str, struct pt_regs * regs, long err) { static struct { - spinlock_t lock; + raw_spinlock_t lock; u32 lock_owner; int lock_owner_depth; } die = { - .lock = __SPIN_LOCK_UNLOCKED(die.lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED, .lock_owner = -1, .lock_owner_depth = 0 }; @@ -389,13 +368,14 @@ void die(const char * str, struct pt_regs * regs, long err) if (die.lock_owner != raw_smp_processor_id()) { console_verbose(); - spin_lock_irqsave(&die.lock, flags); + __raw_spin_lock(&die.lock); + raw_local_save_flags(flags); die.lock_owner = smp_processor_id(); die.lock_owner_depth = 0; bust_spinlocks(1); } else - local_save_flags(flags); + raw_local_save_flags(flags); if (++die.lock_owner_depth < 3) { unsigned long esp; @@ -439,7 +419,8 @@ void die(const char * str, struct pt_regs * regs, long err) bust_spinlocks(0); die.lock_owner = -1; add_taint(TAINT_DIE); - spin_unlock_irqrestore(&die.lock, flags); + __raw_spin_unlock(&die.lock); + raw_local_irq_restore(flags); if (!regs) return; @@ -622,7 +603,7 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, printk_ratelimit()) printk(KERN_INFO "%s[%d] general protection eip:%lx esp:%lx error:%lx\n", - current->comm, current->pid, + current->comm, task_pid_nr(current), regs->eip, regs->esp, error_code); force_sig(SIGSEGV, current); @@ -1142,6 +1123,8 @@ static void __init set_task_gate(unsigned int n, unsigned int gdt_entry) void __init trap_init(void) { + int i; + #ifdef CONFIG_EISA void __iomem *p = ioremap(0x0FFFD9, 4); if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) { @@ -1201,6 +1184,11 @@ void __init trap_init(void) set_system_gate(SYSCALL_VECTOR,&system_call); + /* Reserve all the builtin and the syscall vector. */ + for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) + set_bit(i, used_vectors); + set_bit(SYSCALL_VECTOR, used_vectors); + /* * Should be a barrier for any external CPU state. */ diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index b4a9b3db1994..d0c2bc7ab2ec 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -201,7 +201,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, #define MSG(txt) ops->warning(data, txt) /* - * x86-64 can have upto three kernel stacks: + * x86-64 can have up to three kernel stacks: * process stack * interrupt stack * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack @@ -462,7 +462,7 @@ void out_of_line_bug(void) EXPORT_SYMBOL(out_of_line_bug); #endif -static DEFINE_SPINLOCK(die_lock); +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; static int die_owner = -1; static unsigned int die_nest_count; @@ -474,13 +474,13 @@ unsigned __kprobes long oops_begin(void) oops_enter(); /* racy, but better than risking deadlock. */ - local_irq_save(flags); + raw_local_irq_save(flags); cpu = smp_processor_id(); - if (!spin_trylock(&die_lock)) { + if (!__raw_spin_trylock(&die_lock)) { if (cpu == die_owner) /* nested oops. 
should stop eventually */; else - spin_lock(&die_lock); + __raw_spin_lock(&die_lock); } die_nest_count++; die_owner = cpu; @@ -494,12 +494,10 @@ void __kprobes oops_end(unsigned long flags) die_owner = -1; bust_spinlocks(0); die_nest_count--; - if (die_nest_count) - /* We still own the lock */ - local_irq_restore(flags); - else + if (!die_nest_count) /* Nest count reaches zero, release the lock. */ - spin_unlock_irqrestore(&die_lock, flags); + __raw_spin_unlock(&die_lock); + raw_local_irq_restore(flags); if (panic_on_oops) panic("Fatal exception"); oops_exit(); diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index e87a3939ed40..d78444c788a3 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -59,7 +59,7 @@ int check_tsc_unstable(void) } EXPORT_SYMBOL_GPL(check_tsc_unstable); -/* Accellerators for sched_clock() +/* Accelerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) * basic equation: * ns = cycles / (freq / ns_per_sec) @@ -74,7 +74,7 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable); * And since SC is a constant power of two, we can convert the div * into a shift. * - * We can use khz divisor instead of mhz to keep a better percision, since + * We can use khz divisor instead of mhz to keep a better precision, since * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. * (mathieu.desnoyers@polymtl.ca) * @@ -181,8 +181,8 @@ int recalibrate_cpu_khz(void) if (cpu_has_tsc) { cpu_khz = calculate_cpu_khz(); tsc_khz = cpu_khz; - cpu_data[0].loops_per_jiffy = - cpufreq_scale(cpu_data[0].loops_per_jiffy, + cpu_data(0).loops_per_jiffy = + cpufreq_scale(cpu_data(0).loops_per_jiffy, cpu_khz_old, cpu_khz); return 0; } else @@ -215,7 +215,7 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) return 0; } ref_freq = freq->old; - loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; + loops_per_jiffy_ref = cpu_data(freq->cpu).loops_per_jiffy; cpu_khz_ref = cpu_khz; } @@ -223,7 +223,7 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || (val == CPUFREQ_RESUMECHANGE)) { if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - cpu_data[freq->cpu].loops_per_jiffy = + cpu_data(freq->cpu).loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index 9f22e542c374..9c70af45b42b 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -73,13 +73,13 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, struct cpufreq_freqs *freq = data; unsigned long *lpj, dummy; - if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC)) + if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC)) return 0; lpj = &dummy; if (!(freq->flags & CPUFREQ_CONST_LOOPS)) #ifdef CONFIG_SMP - lpj = &cpu_data[freq->cpu].loops_per_jiffy; + lpj = &cpu_data(freq->cpu).loops_per_jiffy; #else lpj = &boot_cpu_data.loops_per_jiffy; #endif diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 8a67e282cb5e..ad4005c6d4a1 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -48,12 +48,12 @@ ({unsigned long v; \ extern char __vsyscall_0; \ asm("" : "=r" (v) : "0" (x)); \ - ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); }) + ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); }) /* * vsyscall_gtod_data contains data that is : * - readonly from vsyscalls - * - writen by timer interrupt 
or systcl (/proc/sys/kernel/vsyscall64) + * - written by timer interrupt or systcl (/proc/sys/kernel/vsyscall64) * Try to keep this structure as small as possible to avoid cache line ping pongs */ int __vgetcpu_mode __section_vgetcpu_mode; @@ -64,6 +64,16 @@ struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data = .sysctl_enabled = 1, }; +void update_vsyscall_tz(void) +{ + unsigned long flags; + + write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + /* sys_tz has changed */ + vsyscall_gtod_data.sys_tz = sys_tz; + write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); +} + void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) { unsigned long flags; @@ -77,7 +87,6 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) vsyscall_gtod_data.clock.shift = clock->shift; vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; - vsyscall_gtod_data.sys_tz = sys_tz; vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } @@ -163,7 +172,7 @@ time_t __vsyscall(1) vtime(time_t *t) if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) return time_syscall(t); - vgettimeofday(&tv, 0); + vgettimeofday(&tv, NULL); result = tv.tv_sec; if (t) *t = result; @@ -257,18 +266,10 @@ out: return ret; } -static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - static ctl_table kernel_table2[] = { - { .ctl_name = 99, .procname = "vsyscall64", + { .procname = "vsyscall64", .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), .mode = 0644, - .strategy = vsyscall_sysctl_nostrat, .proc_handler = vsyscall_sysctl_change }, {} }; @@ -290,7 +291,7 @@ static void __cpuinit vsyscall_set_cpu(int cpu) #ifdef CONFIG_NUMA node = cpu_to_node(cpu); #endif - if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) + if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) write_rdtscp_aux((node << 12) | cpu); /* Store cpu number in limit so that it can be loaded quickly diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig new file mode 100644 index 000000000000..c4dffbeea5e1 --- /dev/null +++ b/arch/x86/lguest/Kconfig @@ -0,0 +1,14 @@ +config LGUEST_GUEST + bool "Lguest guest support" + select PARAVIRT + depends on !X86_PAE + select VIRTIO + select VIRTIO_RING + select VIRTIO_CONSOLE + help + Lguest is a tiny in-kernel hypervisor. Selecting this will + allow your kernel to boot under lguest. This option will increase + your kernel size by about 6k. If in doubt, say N. + + If you say Y here, make sure you say Y (or M) to the virtio block + and net drivers which lguest needs. diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile new file mode 100644 index 000000000000..27f0c9ed7f60 --- /dev/null +++ b/arch/x86/lguest/Makefile @@ -0,0 +1 @@ +obj-y := i386_head.o boot.o diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c new file mode 100644 index 000000000000..d2235db4085f --- /dev/null +++ b/arch/x86/lguest/boot.c @@ -0,0 +1,1070 @@ +/*P:010 + * A hypervisor allows multiple Operating Systems to run on a single machine. + * To quote David Wheeler: "Any problem in computer science can be solved with + * another layer of indirection." + * + * We keep things simple in two ways. 
First, we start with a normal Linux + * kernel and insert a module (lg.ko) which allows us to run other Linux + * kernels the same way we'd run processes. We call the first kernel the Host, + * and the others the Guests. The program which sets up and configures Guests + * (such as the example in Documentation/lguest/lguest.c) is called the + * Launcher. + * + * Secondly, we only run specially modified Guests, not normal kernels. When + * you set CONFIG_LGUEST to 'y' or 'm', this automatically sets + * CONFIG_LGUEST_GUEST=y, which compiles this file into the kernel so it knows + * how to be a Guest. This means that you can use the same kernel you boot + * normally (ie. as a Host) as a Guest. + * + * These Guests know that they cannot do privileged operations, such as disable + * interrupts, and that they have to ask the Host to do such things explicitly. + * This file consists of all the replacements for such low-level native + * hardware operations: these special Guest versions call the Host. + * + * So how does the kernel know it's a Guest? The Guest starts at a special + * entry point marked with a magic string, which sets up a few things then + * calls here. We replace the native functions various "paravirt" structures + * with our Guest versions, then boot like normal. :*/ + +/* + * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include <linux/kernel.h> +#include <linux/start_kernel.h> +#include <linux/string.h> +#include <linux/console.h> +#include <linux/screen_info.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/clocksource.h> +#include <linux/clockchips.h> +#include <linux/lguest.h> +#include <linux/lguest_launcher.h> +#include <linux/virtio_console.h> +#include <asm/paravirt.h> +#include <asm/param.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/desc.h> +#include <asm/setup.h> +#include <asm/e820.h> +#include <asm/mce.h> +#include <asm/io.h> +#include <asm/i387.h> + +/*G:010 Welcome to the Guest! + * + * The Guest in our tale is a simple creature: identical to the Host but + * behaving in simplified but equivalent ways. In particular, the Guest is the + * same kernel as the Host (or at least, built from the same source code). :*/ + +/* Declarations for definitions in lguest_guest.S */ +extern char lguest_noirq_start[], lguest_noirq_end[]; +extern const char lgstart_cli[], lgend_cli[]; +extern const char lgstart_sti[], lgend_sti[]; +extern const char lgstart_popf[], lgend_popf[]; +extern const char lgstart_pushf[], lgend_pushf[]; +extern const char lgstart_iret[], lgend_iret[]; +extern void lguest_iret(void); + +struct lguest_data lguest_data = { + .hcall_status = { [0 ... 
LHCALL_RING_SIZE-1] = 0xFF }, + .noirq_start = (u32)lguest_noirq_start, + .noirq_end = (u32)lguest_noirq_end, + .kernel_address = PAGE_OFFSET, + .blocked_interrupts = { 1 }, /* Block timer interrupts */ + .syscall_vec = SYSCALL_VECTOR, +}; +static cycle_t clock_base; + +/*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first + * real optimization trick! + * + * When lazy_mode is set, it means we're allowed to defer all hypercalls and do + * them as a batch when lazy_mode is eventually turned off. Because hypercalls + * are reasonably expensive, batching them up makes sense. For example, a + * large mmap might update dozens of page table entries: that code calls + * paravirt_enter_lazy_mmu(), does the dozen updates, then calls + * lguest_leave_lazy_mode(). + * + * So, when we're in lazy mode, we call async_hypercall() to store the call for + * future processing. When lazy mode is turned off we issue a hypercall to + * flush the stored calls. + */ +static void lguest_leave_lazy_mode(void) +{ + paravirt_leave_lazy(paravirt_get_lazy_mode()); + hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0); +} + +static void lazy_hcall(unsigned long call, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3) +{ + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) + hcall(call, arg1, arg2, arg3); + else + async_hcall(call, arg1, arg2, arg3); +} + +/* async_hcall() is pretty simple: I'm quite proud of it really. We have a + * ring buffer of stored hypercalls which the Host will run though next time we + * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall + * arguments, and a "hcall_status" word which is 0 if the call is ready to go, + * and 255 once the Host has finished with it. + * + * If we come around to a slot which hasn't been finished, then the table is + * full and we just make the hypercall directly. This has the nice side + * effect of causing the Host to run all the stored calls in the ring buffer + * which empties it for next time! */ +void async_hcall(unsigned long call, + unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + /* Note: This code assumes we're uniprocessor. */ + static unsigned int next_call; + unsigned long flags; + + /* Disable interrupts if not already disabled: we don't want an + * interrupt handler making a hypercall while we're already doing + * one! */ + local_irq_save(flags); + if (lguest_data.hcall_status[next_call] != 0xFF) { + /* Table full, so do normal hcall which will flush table. */ + hcall(call, arg1, arg2, arg3); + } else { + lguest_data.hcalls[next_call].arg0 = call; + lguest_data.hcalls[next_call].arg1 = arg1; + lguest_data.hcalls[next_call].arg2 = arg2; + lguest_data.hcalls[next_call].arg3 = arg3; + /* Arguments must all be written before we mark it to go */ + wmb(); + lguest_data.hcall_status[next_call] = 0; + if (++next_call == LHCALL_RING_SIZE) + next_call = 0; + } + local_irq_restore(flags); +} +/*:*/ + +/*G:033 + * Here are our first native-instruction replacements: four functions for + * interrupt control. + * + * The simplest way of implementing these would be to have "turn interrupts + * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow: + * these are by far the most commonly called functions of those we override. + * + * So instead we keep an "irq_enabled" field inside our "struct lguest_data", + * which the Guest can update with a single instruction. The Host knows to + * check there when it wants to deliver an interrupt. 
+ */ + +/* save_flags() is expected to return the processor state (ie. "eflags"). The + * eflags word contains all kind of stuff, but in practice Linux only cares + * about the interrupt flag. Our "save_flags()" just returns that. */ +static unsigned long save_fl(void) +{ + return lguest_data.irq_enabled; +} + +/* "restore_flags" just sets the flags back to the value given. */ +static void restore_fl(unsigned long flags) +{ + lguest_data.irq_enabled = flags; +} + +/* Interrupts go off... */ +static void irq_disable(void) +{ + lguest_data.irq_enabled = 0; +} + +/* Interrupts go on... */ +static void irq_enable(void) +{ + lguest_data.irq_enabled = X86_EFLAGS_IF; +} +/*:*/ +/*M:003 Note that we don't check for outstanding interrupts when we re-enable + * them (or when we unmask an interrupt). This seems to work for the moment, + * since interrupts are rare and we'll just get the interrupt on the next timer + * tick, but when we turn on CONFIG_NO_HZ, we should revisit this. One way + * would be to put the "irq_enabled" field in a page by itself, and have the + * Host write-protect it when an interrupt comes in when irqs are disabled. + * There will then be a page fault as soon as interrupts are re-enabled. :*/ + +/*G:034 + * The Interrupt Descriptor Table (IDT). + * + * The IDT tells the processor what to do when an interrupt comes in. Each + * entry in the table is a 64-bit descriptor: this holds the privilege level, + * address of the handler, and... well, who cares? The Guest just asks the + * Host to make the change anyway, because the Host controls the real IDT. + */ +static void lguest_write_idt_entry(struct desc_struct *dt, + int entrynum, u32 low, u32 high) +{ + /* Keep the local copy up to date. */ + write_dt_entry(dt, entrynum, low, high); + /* Tell Host about this new entry. */ + hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high); +} + +/* Changing to a different IDT is very rare: we keep the IDT up-to-date every + * time it is written, so we can simply loop through all entries and tell the + * Host about them. */ +static void lguest_load_idt(const struct Xgt_desc_struct *desc) +{ + unsigned int i; + struct desc_struct *idt = (void *)desc->address; + + for (i = 0; i < (desc->size+1)/8; i++) + hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b); +} + +/* + * The Global Descriptor Table. + * + * The Intel architecture defines another table, called the Global Descriptor + * Table (GDT). You tell the CPU where it is (and its size) using the "lgdt" + * instruction, and then several other instructions refer to entries in the + * table. There are three entries which the Switcher needs, so the Host simply + * controls the entire thing and the Guest asks it to make changes using the + * LOAD_GDT hypercall. + * + * This is the opposite of the IDT code where we have a LOAD_IDT_ENTRY + * hypercall and use that repeatedly to load a new IDT. I don't think it + * really matters, but wouldn't it be nice if they were the same? + */ +static void lguest_load_gdt(const struct Xgt_desc_struct *desc) +{ + BUG_ON((desc->size+1)/8 != GDT_ENTRIES); + hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0); +} + +/* For a single GDT entry which changes, we do the lazy thing: alter our GDT, + * then tell the Host to reload the entire thing. This operation is so rare + * that this naive implementation is reasonable. 
*/ +static void lguest_write_gdt_entry(struct desc_struct *dt, + int entrynum, u32 low, u32 high) +{ + write_dt_entry(dt, entrynum, low, high); + hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0); +} + +/* OK, I lied. There are three "thread local storage" GDT entries which change + * on every context switch (these three entries are how glibc implements + * __thread variables). So we have a hypercall specifically for this case. */ +static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) +{ + /* There's one problem which normal hardware doesn't have: the Host + * can't handle us removing entries we're currently using. So we clear + * the GS register here: if it's needed it'll be reloaded anyway. */ + loadsegment(gs, 0); + lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0); +} + +/*G:038 That's enough excitement for now, back to ploughing through each of + * the different pv_ops structures (we're about 1/3 of the way through). + * + * This is the Local Descriptor Table, another weird Intel thingy. Linux only + * uses this for some strange applications like Wine. We don't do anything + * here, so they'll get an informative and friendly Segmentation Fault. */ +static void lguest_set_ldt(const void *addr, unsigned entries) +{ +} + +/* This loads a GDT entry into the "Task Register": that entry points to a + * structure called the Task State Segment. Some comments scattered though the + * kernel code indicate that this used for task switching in ages past, along + * with blood sacrifice and astrology. + * + * Now there's nothing interesting in here that we don't get told elsewhere. + * But the native version uses the "ltr" instruction, which makes the Host + * complain to the Guest about a Segmentation Fault and it'll oops. So we + * override the native version with a do-nothing version. */ +static void lguest_load_tr_desc(void) +{ +} + +/* The "cpuid" instruction is a way of querying both the CPU identity + * (manufacturer, model, etc) and its features. It was introduced before the + * Pentium in 1993 and keeps getting extended by both Intel and AMD. As you + * might imagine, after a decade and a half this treatment, it is now a giant + * ball of hair. Its entry in the current Intel manual runs to 28 pages. + * + * This instruction even it has its own Wikipedia entry. The Wikipedia entry + * has been translated into 4 languages. I am not making this up! + * + * We could get funky here and identify ourselves as "GenuineLguest", but + * instead we just use the real "cpuid" instruction. Then I pretty much turned + * off feature bits until the Guest booted. (Don't say that: you'll damage + * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is + * hardly future proof.) Noone's listening! They don't like you anyway, + * parenthetic weirdo! + * + * Replacing the cpuid so we can turn features off is great for the kernel, but + * anyone (including userspace) can just use the raw "cpuid" instruction and + * the Host won't even notice since it isn't privileged. So we try not to get + * too worked up about it. */ +static void lguest_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + int function = *eax; + + native_cpuid(eax, ebx, ecx, edx); + switch (function) { + case 1: /* Basic feature request. */ + /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ + *ecx &= 0x00002201; + /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. 
*/ + *edx &= 0x07808101; + /* The Host can do a nice optimization if it knows that the + * kernel mappings (addresses above 0xC0000000 or whatever + * PAGE_OFFSET is set to) haven't changed. But Linux calls + * flush_tlb_user() for both user and kernel mappings unless + * the Page Global Enable (PGE) feature bit is set. */ + *edx |= 0x00002000; + break; + case 0x80000000: + /* Futureproof this a little: if they ask how much extended + * processor information there is, limit it to known fields. */ + if (*eax > 0x80000008) + *eax = 0x80000008; + break; + } +} + +/* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. + * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother + * it. The Host needs to know when the Guest wants to change them, so we have + * a whole series of functions like read_cr0() and write_cr0(). + * + * We start with CR0. CR0 allows you to turn on and off all kinds of basic + * features, but Linux only really cares about one: the horrifically-named Task + * Switched (TS) bit at bit 3 (ie. 8) + * + * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if + * the floating point unit is used. Which allows us to restore FPU state + * lazily after a task switch, and Linux uses that gratefully, but wouldn't a + * name like "FPUTRAP bit" be a little less cryptic? + * + * We store cr0 (and cr3) locally, because the Host never changes it. The + * Guest sometimes wants to read it and we'd prefer not to bother the Host + * unnecessarily. */ +static unsigned long current_cr0, current_cr3; +static void lguest_write_cr0(unsigned long val) +{ + /* 8 == TS bit. */ + lazy_hcall(LHCALL_TS, val & 8, 0, 0); + current_cr0 = val; +} + +static unsigned long lguest_read_cr0(void) +{ + return current_cr0; +} + +/* Intel provided a special instruction to clear the TS bit for people too cool + * to use write_cr0() to do it. This "clts" instruction is faster, because all + * the vowels have been optimized out. */ +static void lguest_clts(void) +{ + lazy_hcall(LHCALL_TS, 0, 0, 0); + current_cr0 &= ~8U; +} + +/* CR2 is the virtual address of the last page fault, which the Guest only ever + * reads. The Host kindly writes this into our "struct lguest_data", so we + * just read it out of there. */ +static unsigned long lguest_read_cr2(void) +{ + return lguest_data.cr2; +} + +/* CR3 is the current toplevel pagetable page: the principle is the same as + * cr0. Keep a local copy, and tell the Host when it changes. */ +static void lguest_write_cr3(unsigned long cr3) +{ + lazy_hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0); + current_cr3 = cr3; +} + +static unsigned long lguest_read_cr3(void) +{ + return current_cr3; +} + +/* CR4 is used to enable and disable PGE, but we don't care. */ +static unsigned long lguest_read_cr4(void) +{ + return 0; +} + +static void lguest_write_cr4(unsigned long val) +{ +} + +/* + * Page Table Handling. + * + * Now would be a good time to take a rest and grab a coffee or similarly + * relaxing stimulant. The easy parts are behind us, and the trek gradually + * winds uphill from here. + * + * Quick refresher: memory is divided into "pages" of 4096 bytes each. The CPU + * maps virtual addresses to physical addresses using "page tables". We could + * use one huge index of 1 million entries: each address is 4 bytes, so that's + * 1024 pages just to hold the page tables. But since most virtual addresses + * are unused, we use a two level index which saves space. 
The CR3 register + * contains the physical address of the top level "page directory" page, which + * contains physical addresses of up to 1024 second-level pages. Each of these + * second level pages contains up to 1024 physical addresses of actual pages, + * or Page Table Entries (PTEs). + * + * Here's a diagram, where arrows indicate physical addresses: + * + * CR3 ---> +---------+ + * | --------->+---------+ + * | | | PADDR1 | + * Top-level | | PADDR2 | + * (PMD) page | | | + * | | Lower-level | + * | | (PTE) page | + * | | | | + * .... .... + * + * So to convert a virtual address to a physical address, we look up the top + * level, which points us to the second level, which gives us the physical + * address of that page. If the top level entry was not present, or the second + * level entry was not present, then the virtual address is invalid (we + * say "the page was not mapped"). + * + * Put another way, a 32-bit virtual address is divided up like so: + * + * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>| + * Index into top Index into second Offset within page + * page directory page pagetable page + * + * The kernel spends a lot of time changing both the top-level page directory + * and lower-level pagetable pages. The Guest doesn't know physical addresses, + * so while it maintains these page tables exactly like normal, it also needs + * to keep the Host informed whenever it makes a change: the Host will create + * the real page tables based on the Guests'. + */ + +/* The Guest calls this to set a second-level entry (pte), ie. to map a page + * into a process' address space. We set the entry then tell the Host the + * toplevel and address this corresponds to. The Guest uses one pagetable per + * process, so we need to tell the Host which one we're changing (mm->pgd). */ +static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval) +{ + *ptep = pteval; + lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low); +} + +/* The Guest calls this to set a top-level entry. Again, we set the entry then + * tell the Host which top-level page we changed, and the index of the entry we + * changed. */ +static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) +{ + *pmdp = pmdval; + lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK, + (__pa(pmdp)&(PAGE_SIZE-1))/4, 0); +} + +/* There are a couple of legacy places where the kernel sets a PTE, but we + * don't know the top level any more. This is useless for us, since we don't + * know which pagetable is changing or what address, so we just tell the Host + * to forget all of them. Fortunately, this is very rare. + * + * ... except in early boot when the kernel sets up the initial pagetables, + * which makes booting astonishingly slow. So we don't even tell the Host + * anything changed until we've done the first page table switch. + */ +static void lguest_set_pte(pte_t *ptep, pte_t pteval) +{ + *ptep = pteval; + /* Don't bother with hypercall before initial setup. */ + if (current_cr3) + lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); +} + +/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on + * native page table operations. On native hardware you can set a new page + * table entry whenever you want, but if you want to remove one you have to do + * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). 
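To make the two-level split above concrete: with 4 KB pages and no PAE (the lguest Kconfig above requires !X86_PAE), a 32-bit virtual address decomposes as in this small sketch:

/* Illustrative sketch only, not part of the patch. */
static void decompose(unsigned long vaddr)
{
        unsigned int pgd_index = vaddr >> 22;            /* top 10 bits    */
        unsigned int pte_index = (vaddr >> 12) & 0x3ff;  /* middle 10 bits */
        unsigned int offset    = vaddr & 0xfff;          /* low 12 bits    */

        /*
         * Example: vaddr = 0xc0000000 (the usual i386 PAGE_OFFSET) gives
         * pgd_index = 768, pte_index = 0, offset = 0.
         */
        (void)pgd_index;
        (void)pte_index;
        (void)offset;
}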
+ * + * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only + * called when a valid entry is written, not when it's removed (ie. marked not + * present). Instead, this is where we come when the Guest wants to remove a + * page table entry: we tell the Host to set that entry to 0 (ie. the present + * bit is zero). */ +static void lguest_flush_tlb_single(unsigned long addr) +{ + /* Simply set it to zero: if it was not, it will fault back in. */ + lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0); +} + +/* This is what happens after the Guest has removed a large number of entries. + * This tells the Host that any of the page table entries for userspace might + * have changed, ie. virtual addresses below PAGE_OFFSET. */ +static void lguest_flush_tlb_user(void) +{ + lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0); +} + +/* This is called when the kernel page tables have changed. That's not very + * common (unless the Guest is using highmem, which makes the Guest extremely + * slow), so it's worth separating this from the user flushing above. */ +static void lguest_flush_tlb_kernel(void) +{ + lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0); +} + +/* + * The Unadvanced Programmable Interrupt Controller. + * + * This is an attempt to implement the simplest possible interrupt controller. + * I spent some time looking though routines like set_irq_chip_and_handler, + * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and + * I *think* this is as simple as it gets. + * + * We can tell the Host what interrupts we want blocked ready for using the + * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as + * simple as setting a bit. We don't actually "ack" interrupts as such, we + * just mask and unmask them. I wonder if we should be cleverer? + */ +static void disable_lguest_irq(unsigned int irq) +{ + set_bit(irq, lguest_data.blocked_interrupts); +} + +static void enable_lguest_irq(unsigned int irq) +{ + clear_bit(irq, lguest_data.blocked_interrupts); +} + +/* This structure describes the lguest IRQ controller. */ +static struct irq_chip lguest_irq_controller = { + .name = "lguest", + .mask = disable_lguest_irq, + .mask_ack = disable_lguest_irq, + .unmask = enable_lguest_irq, +}; + +/* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware + * interrupt (except 128, which is used for system calls), and then tells the + * Linux infrastructure that each interrupt is controlled by our level-based + * lguest interrupt controller. */ +static void __init lguest_init_IRQ(void) +{ + unsigned int i; + + for (i = 0; i < LGUEST_IRQS; i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (vector != SYSCALL_VECTOR) { + set_intr_gate(vector, interrupt[i]); + set_irq_chip_and_handler(i, &lguest_irq_controller, + handle_level_irq); + } + } + /* This call is required to set up for 4k stacks, where we have + * separate stacks for hard and soft interrupts. */ + irq_ctx_init(smp_processor_id()); +} + +/* + * Time. + * + * It would be far better for everyone if the Guest had its own clock, but + * until then the Host gives us the time on every interrupt. + */ +static unsigned long lguest_get_wallclock(void) +{ + return lguest_data.time.tv_sec; +} + +static cycle_t lguest_clock_read(void) +{ + unsigned long sec, nsec; + + /* If the Host tells the TSC speed, we can trust that. */ + if (lguest_data.tsc_khz) + return native_read_tsc(); + + /* If we can't use the TSC, we read the time value written by the Host. 
+ * Since it's in two parts (seconds and nanoseconds), we risk reading + * it just as it's changing from 99 & 0.999999999 to 100 and 0, and + * getting 99 and 0. As Linux tends to come apart under the stress of + * time travel, we must be careful: */ + do { + /* First we read the seconds part. */ + sec = lguest_data.time.tv_sec; + /* This read memory barrier tells the compiler and the CPU that + * this can't be reordered: we have to complete the above + * before going on. */ + rmb(); + /* Now we read the nanoseconds part. */ + nsec = lguest_data.time.tv_nsec; + /* Make sure we've done that. */ + rmb(); + /* Now if the seconds part has changed, try again. */ + } while (unlikely(lguest_data.time.tv_sec != sec)); + + /* Our non-TSC clock is in real nanoseconds. */ + return sec*1000000000ULL + nsec; +} + +/* This is what we tell the kernel is our clocksource. */ +static struct clocksource lguest_clock = { + .name = "lguest", + .rating = 400, + .read = lguest_clock_read, + .mask = CLOCKSOURCE_MASK(64), + .mult = 1 << 22, + .shift = 22, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +/* The "scheduler clock" is just our real clock, adjusted to start at zero */ +static unsigned long long lguest_sched_clock(void) +{ + return cyc2ns(&lguest_clock, lguest_clock_read() - clock_base); +} + +/* We also need a "struct clock_event_device": Linux asks us to set it to go + * off some time in the future. Actually, James Morris figured all this out, I + * just applied the patch. */ +static int lguest_clockevent_set_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + if (delta < LG_CLOCK_MIN_DELTA) { + if (printk_ratelimit()) + printk(KERN_DEBUG "%s: small delta %lu ns\n", + __FUNCTION__, delta); + return -ETIME; + } + hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0); + return 0; +} + +static void lguest_clockevent_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + switch (mode) { + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + /* A 0 argument shuts the clock down. */ + hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0); + break; + case CLOCK_EVT_MODE_ONESHOT: + /* This is what we expect. */ + break; + case CLOCK_EVT_MODE_PERIODIC: + BUG(); + case CLOCK_EVT_MODE_RESUME: + break; + } +} + +/* This describes our primitive timer chip. */ +static struct clock_event_device lguest_clockevent = { + .name = "lguest", + .features = CLOCK_EVT_FEAT_ONESHOT, + .set_next_event = lguest_clockevent_set_next_event, + .set_mode = lguest_clockevent_set_mode, + .rating = INT_MAX, + .mult = 1, + .shift = 0, + .min_delta_ns = LG_CLOCK_MIN_DELTA, + .max_delta_ns = LG_CLOCK_MAX_DELTA, +}; + +/* This is the Guest timer interrupt handler (hardware interrupt 0). We just + * call the clockevent infrastructure and it does whatever needs doing. */ +static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) +{ + unsigned long flags; + + /* Don't interrupt us while this is running. */ + local_irq_save(flags); + lguest_clockevent.event_handler(&lguest_clockevent); + local_irq_restore(flags); +} + +/* At some point in the boot process, we get asked to set up our timing + * infrastructure. The kernel doesn't expect timer interrupts before this, but + * we cleverly initialized the "blocked_interrupts" field of "struct + * lguest_data" so that timer interrupts were blocked until now. 
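One detail worth spelling out in the clocksource above: the core converts cycles to nanoseconds as ns = (cycles * mult) >> shift. With the fallback values mult = 1 << 22 and shift = 22 the two cancel, so the non-TSC clock already counts nanoseconds, exactly as the comment says; lguest_time_init() below only recomputes mult when the Host supplies a TSC frequency. A worked example, assuming clocksource_khz2mult() computes roughly (10^6 << shift) / khz:

/*
 * Worked example, not part of the patch:
 *      ns = (cycles * mult) >> shift
 * fallback clock:  mult = 1 << 22, shift = 22   =>  ns == cycles
 * 2 GHz TSC:       mult ~= (1000000 << 22) / 2000000 = 1 << 21
 *                                               =>  ns ~= cycles / 2
 * i.e. two TSC cycles per nanosecond, as expected for a 2 GHz clock.
 */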
 */ +static void lguest_time_init(void) +{ + /* Set up the timer interrupt (0) to go to our simple timer routine */ + set_irq_handler(0, lguest_time_irq); + + /* Our clock structure looks like arch/i386/kernel/tsc.c if we can use + * the TSC, otherwise it's a dumb nanosecond-resolution clock. Either + * way, the "rating" is initialized so high that it's always chosen + * over any other clocksource. */ + if (lguest_data.tsc_khz) + lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz, + lguest_clock.shift); + clock_base = lguest_clock_read(); + clocksource_register(&lguest_clock); + + /* Now we've set up our clock, we can use it as the scheduler clock */ + pv_time_ops.sched_clock = lguest_sched_clock; + + /* We can't set cpumask in the initializer: damn C limitations! Set it + * here and register our timer device. */ + lguest_clockevent.cpumask = cpumask_of_cpu(0); + clockevents_register_device(&lguest_clockevent); + + /* Finally, we unblock the timer interrupt. */ + enable_lguest_irq(0); +} + +/* + * Miscellaneous bits and pieces. + * + * Here is an oddball collection of functions which the Guest needs for things + * to work. They're pretty simple. + */ + +/* The Guest needs to tell the Host what stack it expects traps to use. For + * native hardware, this is part of the Task State Segment mentioned above in + * lguest_load_tr_desc(), but to help hypervisors there's this special call. + * + * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data + * segment), the privilege level (we're privilege level 1, the Host is 0 and + * will not tolerate us trying to use that), the stack pointer, and the number + * of pages in the stack. */ +static void lguest_load_esp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->esp0, + THREAD_SIZE/PAGE_SIZE); +} + +/* Let's just say, I wouldn't do debugging under a Guest. */ +static void lguest_set_debugreg(int regno, unsigned long value) +{ + /* FIXME: Implement */ +} + +/* There are times when the kernel wants to make sure that no memory writes are + * caught in the cache (that they've all reached real hardware devices). This + * doesn't matter for the Guest which has virtual hardware. + * + * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush + * (clflush) instruction is available and the kernel uses that. Otherwise, it + * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction. + * Unlike clflush, wbinvd can only be run at privilege level 0. So we can + * ignore clflush, but replace wbinvd. + */ +static void lguest_wbinvd(void) +{ +} + +/* If the Guest expects to have an Advanced Programmable Interrupt Controller, + * we play dumb by ignoring writes and returning 0 for reads. So it's no + * longer Programmable nor Controlling anything, and I don't think 8 lines of + * code qualifies for Advanced. It will also never interrupt anything. It + * does, however, allow us to get through the Linux boot code. */ +#ifdef CONFIG_X86_LOCAL_APIC +static void lguest_apic_write(unsigned long reg, unsigned long v) +{ +} + +static unsigned long lguest_apic_read(unsigned long reg) +{ + return 0; +} +#endif + +/* STOP! Until an interrupt comes in. */ +static void lguest_safe_halt(void) +{ + hcall(LHCALL_HALT, 0, 0, 0); +} + +/* Perhaps CRASH isn't the best name for this hypercall, but we use it to get a + * message out when we're crashing as well as elegant termination like powering + * off.
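As background for the mult and shift values used by lguest_clock and clocksource_khz2mult() in lguest_time_init() above: the clocksource core converts raw counter values to nanoseconds as (cycles * mult) >> shift. The sketch below is illustrative only; the helper name is invented and the khz formula is an approximation of what the kernel computes, not taken from the kernel headers.

#include <stdint.h>

/* ns = (cycles * mult) >> shift, the standard clocksource scaling. */
static uint64_t cycles_to_ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
	return (cycles * mult) >> shift;
}

/*
 * For the non-TSC case above, mult = 1 << 22 and shift = 22, so the
 * conversion is the identity: the "cycles" are already nanoseconds.
 * For a TSC running at tsc_khz, mult is chosen as roughly
 * (1000000 << shift) / tsc_khz, so that tsc_khz * 1000 cycles map to
 * about 1000000000 ns, i.e. one second.
 */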
+ * + * Note that the Host always prefers that the Guest speak in physical addresses + * rather than virtual addresses, so we use __pa() here. */ +static void lguest_power_off(void) +{ + hcall(LHCALL_CRASH, __pa("Power down"), 0, 0); +} + +/* + * Panicking. + * + * Don't. But if you did, this is what happens. + */ +static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) +{ + hcall(LHCALL_CRASH, __pa(p), 0, 0); + /* The hcall won't return, but to keep gcc happy, we're "done". */ + return NOTIFY_DONE; +} + +static struct notifier_block paniced = { + .notifier_call = lguest_panic +}; + +/* Setting up memory is fairly easy. */ +static __init char *lguest_memory_setup(void) +{ + /* We do this here and not earlier because lockcheck barfs if we do it + * before start_kernel() */ + atomic_notifier_chain_register(&panic_notifier_list, &paniced); + + /* The Linux bootloader header contains an "e820" memory map: the + * Launcher populated the first entry with our memory limit. */ + add_memory_region(boot_params.e820_map[0].addr, + boot_params.e820_map[0].size, + boot_params.e820_map[0].type); + + /* This string is for the boot messages. */ + return "LGUEST"; +} + +/* Before virtqueues are set up, we use LHCALL_NOTIFY on normal memory to + * produce console output. */ +static __init int early_put_chars(u32 vtermno, const char *buf, int count) +{ + char scratch[17]; + unsigned int len = count; + + if (len > sizeof(scratch) - 1) + len = sizeof(scratch) - 1; + scratch[len] = '\0'; + memcpy(scratch, buf, len); + hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0); + + /* This routine returns the number of bytes actually written. */ + return len; +} + +/*G:050 + * Patching (Powerfully Placating Performance Pedants) + * + * We have already seen that pv_ops structures let us replace simple + * native instructions with calls to the appropriate back end all throughout + * the kernel. This allows the same kernel to run as a Guest and as a native + * kernel, but it's slow because of all the indirect branches. + * + * Remember that David Wheeler quote about "Any problem in computer science can + * be solved with another layer of indirection"? The rest of that quote is + * "... But that usually will create another problem." This is the first of + * those problems. + * + * Our current solution is to allow the paravirt back end to optionally patch + * over the indirect calls to replace them with something more efficient. We + * patch the four most commonly called functions: disable interrupts, enable + * interrupts, restore interrupts and save interrupts. We usually have 10 + * bytes to patch into: the Guest versions of these operations are small enough + * that we can fit comfortably. + * + * First we need assembly templates of each of the patchable Guest operations, + * and these are in lguest_asm.S. */ + +/*G:060 We construct a table from the assembler templates: */ +static const struct lguest_insns +{ + const char *start, *end; +} lguest_insns[] = { + [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, + [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti }, + [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf }, + [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, +}; + +/* Now our patch routine is fairly simple (based on the native one in + * paravirt.c). If we have a replacement, we copy it in and return how much of + * the available space we used.
 */ +static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, + unsigned long addr, unsigned len) +{ + unsigned int insn_len; + + /* Don't do anything special if we don't have a replacement */ + if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start) + return paravirt_patch_default(type, clobber, ibuf, addr, len); + + insn_len = lguest_insns[type].end - lguest_insns[type].start; + + /* Similarly, if we can't fit the replacement (shouldn't happen, but let's + * be thorough). */ + if (len < insn_len) + return paravirt_patch_default(type, clobber, ibuf, addr, len); + + /* Copy in our instructions. */ + memcpy(ibuf, lguest_insns[type].start, insn_len); + return insn_len; +} + +/*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops + * structures in the kernel provide points for (almost) every routine we have + * to override to avoid privileged instructions. */ +__init void lguest_init(void) +{ + /* We're under lguest, paravirt is enabled, and we're running at + * privilege level 1, not 0 as normal. */ + pv_info.name = "lguest"; + pv_info.paravirt_enabled = 1; + pv_info.kernel_rpl = 1; + + /* We set up all the lguest overrides for sensitive operations. These + * are detailed with the operations themselves. */ + + /* interrupt-related operations */ + pv_irq_ops.init_IRQ = lguest_init_IRQ; + pv_irq_ops.save_fl = save_fl; + pv_irq_ops.restore_fl = restore_fl; + pv_irq_ops.irq_disable = irq_disable; + pv_irq_ops.irq_enable = irq_enable; + pv_irq_ops.safe_halt = lguest_safe_halt; + + /* init-time operations */ + pv_init_ops.memory_setup = lguest_memory_setup; + pv_init_ops.patch = lguest_patch; + + /* Intercepts of various cpu instructions */ + pv_cpu_ops.load_gdt = lguest_load_gdt; + pv_cpu_ops.cpuid = lguest_cpuid; + pv_cpu_ops.load_idt = lguest_load_idt; + pv_cpu_ops.iret = lguest_iret; + pv_cpu_ops.load_esp0 = lguest_load_esp0; + pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; + pv_cpu_ops.set_ldt = lguest_set_ldt; + pv_cpu_ops.load_tls = lguest_load_tls; + pv_cpu_ops.set_debugreg = lguest_set_debugreg; + pv_cpu_ops.clts = lguest_clts; + pv_cpu_ops.read_cr0 = lguest_read_cr0; + pv_cpu_ops.write_cr0 = lguest_write_cr0; + pv_cpu_ops.read_cr4 = lguest_read_cr4; + pv_cpu_ops.write_cr4 = lguest_write_cr4; + pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry; + pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; + pv_cpu_ops.wbinvd = lguest_wbinvd; + pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu; + pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode; + + /* pagetable management */ + pv_mmu_ops.write_cr3 = lguest_write_cr3; + pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; + pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; + pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel; + pv_mmu_ops.set_pte = lguest_set_pte; + pv_mmu_ops.set_pte_at = lguest_set_pte_at; + pv_mmu_ops.set_pmd = lguest_set_pmd; + pv_mmu_ops.read_cr2 = lguest_read_cr2; + pv_mmu_ops.read_cr3 = lguest_read_cr3; + pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; + pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode; + +#ifdef CONFIG_X86_LOCAL_APIC + /* apic read/write intercepts */ + pv_apic_ops.apic_write = lguest_apic_write; + pv_apic_ops.apic_write_atomic = lguest_apic_write; + pv_apic_ops.apic_read = lguest_apic_read; +#endif + + /* time operations */ + pv_time_ops.get_wallclock = lguest_get_wallclock; + pv_time_ops.time_init = lguest_time_init; + + /* Now is a good time to look at the implementations of these functions + * before returning to the rest of lguest_init().
 */ + + /*G:070 Now we've seen all the paravirt_ops, we return to + * lguest_init() where the rest of the fairly chaotic boot setup + * occurs. */ + + /* The native boot code sets up initial page tables immediately after + * the kernel itself, and sets init_pg_tables_end so they're not + * clobbered. The Launcher places our initial pagetables somewhere at + * the top of our physical memory, so we don't need extra space: set + * init_pg_tables_end to the end of the kernel. */ + init_pg_tables_end = __pa(pg0); + + /* Load the %fs segment register (the per-cpu segment register) with + * the normal data segment to get through booting. */ + asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); + + /* The Host uses the top of the Guest's virtual address space for the + * Host<->Guest Switcher, and it tells us how much it needs in + * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */ + reserve_top_address(lguest_data.reserve_mem); + + /* If we don't initialize the lock dependency checker now, it crashes + * paravirt_disable_iospace. */ + lockdep_init(); + + /* The IDE code spends about 3 seconds probing for disks: if we reserve + * all the I/O ports up front it can't get them and so doesn't probe. + * Other device drivers are similar (but less severe). This cuts the + * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */ + paravirt_disable_iospace(); + + /* This is messy CPU setup stuff which the native boot code does before + * start_kernel, so we have to do it, too: */ + cpu_detect(&new_cpu_data); + /* head.S usually sets up the first capability word, so do it here. */ + new_cpu_data.x86_capability[0] = cpuid_edx(1); + + /* Math is always hard! */ + new_cpu_data.hard_math = 1; + +#ifdef CONFIG_X86_MCE + mce_disabled = 1; +#endif +#ifdef CONFIG_ACPI + acpi_disabled = 1; + acpi_ht = 0; +#endif + + /* We set the preferred console to "hvc". This is the "hypervisor + * virtual console" driver written by the PowerPC people, which we also + * adapted for lguest's use. */ + add_preferred_console("hvc", 0, NULL); + + /* Register our very early console. */ + virtio_cons_early_init(early_put_chars); + + /* Last of all, we set the power management poweroff hook to point to + * the Guest routine to power off. */ + pm_power_off = lguest_power_off; + + /* Now we're set up, call start_kernel() in init/main.c and we proceed + * to boot as normal. It never returns. */ + start_kernel(); +} +/* + * This marks the end of stage II of our journey, The Guest. + * + * It is now time for us to explore the nooks and crannies of the three Guest + * devices and complete our understanding of the Guest in "make Drivers". + */ diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S new file mode 100644 index 000000000000..ebc6ac733899 --- /dev/null +++ b/arch/x86/lguest/i386_head.S @@ -0,0 +1,115 @@ +#include <linux/linkage.h> +#include <linux/lguest.h> +#include <asm/lguest_hcall.h> +#include <asm/asm-offsets.h> +#include <asm/thread_info.h> +#include <asm/processor-flags.h> + +/*G:020 This is where we begin: head.S notes that the boot header's platform + * type field is "1" (lguest), so it calls us here. The boot header is in %esi. + * + * WARNING: be very careful here! We're running at addresses equal to physical + * addresses (around 0), not above PAGE_OFFSET as most code expects + * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any + * data. + * + * The .section line puts this code in .init.text so it will be discarded after + * boot.
*/ +.section .init.text, "ax", @progbits +ENTRY(lguest_entry) + /* Make initial hypercall now, so we can set up the pagetables. */ + movl $LHCALL_LGUEST_INIT, %eax + movl $lguest_data - __PAGE_OFFSET, %edx + int $LGUEST_TRAP_ENTRY + + /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl + * instruction uses %esi implicitly. */ + movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi + + /* Copy first 32 entries of page directory to __PAGE_OFFSET entries. + * This means the first 128M of kernel memory will be mapped at + * PAGE_OFFSET where the kernel expects to run. This will get it far + * enough through boot to switch to its own pagetables. */ + movl $32, %ecx + movl %esi, %edi + addl $((__PAGE_OFFSET >> 22) * 4), %edi + rep + movsl + + /* Set up the initial stack so we can run C code. */ + movl $(init_thread_union+THREAD_SIZE),%esp + + /* Jumps are relative, and we're running __PAGE_OFFSET too low at the + * moment. */ + jmp lguest_init+__PAGE_OFFSET + +/*G:055 We create a macro which puts the assembler code between lgstart_ and + * lgend_ markers. These templates are put in the .text section: they can't be + * discarded after boot as we may need to patch modules, too. */ +.text +#define LGUEST_PATCH(name, insns...) \ + lgstart_##name: insns; lgend_##name:; \ + .globl lgstart_##name; .globl lgend_##name + +LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) +LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled) +LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled) +LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) +/*:*/ + +/* These demark the EIP range where host should never deliver interrupts. */ +.global lguest_noirq_start +.global lguest_noirq_end + +/*M:004 When the Host reflects a trap or injects an interrupt into the Guest, + * it sets the eflags interrupt bit on the stack based on + * lguest_data.irq_enabled, so the Guest iret logic does the right thing when + * restoring it. However, when the Host sets the Guest up for direct traps, + * such as system calls, the processor is the one to push eflags onto the + * stack, and the interrupt bit will be 1 (in reality, interrupts are always + * enabled in the Guest). + * + * This turns out to be harmless: the only trap which should happen under Linux + * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc + * regions), which has to be reflected through the Host anyway. If another + * trap *does* go off when interrupts are disabled, the Guest will panic, and + * we'll never get to this iret! :*/ + +/*G:045 There is one final paravirt_op that the Guest implements, and glancing + * at it you can see why I left it to last. It's *cool*! It's in *assembler*! + * + * The "iret" instruction is used to return from an interrupt or trap. The + * stack looks like this: + * old address + * old code segment & privilege level + * old processor flags ("eflags") + * + * The "iret" instruction pops those values off the stack and restores them all + * at once. The only problem is that eflags includes the Interrupt Flag which + * the Guest can't change: the CPU will simply ignore it when we do an "iret". + * So we have to copy eflags from the stack to lguest_data.irq_enabled before + * we do the "iret". + * + * There are two problems with this: firstly, we need to use a register to do + * the copy and secondly, the whole thing needs to be atomic. 
The first + * problem is easy to solve: push %eax on the stack so we can use it, and then + * restore it at the end just before the real "iret". + * + * The second is harder: copying eflags to lguest_data.irq_enabled will turn + * interrupts on before we're finished, so we could be interrupted before we + * return to userspace or wherever. Our solution to this is to surround the + * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the + * Host that it is *never* to interrupt us there, even if interrupts seem to be + * enabled. */ +ENTRY(lguest_iret) + pushl %eax + movl 12(%esp), %eax +lguest_noirq_start: + /* Note the %ss: segment prefix here. Normal data accesses use the + * "ds" segment, but that will have already been restored for whatever + * we're returning to (such as userspace): we can't trust it. The %ss: + * prefix makes sure we use the stack segment, which is still valid. */ + movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled + popl %eax + iret +lguest_noirq_end: diff --git a/arch/x86/lib/delay_32.c b/arch/x86/lib/delay_32.c index f6edb11364df..952e7a89c2ac 100644 --- a/arch/x86/lib/delay_32.c +++ b/arch/x86/lib/delay_32.c @@ -82,7 +82,7 @@ inline void __const_udelay(unsigned long xloops) __asm__("mull %0" :"=d" (xloops), "=&a" (d0) :"1" (xloops), "0" - (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4))); + (cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4))); __delay(++xloops); } diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c index 2dbebd308347..0ebbfb9e7c7f 100644 --- a/arch/x86/lib/delay_64.c +++ b/arch/x86/lib/delay_64.c @@ -40,7 +40,8 @@ EXPORT_SYMBOL(__delay); inline void __const_udelay(unsigned long xloops) { - __delay(((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) + 1); + __delay(((xloops * HZ * + cpu_data(raw_smp_processor_id()).loops_per_jiffy) >> 32) + 1); } EXPORT_SYMBOL(__const_udelay); diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 9f38b12b4af1..8bab2b2efaff 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -748,7 +748,7 @@ survive: retval = get_user_pages(current, current->mm, (unsigned long )to, 1, 1, 0, &pg, NULL); - if (retval == -ENOMEM && is_init(current)) { + if (retval == -ENOMEM && is_global_init(current)) { up_read(¤t->mm->mmap_sem); congestion_wait(WRITE, HZ/50); goto survive; diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c index 3f08010f3517..0c28a071824c 100644 --- a/arch/x86/mach-default/setup.c +++ b/arch/x86/mach-default/setup.c @@ -108,7 +108,7 @@ void __init time_init_hook(void) * mca_nmi_hook - hook into MCA specific NMI chain * * Description: - * The MCA (Microchannel Arcitecture) has an NMI chain for NMI sources + * The MCA (Microchannel Architecture) has an NMI chain for NMI sources * along the MCA bus. Use this to hook into that chain if you will need * it. **/ @@ -131,7 +131,7 @@ static __init int no_ipi_broadcast(char *str) return 1; } -__setup("no_ipi_broadcast", no_ipi_broadcast); +__setup("no_ipi_broadcast=", no_ipi_broadcast); static int __init print_ipi_mode(void) { diff --git a/arch/x86/mach-generic/default.c b/arch/x86/mach-generic/default.c index 8685208d8512..1af0cc7648f0 100644 --- a/arch/x86/mach-generic/default.c +++ b/arch/x86/mach-generic/default.c @@ -1,5 +1,5 @@ /* - * Default generic APIC driver. This handles upto 8 CPUs. + * Default generic APIC driver. This handles up to 8 CPUs. 
*/ #define APIC_DEFINITION 1 #include <linux/threads.h> diff --git a/arch/x86/mach-generic/probe.c b/arch/x86/mach-generic/probe.c index 4121d1551800..f410d3cb5659 100644 --- a/arch/x86/mach-generic/probe.c +++ b/arch/x86/mach-generic/probe.c @@ -56,7 +56,7 @@ void __init generic_bigsmp_probe(void) /* * This routine is used to switch to bigsmp mode when * - There is no apic= option specified by the user - * - generic_apic_probe() has choosen apic_default as the sub_arch + * - generic_apic_probe() has chosen apic_default as the sub_arch * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support */ diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index e4928aa6bdfb..361ac5107b33 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -36,8 +36,8 @@ static unsigned long cpu_irq_affinity[NR_CPUS] __cacheline_aligned = { [0 ... NR /* per CPU data structure (for /proc/cpuinfo et al), visible externally * indexed physically */ -struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; -EXPORT_SYMBOL(cpu_data); +DEFINE_PER_CPU(cpuinfo_x86, cpu_info) __cacheline_aligned; +EXPORT_PER_CPU_SYMBOL(cpu_info); /* physical ID of the CPU used to boot the system */ unsigned char boot_cpu_id; @@ -389,7 +389,7 @@ find_smp_config(void) /* The boot CPU must be extended */ voyager_extended_vic_processors = 1<<boot_cpu_id; - /* initially, all of the first 8 cpu's can boot */ + /* initially, all of the first 8 CPUs can boot */ voyager_allowed_boot_processors = 0xff; /* set up everything for just this CPU, we can alter * this as we start the other CPUs later */ @@ -430,7 +430,7 @@ find_smp_config(void) void __init smp_store_cpu_info(int id) { - struct cpuinfo_x86 *c=&cpu_data[id]; + struct cpuinfo_x86 *c = &cpu_data(id); *c = boot_cpu_data; @@ -634,7 +634,7 @@ do_boot_cpu(__u8 cpu) cpu, smp_processor_id())); printk("CPU%d: ", cpu); - print_cpu_info(&cpu_data[cpu]); + print_cpu_info(&cpu_data(cpu)); wmb(); cpu_set(cpu, cpu_callout_map); cpu_set(cpu, cpu_present_map); @@ -683,7 +683,7 @@ smp_boot_cpus(void) */ smp_store_cpu_info(boot_cpu_id); printk("CPU%d: ", boot_cpu_id); - print_cpu_info(&cpu_data[boot_cpu_id]); + print_cpu_info(&cpu_data(boot_cpu_id)); if(is_cpu_quad()) { /* booting on a Quad CPU */ @@ -714,7 +714,7 @@ smp_boot_cpus(void) unsigned long bogosum = 0; for (i = 0; i < NR_CPUS; i++) if (cpu_isset(i, cpu_online_map)) - bogosum += cpu_data[i].loops_per_jiffy; + bogosum += cpu_data(i).loops_per_jiffy; printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", cpucount+1, bogosum/(500000/HZ), @@ -1010,7 +1010,7 @@ static struct call_data_struct * call_data; /* execute a thread on a new CPU. The function to be called must be * previously set up. This is used to schedule a function for - * execution on all CPU's - set up the function then broadcast a + * execution on all CPUs - set up the function then broadcast a * function_interrupt CPI to come here on each CPU */ static void smp_call_function_interrupt(void) @@ -1095,7 +1095,7 @@ voyager_smp_call_function_mask (cpumask_t cpumask, * CPI here. We don't use this actually for counting so losing * ticks doesn't matter * - * FIXME: For those CPU's which actually have a local APIC, we could + * FIXME: For those CPUs which actually have a local APIC, we could * try to use it to trigger this interrupt instead of having to * broadcast the timer tick. 
Unfortunately, all my pentium DYADs have * no local APIC, so I can't do this @@ -1287,7 +1287,7 @@ smp_local_timer_interrupt(void) /* * We take the 'long' return path, and there every subsystem - * grabs the apropriate locks (kernel lock/ irq lock). + * grabs the appropriate locks (kernel lock/ irq lock). * * we might want to decouple profiling from the 'long path', * and do the profiling totally in assembly. @@ -1759,7 +1759,7 @@ set_vic_irq_affinity(unsigned int irq, cpumask_t mask) real_mask = cpus_addr(mask)[0] & voyager_extended_vic_processors; if(cpus_addr(mask)[0] == 0) - /* can't have no cpu's to accept the interrupt -- extremely + /* can't have no CPUs to accept the interrupt -- extremely * bad things will happen */ return; @@ -1791,7 +1791,7 @@ set_vic_irq_affinity(unsigned int irq, cpumask_t mask) } /* this is magic, we now have the correct affinity maps, so * enable the interrupt. This will send an enable CPI to - * those cpu's who need to enable it in their local masks, + * those CPUs who need to enable it in their local masks, * causing them to correct for the new affinity . If the * interrupt is currently globally disabled, it will simply be * disabled again as it comes in (voyager lazy disable). If diff --git a/arch/x86/mach-voyager/voyager_thread.c b/arch/x86/mach-voyager/voyager_thread.c index f9d595338159..50f9366c411e 100644 --- a/arch/x86/mach-voyager/voyager_thread.c +++ b/arch/x86/mach-voyager/voyager_thread.c @@ -64,7 +64,7 @@ check_from_kernel(void) { if(voyager_status.switch_off) { - /* FIXME: This should be configureable via proc */ + /* FIXME: This should be configurable via proc */ execute("umask 600; echo 0 > /etc/initrunlvl; kill -HUP 1"); } else if(voyager_status.power_fail) { VDEBUG(("Voyager daemon detected AC power failure\n")); diff --git a/arch/x86/mm/boot_ioremap_32.c b/arch/x86/mm/boot_ioremap_32.c index 4de95a17a7d4..f14da2a53ece 100644 --- a/arch/x86/mm/boot_ioremap_32.c +++ b/arch/x86/mm/boot_ioremap_32.c @@ -10,7 +10,7 @@ /* * We need to use the 2-level pagetable functions, but CONFIG_X86_PAE - * keeps that from happenning. If anyone has a better way, I'm listening. + * keeps that from happening. If anyone has a better way, I'm listening. * * boot_pte_t is defined only if this all works correctly */ diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 13893772cc48..fe608a45ffb6 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -273,7 +273,7 @@ unsigned long __init setup_memory(void) * When mapping a NUMA machine we allocate the node_mem_map arrays * from node local memory. They are then mapped directly into KVA * between zone normal and vmalloc space. Calculate the size of - * this space and use it to adjust the boundry between ZONE_NORMAL + * this space and use it to adjust the boundary between ZONE_NORMAL * and ZONE_HIGHMEM. */ find_max_pfn(); diff --git a/arch/x86/mm/fault_32.c b/arch/x86/mm/fault_32.c index 6555c3d14371..503dfc05111b 100644 --- a/arch/x86/mm/fault_32.c +++ b/arch/x86/mm/fault_32.c @@ -354,7 +354,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the - * kernel and should generate an OOPS. Unfortunatly, in the case of an + * kernel and should generate an OOPS. 
Unfortunately, in the case of an * erroneous fault occurring in a code path which already holds mmap_sem * we will deadlock attempting to validate the fault against the * address space. Luckily the kernel only validly references user @@ -362,7 +362,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, * exceptions table. * * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibilty of a deadlock. + * the source reference check when there is a possibility of a deadlock. * Attempt to lock the address space, if we cannot we then validate the * source. If this is invalid we can skip the address space check, * thus avoiding the deadlock. @@ -471,8 +471,8 @@ bad_area_nosemaphore: printk_ratelimit()) { printk("%s%s[%d]: segfault at %08lx eip %08lx " "esp %08lx error %lx\n", - tsk->pid > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, tsk->pid, address, regs->eip, + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, regs->eip, regs->esp, error_code); } tsk->thread.cr2 = address; @@ -564,7 +564,8 @@ no_context: * it's allocated already. */ if ((page >> PAGE_SHIFT) < max_low_pfn - && (page & _PAGE_PRESENT)) { + && (page & _PAGE_PRESENT) + && !(page & _PAGE_PSE)) { page &= PAGE_MASK; page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)]; @@ -587,7 +588,7 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_init(tsk)) { + if (is_global_init(tsk)) { yield(); down_read(&mm->mmap_sem); goto survive; diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c index 5e0e54906c48..644b4f7ece10 100644 --- a/arch/x86/mm/fault_64.c +++ b/arch/x86/mm/fault_64.c @@ -169,7 +169,7 @@ void dump_pagetable(unsigned long address) pmd = pmd_offset(pud, address); if (bad_address(pmd)) goto bad; printk("PMD %lx ", pmd_val(*pmd)); - if (!pmd_present(*pmd)) goto ret; + if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; pte = pte_offset_kernel(pmd, address); if (bad_address(pte)) goto bad; @@ -285,7 +285,6 @@ static int vmalloc_fault(unsigned long address) return 0; } -static int page_fault_trace; int show_unhandled_signals = 1; /* @@ -354,10 +353,6 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, if (likely(regs->eflags & X86_EFLAGS_IF)) local_irq_enable(); - if (unlikely(page_fault_trace)) - printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", - regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); - if (unlikely(error_code & PF_RSVD)) pgtable_bad(address, regs, error_code); @@ -378,7 +373,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, again: /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the - * kernel and should generate an OOPS. Unfortunatly, in the case of an + * kernel and should generate an OOPS. Unfortunately, in the case of an * erroneous fault occurring in a code path which already holds mmap_sem * we will deadlock attempting to validate the fault against the * address space. Luckily the kernel only validly references user @@ -386,7 +381,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, * exceptions table. * * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibilty of a deadlock. + * the source reference check when there is a possibility of a deadlock. * Attempt to lock the address space, if we cannot we then validate the * source. 
If this is invalid we can skip the address space check, * thus avoiding the deadlock. @@ -488,7 +483,7 @@ bad_area_nosemaphore: if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && printk_ratelimit()) { printk( - "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n", + "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n", tsk->pid > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, tsk->pid, address, regs->rip, regs->rsp, error_code); @@ -554,7 +549,7 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_init(current)) { + if (is_global_init(current)) { yield(); goto again; } @@ -621,10 +616,3 @@ void vmalloc_sync_all(void) BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == (__START_KERNEL & PGDIR_MASK))); } - -static int __init enable_pagefaulttrace(char *str) -{ - page_fault_trace = 1; - return 1; -} -__setup("pagefaulttrace", enable_pagefaulttrace); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 5eec5e56d07f..3d6926ba8995 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -612,7 +612,7 @@ void __init init_cpu_to_node(void) { int i; for (i = 0; i < NR_CPUS; i++) { - u8 apicid = x86_cpu_to_apicid[i]; + u8 apicid = x86_cpu_to_apicid_init[i]; if (apicid == BAD_APICID) continue; if (apicid_to_node[apicid] == NUMA_NO_NODE) diff --git a/arch/x86/mm/pageattr_64.c b/arch/x86/mm/pageattr_64.c index 8a4f65bf956e..c40afbaaf93d 100644 --- a/arch/x86/mm/pageattr_64.c +++ b/arch/x86/mm/pageattr_64.c @@ -61,10 +61,10 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, return base; } -static void cache_flush_page(void *adr) +void clflush_cache_range(void *adr, int size) { int i; - for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) + for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size) clflush(adr+i); } @@ -80,7 +80,7 @@ static void flush_kernel_map(void *arg) asm volatile("wbinvd" ::: "memory"); else list_for_each_entry(pg, l, lru) { void *adr = page_address(pg); - cache_flush_page(adr); + clflush_cache_range(adr, PAGE_SIZE); } __flush_tlb_all(); } @@ -230,9 +230,14 @@ void global_flush_tlb(void) struct page *pg, *next; struct list_head l; - down_read(&init_mm.mmap_sem); + /* + * Write-protect the semaphore, to exclude two contexts + * doing a list_replace_init() call in parallel and to + * exclude new additions to the deferred_pages list: + */ + down_write(&init_mm.mmap_sem); list_replace_init(&deferred_pages, &l); - up_read(&init_mm.mmap_sem); + up_write(&init_mm.mmap_sem); flush_map(&l); diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 56089ccc3949..ea85172fc0cc 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -218,7 +218,7 @@ static inline int save_add_info(void) {return 0;} /* * Update nodes_add and decide if to include add are in the zone. * Both SPARSE and RESERVE need nodes_add infomation. - * This code supports one contigious hot add area per node. + * This code supports one contiguous hot add area per node. 
*/ static int reserve_hotadd(int node, unsigned long start, unsigned long end) { diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index c049ce414f01..0ed046a187f7 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -13,25 +13,45 @@ #include <linux/mm.h> #include <asm/ptrace.h> #include <asm/uaccess.h> +#include <asm/stacktrace.h> -struct frame_head { - struct frame_head * ebp; - unsigned long ret; -} __attribute__((packed)); +static void backtrace_warning_symbol(void *data, char *msg, + unsigned long symbol) +{ + /* Ignore warnings */ +} -static struct frame_head * -dump_kernel_backtrace(struct frame_head * head) +static void backtrace_warning(void *data, char *msg) { - oprofile_add_trace(head->ret); + /* Ignore warnings */ +} - /* frame pointers should strictly progress back up the stack - * (towards higher addresses) */ - if (head >= head->ebp) - return NULL; +static int backtrace_stack(void *data, char *name) +{ + /* Yes, we want all stacks */ + return 0; +} + +static void backtrace_address(void *data, unsigned long addr) +{ + unsigned int *depth = data; - return head->ebp; + if ((*depth)--) + oprofile_add_trace(addr); } +static struct stacktrace_ops backtrace_ops = { + .warning = backtrace_warning, + .warning_symbol = backtrace_warning_symbol, + .stack = backtrace_stack, + .address = backtrace_address, +}; + +struct frame_head { + struct frame_head *ebp; + unsigned long ret; +} __attribute__((packed)); + static struct frame_head * dump_user_backtrace(struct frame_head * head) { @@ -53,72 +73,16 @@ dump_user_backtrace(struct frame_head * head) return bufhead[0].ebp; } -/* - * | | /\ Higher addresses - * | | - * --------------- stack base (address of current_thread_info) - * | thread info | - * . . - * | stack | - * --------------- saved regs->ebp value if valid (frame_head address) - * . . - * --------------- saved regs->rsp value if x86_64 - * | | - * --------------- struct pt_regs * stored on stack if 32-bit - * | | - * . . - * | | - * --------------- %esp - * | | - * | | \/ Lower addresses - * - * Thus, regs (or regs->rsp for x86_64) <-> stack base restricts the - * valid(ish) ebp values. Note: (1) for x86_64, NMI and several other - * exceptions use special stacks, maintained by the interrupt stack table - * (IST). These stacks are set up in trap_init() in - * arch/x86_64/kernel/traps.c. Thus, for x86_64, regs now does not point - * to the kernel stack; instead, it points to some location on the NMI - * stack. On the other hand, regs->rsp is the stack pointer saved when the - * NMI occurred. (2) For 32-bit, regs->esp is not valid because the - * processor does not save %esp on the kernel stack when interrupts occur - * in the kernel mode. 
- */ -#ifdef CONFIG_FRAME_POINTER -static int valid_kernel_stack(struct frame_head * head, struct pt_regs * regs) -{ - unsigned long headaddr = (unsigned long)head; -#ifdef CONFIG_X86_64 - unsigned long stack = (unsigned long)regs->rsp; -#else - unsigned long stack = (unsigned long)regs; -#endif - unsigned long stack_base = (stack & ~(THREAD_SIZE - 1)) + THREAD_SIZE; - - return headaddr > stack && headaddr < stack_base; -} -#else -/* without fp, it's just junk */ -static int valid_kernel_stack(struct frame_head * head, struct pt_regs * regs) -{ - return 0; -} -#endif - - void x86_backtrace(struct pt_regs * const regs, unsigned int depth) { - struct frame_head *head; - -#ifdef CONFIG_X86_64 - head = (struct frame_head *)regs->rbp; -#else - head = (struct frame_head *)regs->ebp; -#endif + struct frame_head *head = (struct frame_head *)frame_pointer(regs); + unsigned long stack = stack_pointer(regs); if (!user_mode_vm(regs)) { - while (depth-- && valid_kernel_stack(head, regs)) - head = dump_kernel_backtrace(head); + if (depth) + dump_trace(NULL, regs, (unsigned long *)stack, + &backtrace_ops, &depth); return; } diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index abb1aa95b979..45b605fa71d0 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -29,7 +29,7 @@ struct op_msrs { struct pt_regs; /* The model vtable abstracts the differences between - * various x86 CPU model's perfctr support. + * various x86 CPU models' perfctr support. */ struct op_x86_model_spec { unsigned int const num_counters; diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index c52150fdf82b..88d8f5c0ecb5 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -169,7 +169,7 @@ void eisa_set_level_irq(unsigned int irq) } /* - * Common IRQ routing practice: nybbles in config space, + * Common IRQ routing practice: nibbles in config space, * offset by some magic constant. */ static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr) @@ -585,7 +585,7 @@ static __init int via_router_probe(struct irq_router *r, /* FIXME: We should move some of the quirk fixup stuff here */ /* - * work arounds for some buggy BIOSes + * workarounds for some buggy BIOSes */ if (device == PCI_DEVICE_ID_VIA_82C586_0) { switch(router->device) { diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 9df99e1885a4..fbfa55ce0d55 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -3,8 +3,9 @@ # config XEN - bool "Enable support for Xen hypervisor" - depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES + bool "Xen guest support" + select PARAVIRT + depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER) help This is the Linux Xen port. Enabling this will allow the kernel to boot in a paravirtualized environment under the |