From 57b165948fe0e1cca78bca1fe1e1ae9a38baedd2 Mon Sep 17 00:00:00 2001 From: Michael D Labriola Date: Wed, 1 Feb 2012 10:05:00 -0500 Subject: x86/reboot: Reduce to a single DMI table for reboot quirks This commit reduces the X86_32 reboot_dmi_table and the X86_64 pci_reboot_dmi_table into a single table with a single set of functions (e.g., only 1 call to core_initcall). The table entries that use set_bios_reboot are grouped together inside a #define CONFIG_X86_32 block. Note that there's a single entry that uses set_kbd_reboot, which used to be available only on X86_32. This commit moves that entry outside the X86_32 block because it seems it never should have been in there. There's multiple places in reboot.c that assume KBD is valid regardless of X86_32/X86_64. Signed-off-by: Michael D Labriola Cc: Matthew Garrett Link: http://lkml.kernel.org/n/tip-lv3aliubas2l3aenq8v3uklk@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 176 ++++++++++++++++++++++------------------------- 1 file changed, 83 insertions(+), 93 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index d840e69a853c..e73973769076 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -150,6 +150,80 @@ static int __init set_bios_reboot(const struct dmi_system_id *d) return 0; } +extern const unsigned char machine_real_restart_asm[]; +extern const u64 machine_real_restart_gdt[3]; + +void machine_real_restart(unsigned int type) +{ + void *restart_va; + unsigned long restart_pa; + void (*restart_lowmem)(unsigned int); + u64 *lowmem_gdt; + + local_irq_disable(); + + /* Write zero to CMOS register number 0x0f, which the BIOS POST + routine will recognize as telling it to do a proper reboot. (Well + that's what this book in front of me says -- it may only apply to + the Phoenix BIOS though, it's not clear). 
At the same time, + disable NMIs by setting the top bit in the CMOS address register, + as we're about to do peculiar things to the CPU. I'm not sure if + `outb_p' is needed instead of just `outb'. Use it to be on the + safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) + */ + spin_lock(&rtc_lock); + CMOS_WRITE(0x00, 0x8f); + spin_unlock(&rtc_lock); + + /* + * Switch back to the initial page table. + */ + load_cr3(initial_page_table); + + /* Write 0x1234 to absolute memory location 0x472. The BIOS reads + this on booting to tell it to "Bypass memory test (also warm + boot)". This seems like a fairly standard thing that gets set by + REBOOT.COM programs, and the previous reset routine did this + too. */ + *((unsigned short *)0x472) = reboot_mode; + + /* Patch the GDT in the low memory trampoline */ + lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt); + + restart_va = TRAMPOLINE_SYM(machine_real_restart_asm); + restart_pa = virt_to_phys(restart_va); + restart_lowmem = (void (*)(unsigned int))restart_pa; + + /* GDT[0]: GDT self-pointer */ + lowmem_gdt[0] = + (u64)(sizeof(machine_real_restart_gdt) - 1) + + ((u64)virt_to_phys(lowmem_gdt) << 16); + /* GDT[1]: 64K real mode code segment */ + lowmem_gdt[1] = + GDT_ENTRY(0x009b, restart_pa, 0xffff); + + /* Jump to the identity-mapped low memory code */ + restart_lowmem(type); +} +#ifdef CONFIG_APM_MODULE +EXPORT_SYMBOL(machine_real_restart); +#endif + +#endif /* CONFIG_X86_32 */ + +/* + * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot + */ +static int __init set_pci_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_CF9) { + reboot_type = BOOT_CF9; + printk(KERN_INFO "%s series board detected. 
" + "Selecting PCI-method for reboots.\n", d->ident); + } + return 0; +} + static int __init set_kbd_reboot(const struct dmi_system_id *d) { if (reboot_type != BOOT_KBD) { @@ -159,7 +233,11 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d) return 0; } +/* This is a single dmi_table handling all reboot quirks. Note that + * REBOOT_BIOS is only available for 32bit + */ static struct dmi_system_id __initdata reboot_dmi_table[] = { +#ifdef CONFIG_X86_32 { /* Handle problems with rebooting on Dell E520's */ .callback = set_bios_reboot, .ident = "Dell E520", @@ -309,6 +387,8 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "P4S800"), }, }, +#endif /* CONFIG_X86_32 */ + { /* Handle reboot issue on Acer Aspire one */ .callback = set_kbd_reboot, .ident = "Acer Aspire One A110", @@ -317,96 +397,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"), }, }, - { } -}; - -static int __init reboot_init(void) -{ - /* Only do the DMI check if reboot_type hasn't been overridden - * on the command line - */ - if (reboot_default) { - dmi_check_system(reboot_dmi_table); - } - return 0; -} -core_initcall(reboot_init); - -extern const unsigned char machine_real_restart_asm[]; -extern const u64 machine_real_restart_gdt[3]; - -void machine_real_restart(unsigned int type) -{ - void *restart_va; - unsigned long restart_pa; - void (*restart_lowmem)(unsigned int); - u64 *lowmem_gdt; - - local_irq_disable(); - - /* Write zero to CMOS register number 0x0f, which the BIOS POST - routine will recognize as telling it to do a proper reboot. (Well - that's what this book in front of me says -- it may only apply to - the Phoenix BIOS though, it's not clear). At the same time, - disable NMIs by setting the top bit in the CMOS address register, - as we're about to do peculiar things to the CPU. I'm not sure if - `outb_p' is needed instead of just `outb'. Use it to be on the - safe side. 
(Yes, CMOS_WRITE does outb_p's. - Paul G.) - */ - spin_lock(&rtc_lock); - CMOS_WRITE(0x00, 0x8f); - spin_unlock(&rtc_lock); - - /* - * Switch back to the initial page table. - */ - load_cr3(initial_page_table); - - /* Write 0x1234 to absolute memory location 0x472. The BIOS reads - this on booting to tell it to "Bypass memory test (also warm - boot)". This seems like a fairly standard thing that gets set by - REBOOT.COM programs, and the previous reset routine did this - too. */ - *((unsigned short *)0x472) = reboot_mode; - - /* Patch the GDT in the low memory trampoline */ - lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt); - - restart_va = TRAMPOLINE_SYM(machine_real_restart_asm); - restart_pa = virt_to_phys(restart_va); - restart_lowmem = (void (*)(unsigned int))restart_pa; - - /* GDT[0]: GDT self-pointer */ - lowmem_gdt[0] = - (u64)(sizeof(machine_real_restart_gdt) - 1) + - ((u64)virt_to_phys(lowmem_gdt) << 16); - /* GDT[1]: 64K real mode code segment */ - lowmem_gdt[1] = - GDT_ENTRY(0x009b, restart_pa, 0xffff); - - /* Jump to the identity-mapped low memory code */ - restart_lowmem(type); -} -#ifdef CONFIG_APM_MODULE -EXPORT_SYMBOL(machine_real_restart); -#endif - -#endif /* CONFIG_X86_32 */ - -/* - * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot - */ -static int __init set_pci_reboot(const struct dmi_system_id *d) -{ - if (reboot_type != BOOT_CF9) { - reboot_type = BOOT_CF9; - printk(KERN_INFO "%s series board detected. 
" - "Selecting PCI-method for reboots.\n", d->ident); - } - return 0; -} - -static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { { /* Handle problems with rebooting on Apple MacBook5 */ .callback = set_pci_reboot, .ident = "Apple MacBook5", @@ -474,17 +464,17 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { { } }; -static int __init pci_reboot_init(void) +static int __init reboot_init(void) { /* Only do the DMI check if reboot_type hasn't been overridden * on the command line */ if (reboot_default) { - dmi_check_system(pci_reboot_dmi_table); + dmi_check_system(reboot_dmi_table); } return 0; } -core_initcall(pci_reboot_init); +core_initcall(reboot_init); static inline void kb_wait(void) { -- cgit v1.2.3 From 144d102b926f887d3d9f909b69a5c4f504ae0d40 Mon Sep 17 00:00:00 2001 From: Michael D Labriola Date: Wed, 1 Feb 2012 10:06:34 -0500 Subject: x86/reboot: Clean up coding style This commit simply cleans up the style used in this file to fall in line with what's specified in CodingStyle. Mostly comment changes, with a single removal of unneeded braces. Note that the comments for all the DMI quirks in reboot_dmi_table now all line up consistently using tabs instead of spaces. Signed-off-by: Michael D Labriola Link: http://lkml.kernel.org/n/tip-lde9yy7qsomh0sdqevn7xp56@git.kernel.org [ Fixed a few small details. 
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 129 +++++++++++++++++++++++++---------------------- 1 file changed, 70 insertions(+), 59 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index e73973769076..77215c23fba1 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -39,7 +39,8 @@ static int reboot_mode; enum reboot_type reboot_type = BOOT_ACPI; int reboot_force; -/* This variable is used privately to keep track of whether or not +/* + * This variable is used privately to keep track of whether or not * reboot_type is still set to its default value (i.e., reboot= hasn't * been set on the command line). This is needed so that we can * suppress DMI scanning for reboot quirks. Without it, it's @@ -51,7 +52,8 @@ static int reboot_default = 1; static int reboot_cpu = -1; #endif -/* This is set if we need to go through the 'emergency' path. +/* + * This is set if we need to go through the 'emergency' path. * When machine_emergency_restart() is called, we may be on * an inconsistent state and won't be able to do a clean cleanup */ @@ -60,22 +62,24 @@ static int reboot_emergency; /* This is set by the PCI code if either type 1 or type 2 PCI is detected */ bool port_cf9_safe = false; -/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] - warm Don't set the cold reboot flag - cold Set the cold reboot flag - bios Reboot by jumping through the BIOS (only for X86_32) - smp Reboot by executing reset on BSP or other CPU (only for X86_32) - triple Force a triple fault (init) - kbd Use the keyboard controller. cold reset (default) - acpi Use the RESET_REG in the FADT - efi Use efi reset_system runtime service - pci Use the so-called "PCI reset register", CF9 - force Avoid anything that could hang. 
+/* + * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] + * warm Don't set the cold reboot flag + * cold Set the cold reboot flag + * bios Reboot by jumping through the BIOS (only for X86_32) + * smp Reboot by executing reset on BSP or other CPU (only for X86_32) + * triple Force a triple fault (init) + * kbd Use the keyboard controller. cold reset (default) + * acpi Use the RESET_REG in the FADT + * efi Use efi reset_system runtime service + * pci Use the so-called "PCI reset register", CF9 + * force Avoid anything that could hang. */ static int __init reboot_setup(char *str) { for (;;) { - /* Having anything passed on the command line via + /* + * Having anything passed on the command line via * reboot= will cause us to disable DMI checking * below. */ @@ -98,9 +102,11 @@ static int __init reboot_setup(char *str) if (isdigit(*(str+2))) reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0'); } - /* we will leave sorting out the final value - when we are ready to reboot, since we might not - have detected BSP APIC ID or smp_num_cpu */ + /* + * We will leave sorting out the final value + * when we are ready to reboot, since we might not + * have detected BSP APIC ID or smp_num_cpu + */ break; #endif /* CONFIG_SMP */ @@ -162,14 +168,15 @@ void machine_real_restart(unsigned int type) local_irq_disable(); - /* Write zero to CMOS register number 0x0f, which the BIOS POST - routine will recognize as telling it to do a proper reboot. (Well - that's what this book in front of me says -- it may only apply to - the Phoenix BIOS though, it's not clear). At the same time, - disable NMIs by setting the top bit in the CMOS address register, - as we're about to do peculiar things to the CPU. I'm not sure if - `outb_p' is needed instead of just `outb'. Use it to be on the - safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) 
+ /* + * Write zero to CMOS register number 0x0f, which the BIOS POST + * routine will recognize as telling it to do a proper reboot. (Well + * that's what this book in front of me says -- it may only apply to + * the Phoenix BIOS though, it's not clear). At the same time, + * disable NMIs by setting the top bit in the CMOS address register, + * as we're about to do peculiar things to the CPU. I'm not sure if + * `outb_p' is needed instead of just `outb'. Use it to be on the + * safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) */ spin_lock(&rtc_lock); CMOS_WRITE(0x00, 0x8f); @@ -180,11 +187,12 @@ void machine_real_restart(unsigned int type) */ load_cr3(initial_page_table); - /* Write 0x1234 to absolute memory location 0x472. The BIOS reads - this on booting to tell it to "Bypass memory test (also warm - boot)". This seems like a fairly standard thing that gets set by - REBOOT.COM programs, and the previous reset routine did this - too. */ + /* + * Write 0x1234 to absolute memory location 0x472. The BIOS reads + * this on booting to tell it to "Bypass memory test (also warm + * boot)". This seems like a fairly standard thing that gets set by + * REBOOT.COM programs, and the previous reset routine did this + * too. */ *((unsigned short *)0x472) = reboot_mode; /* Patch the GDT in the low memory trampoline */ @@ -233,7 +241,8 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d) return 0; } -/* This is a single dmi_table handling all reboot quirks. Note that +/* + * This is a single dmi_table handling all reboot quirks. 
Note that * REBOOT_BIOS is only available for 32bit */ static struct dmi_system_id __initdata reboot_dmi_table[] = { @@ -262,7 +271,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"), }, }, - { /* Handle problems with rebooting on Dell Optiplex 745's SFF*/ + { /* Handle problems with rebooting on Dell Optiplex 745's SFF */ .callback = set_bios_reboot, .ident = "Dell OptiPlex 745", .matches = { @@ -270,7 +279,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), }, }, - { /* Handle problems with rebooting on Dell Optiplex 745's DFF*/ + { /* Handle problems with rebooting on Dell Optiplex 745's DFF */ .callback = set_bios_reboot, .ident = "Dell OptiPlex 745", .matches = { @@ -279,7 +288,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0MM599"), }, }, - { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */ + { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */ .callback = set_bios_reboot, .ident = "Dell OptiPlex 745", .matches = { @@ -288,7 +297,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0KW626"), }, }, - { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */ + { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */ .callback = set_bios_reboot, .ident = "Dell OptiPlex 330", .matches = { @@ -297,7 +306,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0KP561"), }, }, - { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ + { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ .callback = set_bios_reboot, .ident = "Dell OptiPlex 360", .matches = { @@ -306,7 +315,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0T656F"), }, }, - { /* Handle problems with rebooting on 
Dell OptiPlex 760 with 0G919G*/ + { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */ .callback = set_bios_reboot, .ident = "Dell OptiPlex 760", .matches = { @@ -379,7 +388,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), }, }, - { /* Handle problems with rebooting on ASUS P4S800 */ + { /* Handle problems with rebooting on ASUS P4S800 */ .callback = set_bios_reboot, .ident = "ASUS P4S800", .matches = { @@ -389,7 +398,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { }, #endif /* CONFIG_X86_32 */ - { /* Handle reboot issue on Acer Aspire one */ + { /* Handle reboot issue on Acer Aspire one */ .callback = set_kbd_reboot, .ident = "Acer Aspire One A110", .matches = { @@ -466,12 +475,12 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { static int __init reboot_init(void) { - /* Only do the DMI check if reboot_type hasn't been overridden + /* + * Only do the DMI check if reboot_type hasn't been overridden * on the command line */ - if (reboot_default) { + if (reboot_default) dmi_check_system(reboot_dmi_table); - } return 0; } core_initcall(reboot_init); @@ -492,14 +501,14 @@ static void vmxoff_nmi(int cpu, struct pt_regs *regs) cpu_emergency_vmxoff(); } -/* Use NMIs as IPIs to tell all CPUs to disable virtualization - */ +/* Use NMIs as IPIs to tell all CPUs to disable virtualization */ static void emergency_vmx_disable_all(void) { /* Just make sure we won't change CPUs while doing this */ local_irq_disable(); - /* We need to disable VMX on all CPUs before rebooting, otherwise + /* + * We need to disable VMX on all CPUs before rebooting, otherwise * we risk hanging up the machine, because the CPU ignore INIT * signals when VMX is enabled. * @@ -518,8 +527,7 @@ static void emergency_vmx_disable_all(void) * is still enabling VMX. */ if (cpu_has_vmx() && cpu_vmx_enabled()) { - /* Disable VMX on this CPU. - */ + /* Disable VMX on this CPU. 
*/ cpu_vmxoff(); /* Halt and disable VMX on the other CPUs */ @@ -564,12 +572,12 @@ static void native_machine_emergency_restart(void) /* Could also try the reset bit in the Hammer NB */ switch (reboot_type) { case BOOT_KBD: - mach_reboot_fixups(); /* for board specific fixups */ + mach_reboot_fixups(); /* For board specific fixups */ for (i = 0; i < 10; i++) { kb_wait(); udelay(50); - outb(0xfe, 0x64); /* pulse reset low */ + outb(0xfe, 0x64); /* Pulse reset low */ udelay(50); } if (attempt == 0 && orig_reboot_type == BOOT_ACPI) { @@ -611,7 +619,7 @@ static void native_machine_emergency_restart(void) case BOOT_CF9: port_cf9_safe = true; - /* fall through */ + /* Fall through */ case BOOT_CF9_COND: if (port_cf9_safe) { @@ -649,7 +657,8 @@ void native_machine_shutdown(void) /* Make certain I only run on the appropriate processor */ set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id)); - /* O.K Now that I'm on the appropriate processor, + /* + * O.K Now that I'm on the appropriate processor, * stop all of the others. */ stop_other_cpus(); @@ -687,12 +696,11 @@ static void native_machine_restart(char *__unused) static void native_machine_halt(void) { - /* stop other cpus and apics */ + /* Stop other cpus and apics */ machine_shutdown(); tboot_shutdown(TB_SHUTDOWN_HALT); - /* stop this cpu */ stop_this_cpu(NULL); } @@ -703,7 +711,7 @@ static void native_machine_power_off(void) machine_shutdown(); pm_power_off(); } - /* a fallback in case there is no PM info available */ + /* A fallback in case there is no PM info available */ tboot_shutdown(TB_SHUTDOWN_HALT); } @@ -765,7 +773,8 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) cpu = raw_smp_processor_id(); - /* Don't do anything if this handler is invoked on crashing cpu. + /* + * Don't do anything if this handler is invoked on crashing cpu. * Otherwise, system will completely hang. Crashing cpu can get * an NMI if system was initially booted with nmi_watchdog parameter. 
*/ @@ -789,7 +798,8 @@ static void smp_send_nmi_allbutself(void) apic->send_IPI_allbutself(NMI_VECTOR); } -/* Halt all other CPUs, calling the specified function on each of them +/* + * Halt all other CPUs, calling the specified function on each of them * * This function can be used to halt all other CPUs on crash * or emergency reboot time. The function passed as parameter @@ -800,7 +810,7 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) unsigned long msecs; local_irq_disable(); - /* Make a note of crashing cpu. Will be used in NMI callback.*/ + /* Make a note of crashing cpu. Will be used in NMI callback. */ crashing_cpu = safe_smp_processor_id(); shootdown_callback = callback; @@ -809,8 +819,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) /* Would it be better to replace the trap vector here? */ if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback, NMI_FLAG_FIRST, "crash")) - return; /* return what? */ - /* Ensure the new callback function is set before sending + return; /* Return what? */ + /* + * Ensure the new callback function is set before sending * out the NMI */ wmb(); -- cgit v1.2.3 From 2b144498350860b6ee9dc57ff27a93ad488de5dc Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 9 Feb 2012 14:56:42 +0530 Subject: uprobes, mm, x86: Add the ability to install and remove uprobes breakpoints Add uprobes support to the core kernel, with x86 support. This commit adds the kernel facilities, the actual uprobes user-space ABI and perf probe support comes in later commits. General design: Uprobes are maintained in an rb-tree indexed by inode and offset (the offset here is from the start of the mapping). For a unique (inode, offset) tuple, there can be at most one uprobe in the rb-tree. Since the (inode, offset) tuple identifies a unique uprobe, more than one user may be interested in the same uprobe. This provides the ability to connect multiple 'consumers' to the same uprobe. Each consumer defines a handler and a filter (optional). 
The 'handler' is run every time the uprobe is hit, if it matches the 'filter' criteria. The first consumer of a uprobe causes the breakpoint to be inserted at the specified address and subsequent consumers are appended to this list. On subsequent probes, the consumer gets appended to the existing list of consumers. The breakpoint is removed when the last consumer unregisters. For all other unregistrations, the consumer is removed from the list of consumers. Given an inode, we get a list of the mms that have mapped the inode. Do the actual registration if mm maps the page where a probe needs to be inserted/removed. We use a temporary list to walk through the vmas that map the inode. - The number of maps that map the inode is not known before we walk the rmap, and it keeps changing. - extending vm_area_struct wasn't recommended, it's a size-critical data structure. - There can be more than one mapping of the inode in the same mm. We add callbacks to the mmap methods to keep an eye on text vmas that are of interest to uprobes. When a vma of interest is mapped, we insert the breakpoint at the right address. Uprobe works by replacing the instruction at the address defined by (inode, offset) with the arch specific breakpoint instruction. We save a copy of the original instruction at the uprobed address. This is needed for: a. executing the instruction out-of-line (xol). b. instruction analysis for any subsequent fixups. c. restoring the instruction back when the uprobe is unregistered. We insert or delete a breakpoint instruction, and this breakpoint instruction is assumed to be the smallest instruction available on the platform. For fixed size instruction platforms this is trivially true; for variable size instruction platforms the breakpoint instruction is typically the smallest (often a single byte). 
Writing the instruction is done by COWing the page and changing the instruction during the copy, even though most platforms allow atomic writes of the breakpoint instruction. This also mirrors the behaviour of a ptrace() memory write to a PRIVATE file map. The core worker is derived from KSM's replace_page() logic. In essence, similar to KSM: a. allocate a new page and copy over contents of the page that has the uprobed vaddr b. modify the copy and insert the breakpoint at the required address c. switch the original page with the copy containing the breakpoint d. flush page tables. replace_page() is being replicated here because of some minor changes in the type of pages and also because Hugh Dickins had plans to improve replace_page() for KSM specific work. Instruction analysis on x86 is based on the instruction decoder and determines if an instruction can be probed and determines the necessary fixups after singlestep. Instruction analysis is done at probe insertion time so that we avoid having to repeat the same analysis every time a probe is hit. A lot of code here is due to the improvements/suggestions/inputs from Peter Zijlstra. Changelog: (v10): - Add code to clear REX.B prefix as suggested by Denys Vlasenko and Masami Hiramatsu. (v9): - Use insn_offset_modrm as suggested by Masami Hiramatsu. (v7): Handle comments from Peter Zijlstra: - Don't take reference to inode. (expect inode to uprobe_register to be sane). - Use PTR_ERR to set the return value. - No need to take reference to inode. - use PTR_ERR to return error value. - register and uprobe_unregister share code. (v5): - Modified del_consumer as per comments from Peter. - Drop reference to inode before dropping reference to uprobe. - Use i_size_read(inode) instead of inode->i_size. - Ensure uprobe->consumers is NULL, before __uprobe_unregister() is called. - Includes errno.h as recommended by Stephen Rothwell to fix a build issue on sparc defconfig - Remove restrictions while unregistering. 
- Earlier code leaked inode references under some conditions while registering/unregistering. - Continue the vma-rmap walk even if the intermediate vma doesn't meet the requirements. - Validate the vma found by find_vma before inserting/removing the breakpoint - Call del_consumer under mutex_lock. - Use hash locks. - Handle mremap. - Introduce find_least_offset_node() instead of close match logic in find_uprobe - Uprobes no longer depends on MM_OWNER; No reference to task_structs while inserting/removing a probe. - Uses read_mapping_page instead of grab_cache_page so that the pages have valid content. - pass NULL to get_user_pages for the task parameter. - call SetPageUptodate on the new page allocated in write_opcode. - fix leaking a reference to the new page under certain conditions. - Include Instruction Decoder if Uprobes gets defined. - Remove const attributes for instruction prefix arrays. - Uses mm_context to know if the application is 32 bit. Signed-off-by: Srikar Dronamraju Also-written-by: Jim Keniston Reviewed-by: Peter Zijlstra Cc: Oleg Nesterov Cc: Andi Kleen Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Roland McGrath Cc: Masami Hiramatsu Cc: Arnaldo Carvalho de Melo Cc: Anton Arapov Cc: Ananth N Mavinakayanahalli Cc: Stephen Rothwell Cc: Denys Vlasenko Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Linux-mm Link: http://lkml.kernel.org/r/20120209092642.GE16600@linux.vnet.ibm.com [ Made various small edits to the commit log ] Signed-off-by: Ingo Molnar --- arch/Kconfig | 11 + arch/x86/Kconfig | 5 +- arch/x86/include/asm/uprobes.h | 42 ++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/uprobes.c | 412 +++++++++++++++++ include/linux/uprobes.h | 98 +++++ kernel/Makefile | 1 + kernel/uprobes.c | 976 +++++++++++++++++++++++++++++++++++++++++ mm/mmap.c | 23 + 9 files changed, 1568 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/uprobes.h create mode 100644 arch/x86/kernel/uprobes.c create mode 100644 include/linux/uprobes.h 
create mode 100644 kernel/uprobes.c (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index 4f55c736be11..284f5898f526 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -65,6 +65,17 @@ config OPTPROBES depends on KPROBES && HAVE_OPTPROBES depends on !PREEMPT +config UPROBES + bool "User-space probes (EXPERIMENTAL)" + depends on ARCH_SUPPORTS_UPROBES + default n + help + Uprobes enables kernel subsystems to establish probepoints + in user applications and execute handler functions when + the probepoints are hit. + + If in doubt, say "N". + config HAVE_EFFICIENT_UNALIGNED_ACCESS bool help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5bed94e189fa..481dbfcf14ed 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -84,7 +84,7 @@ config X86 select GENERIC_IOMAP config INSTRUCTION_DECODER - def_bool (KPROBES || PERF_EVENTS) + def_bool (KPROBES || PERF_EVENTS || UPROBES) config OUTPUT_FORMAT string @@ -240,6 +240,9 @@ config ARCH_CPU_PROBE_RELEASE def_bool y depends on HOTPLUG_CPU +config ARCH_SUPPORTS_UPROBES + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h new file mode 100644 index 000000000000..8208234391ff --- /dev/null +++ b/arch/x86/include/asm/uprobes.h @@ -0,0 +1,42 @@ +#ifndef _ASM_UPROBES_H +#define _ASM_UPROBES_H +/* + * Userspace Probes (UProbes) for x86 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2008-2011 + * Authors: + * Srikar Dronamraju + * Jim Keniston + */ + +typedef u8 uprobe_opcode_t; +#define MAX_UINSN_BYTES 16 +#define UPROBES_XOL_SLOT_BYTES 128 /* to keep it cache aligned */ + +#define UPROBES_BKPT_INSN 0xcc +#define UPROBES_BKPT_INSN_SIZE 1 + +struct uprobe_arch_info { + u16 fixups; +#ifdef CONFIG_X86_64 + unsigned long rip_rela_target_address; +#endif +}; + +struct uprobe; +extern int analyze_insn(struct mm_struct *mm, struct uprobe *uprobe); +#endif /* _ASM_UPROBES_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5369059c07a9..8c8c365a3bc3 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -100,6 +100,7 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_OF) += devicetree.o +obj-$(CONFIG_UPROBES) += uprobes.o ### # 64 bit specific files diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c new file mode 100644 index 000000000000..2a301bb91bdb --- /dev/null +++ b/arch/x86/kernel/uprobes.c @@ -0,0 +1,412 @@ +/* + * Userspace Probes (UProbes) for x86 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2008-2011 + * Authors: + * Srikar Dronamraju + * Jim Keniston + */ + +#include +#include +#include +#include + +#include +#include + +/* Post-execution fixups. */ + +/* No fixup needed */ +#define UPROBES_FIX_NONE 0x0 +/* Adjust IP back to vicinity of actual insn */ +#define UPROBES_FIX_IP 0x1 +/* Adjust the return address of a call insn */ +#define UPROBES_FIX_CALL 0x2 + +#define UPROBES_FIX_RIP_AX 0x8000 +#define UPROBES_FIX_RIP_CX 0x4000 + +/* Adaptations for mhiramat x86 decoder v14. */ +#define OPCODE1(insn) ((insn)->opcode.bytes[0]) +#define OPCODE2(insn) ((insn)->opcode.bytes[1]) +#define OPCODE3(insn) ((insn)->opcode.bytes[2]) +#define MODRM_REG(insn) X86_MODRM_REG(insn->modrm.value) + +#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ + (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ + (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ + (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ + (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ + << (row % 32)) + +#ifdef CONFIG_X86_64 +static volatile u32 good_insns_64[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */ + W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */ + W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */ + W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */ + W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ + W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ + W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ + W(0x70, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ + W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ + W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ + W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ + /* ---------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +#endif + +/* Good-instruction tables for 32-bit apps */ + +static volatile u32 good_insns_32[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */ + W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */ + W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */ + W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ + W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ + W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ + W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ + W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ + W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 
1, 1) /* f0 */ + /* ---------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; + +/* Using this for both 64-bit and 32-bit apps */ +static volatile u32 good_2byte_insns[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */ + W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ + W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ + W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ + W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ + W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ + W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ + W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */ + W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */ + /* ---------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; + +#undef W + +/* + * opcodes we'll probably never support: + * 6c-6d, e4-e5, ec-ed - in + * 6e-6f, e6-e7, ee-ef - out + * cc, cd - int3, int + * cf - iret + * d6 - illegal instruction + * f1 - int1/icebp + * f4 - hlt + * fa, fb - cli, sti + * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2 + * + * invalid opcodes in 64-bit mode: + * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5 + * + * 63 - we support this opcode in x86_64 but not in i386. 
+ * + * opcodes we may need to refine support for: + * 0f - 2-byte instructions: For many of these instructions, the validity + * depends on the prefix and/or the reg field. On such instructions, we + * just consider the opcode combination valid if it corresponds to any + * valid instruction. + * 8f - Group 1 - only reg = 0 is OK + * c6-c7 - Group 11 - only reg = 0 is OK + * d9-df - fpu insns with some illegal encodings + * f2, f3 - repnz, repz prefixes. These are also the first byte for + * certain floating-point instructions, such as addsd. + * fe - Group 4 - only reg = 0 or 1 is OK + * ff - Group 5 - only reg = 0-6 is OK + * + * others -- Do we need to support these? + * 0f - (floating-point?) prefetch instructions + * 07, 17, 1f - pop es, pop ss, pop ds + * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes -- + * but 64 and 65 (fs: and gs:) seem to be used, so we support them + * 67 - addr16 prefix + * ce - into + * f0 - lock prefix + */ + +/* + * TODO: + * - Where necessary, examine the modrm byte and allow only valid instructions + * in the different Groups and fpu instructions. + */ + +static bool is_prefix_bad(struct insn *insn) +{ + int i; + + for (i = 0; i < insn->prefixes.nbytes; i++) { + switch (insn->prefixes.bytes[i]) { + case 0x26: /*INAT_PFX_ES */ + case 0x2E: /*INAT_PFX_CS */ + case 0x36: /*INAT_PFX_DS */ + case 0x3E: /*INAT_PFX_SS */ + case 0xF0: /*INAT_PFX_LOCK */ + return true; + } + } + return false; +} + +static int validate_insn_32bits(struct uprobe *uprobe, struct insn *insn) +{ + insn_init(insn, uprobe->insn, false); + + /* Skip good instruction prefixes; reject "bad" ones. 
*/ + insn_get_opcode(insn); + if (is_prefix_bad(insn)) + return -ENOTSUPP; + if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_32)) + return 0; + if (insn->opcode.nbytes == 2) { + if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns)) + return 0; + } + return -ENOTSUPP; +} + +/* + * Figure out which fixups post_xol() will need to perform, and annotate + * uprobe->arch_info.fixups accordingly. To start with, + * uprobe->arch_info.fixups is either zero or it reflects rip-related + * fixups. + */ +static void prepare_fixups(struct uprobe *uprobe, struct insn *insn) +{ + bool fix_ip = true, fix_call = false; /* defaults */ + int reg; + + insn_get_opcode(insn); /* should be a nop */ + + switch (OPCODE1(insn)) { + case 0xc3: /* ret/lret */ + case 0xcb: + case 0xc2: + case 0xca: + /* ip is correct */ + fix_ip = false; + break; + case 0xe8: /* call relative - Fix return addr */ + fix_call = true; + break; + case 0x9a: /* call absolute - Fix return addr, not ip */ + fix_call = true; + fix_ip = false; + break; + case 0xff: + insn_get_modrm(insn); + reg = MODRM_REG(insn); + if (reg == 2 || reg == 3) { + /* call or lcall, indirect */ + /* Fix return addr; ip is correct. */ + fix_call = true; + fix_ip = false; + } else if (reg == 4 || reg == 5) { + /* jmp or ljmp, indirect */ + /* ip is correct. */ + fix_ip = false; + } + break; + case 0xea: /* jmp absolute -- ip is correct */ + fix_ip = false; + break; + default: + break; + } + if (fix_ip) + uprobe->arch_info.fixups |= UPROBES_FIX_IP; + if (fix_call) + uprobe->arch_info.fixups |= UPROBES_FIX_CALL; +} + +#ifdef CONFIG_X86_64 +/* + * If uprobe->insn doesn't use rip-relative addressing, return + * immediately. Otherwise, rewrite the instruction so that it accesses + * its memory operand indirectly through a scratch register. Set + * uprobe->arch_info.fixups and uprobe->arch_info.rip_rela_target_address + * accordingly. 
(The contents of the scratch register will be saved + * before we single-step the modified instruction, and restored + * afterward.) + * + * We do this because a rip-relative instruction can access only a + * relatively small area (+/- 2 GB from the instruction), and the XOL + * area typically lies beyond that area. At least for instructions + * that store to memory, we can't execute the original instruction + * and "fix things up" later, because the misdirected store could be + * disastrous. + * + * Some useful facts about rip-relative instructions: + * - There's always a modrm byte. + * - There's never a SIB byte. + * - The displacement is always 4 bytes. + */ +static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, + struct insn *insn) +{ + u8 *cursor; + u8 reg; + + if (mm->context.ia32_compat) + return; + + uprobe->arch_info.rip_rela_target_address = 0x0; + if (!insn_rip_relative(insn)) + return; + + /* + * insn_rip_relative() would have decoded rex_prefix, modrm. + * Clear REX.b bit (extension of MODRM.rm field): + * we want to encode rax/rcx, not r8/r9. + */ + if (insn->rex_prefix.nbytes) { + cursor = uprobe->insn + insn_offset_rex_prefix(insn); + *cursor &= 0xfe; /* Clearing REX.B bit */ + } + + /* + * Point cursor at the modrm byte. The next 4 bytes are the + * displacement. Beyond the displacement, for some instructions, + * is the immediate operand. + */ + cursor = uprobe->insn + insn_offset_modrm(insn); + insn_get_length(insn); + + /* + * Convert from rip-relative addressing to indirect addressing + * via a scratch register. Change the r/m field from 0x5 (%rip) + * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field. + */ + reg = MODRM_REG(insn); + if (reg == 0) { + /* + * The register operand (if any) is either the A register + * (%rax, %eax, etc.) or (if the 0x4 bit is set in the + * REX prefix) %r8. 
In any case, we know the C register + * is NOT the register operand, so we use %rcx (register + * #1) for the scratch register. + */ + uprobe->arch_info.fixups = UPROBES_FIX_RIP_CX; + /* Change modrm from 00 000 101 to 00 000 001. */ + *cursor = 0x1; + } else { + /* Use %rax (register #0) for the scratch register. */ + uprobe->arch_info.fixups = UPROBES_FIX_RIP_AX; + /* Change modrm from 00 xxx 101 to 00 xxx 000 */ + *cursor = (reg << 3); + } + + /* Target address = address of next instruction + (signed) offset */ + uprobe->arch_info.rip_rela_target_address = (long)insn->length + + insn->displacement.value; + /* Displacement field is gone; slide immediate field (if any) over. */ + if (insn->immediate.nbytes) { + cursor++; + memmove(cursor, cursor + insn->displacement.nbytes, + insn->immediate.nbytes); + } + return; +} + +static int validate_insn_64bits(struct uprobe *uprobe, struct insn *insn) +{ + insn_init(insn, uprobe->insn, true); + + /* Skip good instruction prefixes; reject "bad" ones. */ + insn_get_opcode(insn); + if (is_prefix_bad(insn)) + return -ENOTSUPP; + if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_64)) + return 0; + if (insn->opcode.nbytes == 2) { + if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns)) + return 0; + } + return -ENOTSUPP; +} + +static int validate_insn_bits(struct mm_struct *mm, struct uprobe *uprobe, + struct insn *insn) +{ + if (mm->context.ia32_compat) + return validate_insn_32bits(uprobe, insn); + return validate_insn_64bits(uprobe, insn); +} +#else +static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, + struct insn *insn) +{ + return; +} + +static int validate_insn_bits(struct mm_struct *mm, struct uprobe *uprobe, + struct insn *insn) +{ + return validate_insn_32bits(uprobe, insn); +} +#endif /* CONFIG_X86_64 */ + +/** + * analyze_insn - instruction analysis including validity and fixups. + * @mm: the probed address space. + * @uprobe: the probepoint information. 
+ * Return 0 on success or a -ve number on error. + */ +int analyze_insn(struct mm_struct *mm, struct uprobe *uprobe) +{ + int ret; + struct insn insn; + + uprobe->arch_info.fixups = 0; + ret = validate_insn_bits(mm, uprobe, &insn); + if (ret != 0) + return ret; + handle_riprel_insn(mm, uprobe, &insn); + prepare_fixups(uprobe, &insn); + return 0; +} diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h new file mode 100644 index 000000000000..f1d13fd140f2 --- /dev/null +++ b/include/linux/uprobes.h @@ -0,0 +1,98 @@ +#ifndef _LINUX_UPROBES_H +#define _LINUX_UPROBES_H +/* + * Userspace Probes (UProbes) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2008-2011 + * Authors: + * Srikar Dronamraju + * Jim Keniston + */ + +#include +#include + +struct vm_area_struct; +#ifdef CONFIG_ARCH_SUPPORTS_UPROBES +#include +#else + +typedef u8 uprobe_opcode_t; +struct uprobe_arch_info {}; + +#define MAX_UINSN_BYTES 4 +#endif + +#define uprobe_opcode_sz sizeof(uprobe_opcode_t) + +/* flags that denote/change uprobes behaviour */ +/* Have a copy of original instruction */ +#define UPROBES_COPY_INSN 0x1 +/* Dont run handlers when first register/ last unregister in progress*/ +#define UPROBES_RUN_HANDLER 0x2 + +struct uprobe_consumer { + int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); + /* + * filter is optional; If a filter exists, handler is run + * if and only if filter returns true. + */ + bool (*filter)(struct uprobe_consumer *self, struct task_struct *task); + + struct uprobe_consumer *next; +}; + +struct uprobe { + struct rb_node rb_node; /* node in the rb tree */ + atomic_t ref; + struct rw_semaphore consumer_rwsem; + struct list_head pending_list; + struct uprobe_arch_info arch_info; + struct uprobe_consumer *consumers; + struct inode *inode; /* Also hold a ref to inode */ + loff_t offset; + int flags; + u8 insn[MAX_UINSN_BYTES]; +}; + +#ifdef CONFIG_UPROBES +extern int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe, + unsigned long vaddr); +extern int __weak set_orig_insn(struct mm_struct *mm, struct uprobe *uprobe, + unsigned long vaddr, bool verify); +extern bool __weak is_bkpt_insn(uprobe_opcode_t *insn); +extern int register_uprobe(struct inode *inode, loff_t offset, + struct uprobe_consumer *consumer); +extern void unregister_uprobe(struct inode *inode, loff_t offset, + struct uprobe_consumer *consumer); +extern int mmap_uprobe(struct vm_area_struct *vma); +#else /* CONFIG_UPROBES is not defined */ +static inline int register_uprobe(struct inode *inode, loff_t offset, + struct uprobe_consumer *consumer) +{ + return -ENOSYS; +} +static inline 
void unregister_uprobe(struct inode *inode, loff_t offset, + struct uprobe_consumer *consumer) +{ +} +static inline int mmap_uprobe(struct vm_area_struct *vma) +{ + return 0; +} +#endif /* CONFIG_UPROBES */ +#endif /* _LINUX_UPROBES_H */ diff --git a/kernel/Makefile b/kernel/Makefile index 2d9de86b7e76..8609dd3d875a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -107,6 +107,7 @@ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o obj-$(CONFIG_PADATA) += padata.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o +obj-$(CONFIG_UPROBES) += uprobes.o $(obj)/configs.o: $(obj)/config_data.h diff --git a/kernel/uprobes.c b/kernel/uprobes.c new file mode 100644 index 000000000000..72e8bb3b52cd --- /dev/null +++ b/kernel/uprobes.c @@ -0,0 +1,976 @@ +/* + * Userspace Probes (UProbes) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2008-2011 + * Authors: + * Srikar Dronamraju + * Jim Keniston + */ + +#include +#include +#include /* read_mapping_page */ +#include +#include +#include /* anon_vma_prepare */ +#include /* set_pte_at_notify */ +#include /* try_to_free_swap */ +#include + +static struct rb_root uprobes_tree = RB_ROOT; +static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ + +#define UPROBES_HASH_SZ 13 +/* serialize (un)register */ +static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; +#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) %\ + UPROBES_HASH_SZ]) + +/* serialize uprobe->pending_list */ +static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; +#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) %\ + UPROBES_HASH_SZ]) + +/* + * uprobe_events allows us to skip the mmap_uprobe if there are no uprobe + * events active at this time. Probably a fine grained per inode count is + * better? + */ +static atomic_t uprobe_events = ATOMIC_INIT(0); + +/* + * Maintain a temporary per vma info that can be used to search if a vma + * has already been handled. This structure is introduced since extending + * vm_area_struct wasnt recommended. + */ +struct vma_info { + struct list_head probe_list; + struct mm_struct *mm; + loff_t vaddr; +}; + +/* + * valid_vma: Verify if the specified vma is an executable vma + * Relax restrictions while unregistering: vm_flags might have + * changed after breakpoint was inserted. + * - is_register: indicates if we are in register context. + * - Return 1 if the specified virtual address is in an + * executable vma. 
+ */ +static bool valid_vma(struct vm_area_struct *vma, bool is_register) +{ + if (!vma->vm_file) + return false; + + if (!is_register) + return true; + + if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == + (VM_READ|VM_EXEC)) + return true; + + return false; +} + +static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) +{ + loff_t vaddr; + + vaddr = vma->vm_start + offset; + vaddr -= vma->vm_pgoff << PAGE_SHIFT; + return vaddr; +} + +/** + * __replace_page - replace page in vma by new page. + * based on replace_page in mm/ksm.c + * + * @vma: vma that holds the pte pointing to page + * @page: the cowed page we are replacing by kpage + * @kpage: the modified page we replace page by + * + * Returns 0 on success, -EFAULT on failure. + */ +static int __replace_page(struct vm_area_struct *vma, struct page *page, + struct page *kpage) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep; + spinlock_t *ptl; + unsigned long addr; + int err = -EFAULT; + + addr = page_address_in_vma(page, vma); + if (addr == -EFAULT) + goto out; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + goto out; + + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!ptep) + goto out; + + get_page(kpage); + page_add_new_anon_rmap(kpage, vma, addr); + + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush(vma, addr, ptep); + set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); + + page_remove_rmap(page); + if (!page_mapped(page)) + try_to_free_swap(page); + put_page(page); + pte_unmap_unlock(ptep, ptl); + err = 0; + +out: + return err; +} + +/** + * is_bkpt_insn - check if instruction is breakpoint instruction. + * @insn: instruction to be checked. + * Default implementation of is_bkpt_insn + * Returns true if @insn is a breakpoint instruction. 
+ */ +bool __weak is_bkpt_insn(uprobe_opcode_t *insn) +{ + return (*insn == UPROBES_BKPT_INSN); +} + +/* + * NOTE: + * Expect the breakpoint instruction to be the smallest size instruction for + * the architecture. If an arch has variable length instruction and the + * breakpoint instruction is not of the smallest length instruction + * supported by that architecture then we need to modify read_opcode / + * write_opcode accordingly. This would never be a problem for archs that + * have fixed length instructions. + */ + +/* + * write_opcode - write the opcode at a given virtual address. + * @mm: the probed process address space. + * @uprobe: the breakpointing information. + * @vaddr: the virtual address to store the opcode. + * @opcode: opcode to be written at @vaddr. + * + * Called with mm->mmap_sem held (for read and with a reference to + * mm). + * + * For mm @mm, write the opcode at @vaddr. + * Return 0 (success) or a negative errno. + */ +static int write_opcode(struct mm_struct *mm, struct uprobe *uprobe, + unsigned long vaddr, uprobe_opcode_t opcode) +{ + struct page *old_page, *new_page; + struct address_space *mapping; + void *vaddr_old, *vaddr_new; + struct vm_area_struct *vma; + loff_t addr; + int ret; + + /* Read the page with vaddr into memory */ + ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); + if (ret <= 0) + return ret; + ret = -EINVAL; + + /* + * We are interested in text pages only. Our pages of interest + * should be mapped for read and execute only. We desist from + * adding probes in write mapped pages since the breakpoints + * might end up in the file copy. 
+ */ + if (!valid_vma(vma, is_bkpt_insn(&opcode))) + goto put_out; + + mapping = uprobe->inode->i_mapping; + if (mapping != vma->vm_file->f_mapping) + goto put_out; + + addr = vma_address(vma, uprobe->offset); + if (vaddr != (unsigned long)addr) + goto put_out; + + ret = -ENOMEM; + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); + if (!new_page) + goto put_out; + + __SetPageUptodate(new_page); + + /* + * lock page will serialize against do_wp_page()'s + * PageAnon() handling + */ + lock_page(old_page); + /* copy the page now that we've got it stable */ + vaddr_old = kmap_atomic(old_page); + vaddr_new = kmap_atomic(new_page); + + memcpy(vaddr_new, vaddr_old, PAGE_SIZE); + /* poke the new insn in, ASSUMES we don't cross page boundary */ + vaddr &= ~PAGE_MASK; + BUG_ON(vaddr + uprobe_opcode_sz > PAGE_SIZE); + memcpy(vaddr_new + vaddr, &opcode, uprobe_opcode_sz); + + kunmap_atomic(vaddr_new); + kunmap_atomic(vaddr_old); + + ret = anon_vma_prepare(vma); + if (ret) + goto unlock_out; + + lock_page(new_page); + ret = __replace_page(vma, old_page, new_page); + unlock_page(new_page); + +unlock_out: + unlock_page(old_page); + page_cache_release(new_page); + +put_out: + put_page(old_page); /* we did a get_page in the beginning */ + return ret; +} + +/** + * read_opcode - read the opcode at a given virtual address. + * @mm: the probed process address space. + * @vaddr: the virtual address to read the opcode. + * @opcode: location to store the read opcode. + * + * Called with mm->mmap_sem held (for read and with a reference to + * mm. + * + * For mm @mm, read the opcode at @vaddr and store it in @opcode. + * Return 0 (success) or a negative errno. 
+ */ +static int read_opcode(struct mm_struct *mm, unsigned long vaddr, + uprobe_opcode_t *opcode) +{ + struct page *page; + void *vaddr_new; + int ret; + + ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL); + if (ret <= 0) + return ret; + + lock_page(page); + vaddr_new = kmap_atomic(page); + vaddr &= ~PAGE_MASK; + memcpy(opcode, vaddr_new + vaddr, uprobe_opcode_sz); + kunmap_atomic(vaddr_new); + unlock_page(page); + put_page(page); /* we did a get_user_pages in the beginning */ + return 0; +} + +static int is_bkpt_at_addr(struct mm_struct *mm, unsigned long vaddr) +{ + uprobe_opcode_t opcode; + int result = read_opcode(mm, vaddr, &opcode); + + if (result) + return result; + + if (is_bkpt_insn(&opcode)) + return 1; + + return 0; +} + +/** + * set_bkpt - store breakpoint at a given address. + * @mm: the probed process address space. + * @uprobe: the probepoint information. + * @vaddr: the virtual address to insert the opcode. + * + * For mm @mm, store the breakpoint instruction at @vaddr. + * Return 0 (success) or a negative errno. + */ +int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe, + unsigned long vaddr) +{ + int result = is_bkpt_at_addr(mm, vaddr); + + if (result == 1) + return -EEXIST; + + if (result) + return result; + + return write_opcode(mm, uprobe, vaddr, UPROBES_BKPT_INSN); +} + +/** + * set_orig_insn - Restore the original instruction. + * @mm: the probed process address space. + * @uprobe: the probepoint information. + * @vaddr: the virtual address to insert the opcode. + * @verify: if true, verify existance of breakpoint instruction. + * + * For mm @mm, restore the original opcode (opcode) at @vaddr. + * Return 0 (success) or a negative errno. 
+ */ +int __weak set_orig_insn(struct mm_struct *mm, struct uprobe *uprobe, + unsigned long vaddr, bool verify) +{ + if (verify) { + int result = is_bkpt_at_addr(mm, vaddr); + + if (!result) + return -EINVAL; + + if (result != 1) + return result; + } + return write_opcode(mm, uprobe, vaddr, + *(uprobe_opcode_t *)uprobe->insn); +} + +static int match_uprobe(struct uprobe *l, struct uprobe *r) +{ + if (l->inode < r->inode) + return -1; + if (l->inode > r->inode) + return 1; + else { + if (l->offset < r->offset) + return -1; + + if (l->offset > r->offset) + return 1; + } + + return 0; +} + +static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) +{ + struct uprobe u = { .inode = inode, .offset = offset }; + struct rb_node *n = uprobes_tree.rb_node; + struct uprobe *uprobe; + int match; + + while (n) { + uprobe = rb_entry(n, struct uprobe, rb_node); + match = match_uprobe(&u, uprobe); + if (!match) { + atomic_inc(&uprobe->ref); + return uprobe; + } + if (match < 0) + n = n->rb_left; + else + n = n->rb_right; + } + return NULL; +} + +/* + * Find a uprobe corresponding to a given inode:offset + * Acquires uprobes_treelock + */ +static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) +{ + struct uprobe *uprobe; + unsigned long flags; + + spin_lock_irqsave(&uprobes_treelock, flags); + uprobe = __find_uprobe(inode, offset); + spin_unlock_irqrestore(&uprobes_treelock, flags); + return uprobe; +} + +static struct uprobe *__insert_uprobe(struct uprobe *uprobe) +{ + struct rb_node **p = &uprobes_tree.rb_node; + struct rb_node *parent = NULL; + struct uprobe *u; + int match; + + while (*p) { + parent = *p; + u = rb_entry(parent, struct uprobe, rb_node); + match = match_uprobe(uprobe, u); + if (!match) { + atomic_inc(&u->ref); + return u; + } + + if (match < 0) + p = &parent->rb_left; + else + p = &parent->rb_right; + + } + u = NULL; + rb_link_node(&uprobe->rb_node, parent, p); + rb_insert_color(&uprobe->rb_node, &uprobes_tree); + /* get access + 
creation ref */ + atomic_set(&uprobe->ref, 2); + return u; +} + +/* + * Acquires uprobes_treelock. + * Matching uprobe already exists in rbtree; + * increment (access refcount) and return the matching uprobe. + * + * No matching uprobe; insert the uprobe in rb_tree; + * get a double refcount (access + creation) and return NULL. + */ +static struct uprobe *insert_uprobe(struct uprobe *uprobe) +{ + unsigned long flags; + struct uprobe *u; + + spin_lock_irqsave(&uprobes_treelock, flags); + u = __insert_uprobe(uprobe); + spin_unlock_irqrestore(&uprobes_treelock, flags); + return u; +} + +static void put_uprobe(struct uprobe *uprobe) +{ + if (atomic_dec_and_test(&uprobe->ref)) + kfree(uprobe); +} + +static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) +{ + struct uprobe *uprobe, *cur_uprobe; + + uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL); + if (!uprobe) + return NULL; + + uprobe->inode = igrab(inode); + uprobe->offset = offset; + init_rwsem(&uprobe->consumer_rwsem); + INIT_LIST_HEAD(&uprobe->pending_list); + + /* add to uprobes_tree, sorted on inode:offset */ + cur_uprobe = insert_uprobe(uprobe); + + /* a uprobe exists for this inode:offset combination */ + if (cur_uprobe) { + kfree(uprobe); + uprobe = cur_uprobe; + iput(inode); + } else + atomic_inc(&uprobe_events); + return uprobe; +} + +/* Returns the previous consumer */ +static struct uprobe_consumer *add_consumer(struct uprobe *uprobe, + struct uprobe_consumer *consumer) +{ + down_write(&uprobe->consumer_rwsem); + consumer->next = uprobe->consumers; + uprobe->consumers = consumer; + up_write(&uprobe->consumer_rwsem); + return consumer->next; +} + +/* + * For uprobe @uprobe, delete the consumer @consumer. + * Return true if the @consumer is deleted successfully + * or return false. 
+ */ +static bool del_consumer(struct uprobe *uprobe, + struct uprobe_consumer *consumer) +{ + struct uprobe_consumer **con; + bool ret = false; + + down_write(&uprobe->consumer_rwsem); + for (con = &uprobe->consumers; *con; con = &(*con)->next) { + if (*con == consumer) { + *con = consumer->next; + ret = true; + break; + } + } + up_write(&uprobe->consumer_rwsem); + return ret; +} + +static int __copy_insn(struct address_space *mapping, + struct vm_area_struct *vma, char *insn, + unsigned long nbytes, unsigned long offset) +{ + struct file *filp = vma->vm_file; + struct page *page; + void *vaddr; + unsigned long off1; + unsigned long idx; + + if (!filp) + return -EINVAL; + + idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT); + off1 = offset &= ~PAGE_MASK; + + /* + * Ensure that the page that has the original instruction is + * populated and in page-cache. + */ + page = read_mapping_page(mapping, idx, filp); + if (IS_ERR(page)) + return PTR_ERR(page); + + vaddr = kmap_atomic(page); + memcpy(insn, vaddr + off1, nbytes); + kunmap_atomic(vaddr); + page_cache_release(page); + return 0; +} + +static int copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, + unsigned long addr) +{ + struct address_space *mapping; + int bytes; + unsigned long nbytes; + + addr &= ~PAGE_MASK; + nbytes = PAGE_SIZE - addr; + mapping = uprobe->inode->i_mapping; + + /* Instruction at end of binary; copy only available bytes */ + if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size) + bytes = uprobe->inode->i_size - uprobe->offset; + else + bytes = MAX_UINSN_BYTES; + + /* Instruction at the page-boundary; copy bytes in second page */ + if (nbytes < bytes) { + if (__copy_insn(mapping, vma, uprobe->insn + nbytes, + bytes - nbytes, uprobe->offset + nbytes)) + return -ENOMEM; + + bytes = nbytes; + } + return __copy_insn(mapping, vma, uprobe->insn, bytes, uprobe->offset); +} + +static int install_breakpoint(struct mm_struct *mm, struct uprobe *uprobe, + struct vm_area_struct *vma, 
loff_t vaddr) +{ + unsigned long addr; + int ret; + + /* + * If probe is being deleted, unregister thread could be done with + * the vma-rmap-walk through. Adding a probe now can be fatal since + * nobody will be able to cleanup. Also we could be from fork or + * mremap path, where the probe might have already been inserted. + * Hence behave as if probe already existed. + */ + if (!uprobe->consumers) + return -EEXIST; + + addr = (unsigned long)vaddr; + if (!(uprobe->flags & UPROBES_COPY_INSN)) { + ret = copy_insn(uprobe, vma, addr); + if (ret) + return ret; + + if (is_bkpt_insn((uprobe_opcode_t *)uprobe->insn)) + return -EEXIST; + + ret = analyze_insn(mm, uprobe); + if (ret) + return ret; + + uprobe->flags |= UPROBES_COPY_INSN; + } + ret = set_bkpt(mm, uprobe, addr); + + return ret; +} + +static void remove_breakpoint(struct mm_struct *mm, struct uprobe *uprobe, + loff_t vaddr) +{ + set_orig_insn(mm, uprobe, (unsigned long)vaddr, true); +} + +static void delete_uprobe(struct uprobe *uprobe) +{ + unsigned long flags; + + spin_lock_irqsave(&uprobes_treelock, flags); + rb_erase(&uprobe->rb_node, &uprobes_tree); + spin_unlock_irqrestore(&uprobes_treelock, flags); + iput(uprobe->inode); + put_uprobe(uprobe); + atomic_dec(&uprobe_events); +} + +static struct vma_info *__find_next_vma_info(struct list_head *head, + loff_t offset, struct address_space *mapping, + struct vma_info *vi, bool is_register) +{ + struct prio_tree_iter iter; + struct vm_area_struct *vma; + struct vma_info *tmpvi; + loff_t vaddr; + unsigned long pgoff = offset >> PAGE_SHIFT; + int existing_vma; + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + if (!valid_vma(vma, is_register)) + continue; + + existing_vma = 0; + vaddr = vma_address(vma, offset); + list_for_each_entry(tmpvi, head, probe_list) { + if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) { + existing_vma = 1; + break; + } + } + + /* + * Another vma needs a probe to be installed. 
However skip + * installing the probe if the vma is about to be unlinked. + */ + if (!existing_vma && + atomic_inc_not_zero(&vma->vm_mm->mm_users)) { + vi->mm = vma->vm_mm; + vi->vaddr = vaddr; + list_add(&vi->probe_list, head); + return vi; + } + } + return NULL; +} + +/* + * Iterate in the rmap prio tree and find a vma where a probe has not + * yet been inserted. + */ +static struct vma_info *find_next_vma_info(struct list_head *head, + loff_t offset, struct address_space *mapping, + bool is_register) +{ + struct vma_info *vi, *retvi; + vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL); + if (!vi) + return ERR_PTR(-ENOMEM); + + mutex_lock(&mapping->i_mmap_mutex); + retvi = __find_next_vma_info(head, offset, mapping, vi, is_register); + mutex_unlock(&mapping->i_mmap_mutex); + + if (!retvi) + kfree(vi); + return retvi; +} + +static int register_for_each_vma(struct uprobe *uprobe, bool is_register) +{ + struct list_head try_list; + struct vm_area_struct *vma; + struct address_space *mapping; + struct vma_info *vi, *tmpvi; + struct mm_struct *mm; + loff_t vaddr; + int ret = 0; + + mapping = uprobe->inode->i_mapping; + INIT_LIST_HEAD(&try_list); + while ((vi = find_next_vma_info(&try_list, uprobe->offset, + mapping, is_register)) != NULL) { + if (IS_ERR(vi)) { + ret = PTR_ERR(vi); + break; + } + mm = vi->mm; + down_read(&mm->mmap_sem); + vma = find_vma(mm, (unsigned long)vi->vaddr); + if (!vma || !valid_vma(vma, is_register)) { + list_del(&vi->probe_list); + kfree(vi); + up_read(&mm->mmap_sem); + mmput(mm); + continue; + } + vaddr = vma_address(vma, uprobe->offset); + if (vma->vm_file->f_mapping->host != uprobe->inode || + vaddr != vi->vaddr) { + list_del(&vi->probe_list); + kfree(vi); + up_read(&mm->mmap_sem); + mmput(mm); + continue; + } + + if (is_register) + ret = install_breakpoint(mm, uprobe, vma, vi->vaddr); + else + remove_breakpoint(mm, uprobe, vi->vaddr); + + up_read(&mm->mmap_sem); + mmput(mm); + if (is_register) { + if (ret && ret == -EEXIST) + ret = 0; + 
if (ret) + break; + } + } + list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) { + list_del(&vi->probe_list); + kfree(vi); + } + return ret; +} + +static int __register_uprobe(struct uprobe *uprobe) +{ + return register_for_each_vma(uprobe, true); +} + +static void __unregister_uprobe(struct uprobe *uprobe) +{ + if (!register_for_each_vma(uprobe, false)) + delete_uprobe(uprobe); + + /* TODO : cant unregister? schedule a worker thread */ +} + +/* + * register_uprobe - register a probe + * @inode: the file in which the probe has to be placed. + * @offset: offset from the start of the file. + * @consumer: information on howto handle the probe.. + * + * Apart from the access refcount, register_uprobe() takes a creation + * refcount (thro alloc_uprobe) if and only if this @uprobe is getting + * inserted into the rbtree (i.e first consumer for a @inode:@offset + * tuple). Creation refcount stops unregister_uprobe from freeing the + * @uprobe even before the register operation is complete. Creation + * refcount is released when the last @consumer for the @uprobe + * unregisters. + * + * Return errno if it cannot successully install probes + * else return 0 (success) + */ +int register_uprobe(struct inode *inode, loff_t offset, + struct uprobe_consumer *consumer) +{ + struct uprobe *uprobe; + int ret = -EINVAL; + + if (!inode || !consumer || consumer->next) + return ret; + + if (offset > i_size_read(inode)) + return ret; + + ret = 0; + mutex_lock(uprobes_hash(inode)); + uprobe = alloc_uprobe(inode, offset); + if (uprobe && !add_consumer(uprobe, consumer)) { + ret = __register_uprobe(uprobe); + if (ret) { + uprobe->consumers = NULL; + __unregister_uprobe(uprobe); + } else + uprobe->flags |= UPROBES_RUN_HANDLER; + } + + mutex_unlock(uprobes_hash(inode)); + put_uprobe(uprobe); + + return ret; +} + +/* + * unregister_uprobe - unregister a already registered probe. + * @inode: the file in which the probe has to be removed. + * @offset: offset from the start of the file. 
+ * @consumer: identify which probe if multiple probes are colocated. + */ +void unregister_uprobe(struct inode *inode, loff_t offset, + struct uprobe_consumer *consumer) +{ + struct uprobe *uprobe = NULL; + + if (!inode || !consumer) + return; + + uprobe = find_uprobe(inode, offset); + if (!uprobe) + return; + + mutex_lock(uprobes_hash(inode)); + if (!del_consumer(uprobe, consumer)) + goto unreg_out; + + if (!uprobe->consumers) { + __unregister_uprobe(uprobe); + uprobe->flags &= ~UPROBES_RUN_HANDLER; + } + +unreg_out: + mutex_unlock(uprobes_hash(inode)); + if (uprobe) + put_uprobe(uprobe); +} + +/* + * Of all the nodes that correspond to the given inode, return the node + * with the least offset. + */ +static struct rb_node *find_least_offset_node(struct inode *inode) +{ + struct uprobe u = { .inode = inode, .offset = 0}; + struct rb_node *n = uprobes_tree.rb_node; + struct rb_node *close_node = NULL; + struct uprobe *uprobe; + int match; + + while (n) { + uprobe = rb_entry(n, struct uprobe, rb_node); + match = match_uprobe(&u, uprobe); + if (uprobe->inode == inode) + close_node = n; + + if (!match) + return close_node; + + if (match < 0) + n = n->rb_left; + else + n = n->rb_right; + } + return close_node; +} + +/* + * For a given inode, build a list of probes that need to be inserted. + */ +static void build_probe_list(struct inode *inode, struct list_head *head) +{ + struct uprobe *uprobe; + struct rb_node *n; + unsigned long flags; + + spin_lock_irqsave(&uprobes_treelock, flags); + n = find_least_offset_node(inode); + for (; n; n = rb_next(n)) { + uprobe = rb_entry(n, struct uprobe, rb_node); + if (uprobe->inode != inode) + break; + + list_add(&uprobe->pending_list, head); + atomic_inc(&uprobe->ref); + } + spin_unlock_irqrestore(&uprobes_treelock, flags); +} + +/* + * Called from mmap_region. + * called with mm->mmap_sem acquired. + * + * Return -ve no if we fail to insert probes and we cannot + * bail-out. + * Return 0 otherwise. 
i.e : + * - successful insertion of probes + * - (or) no possible probes to be inserted. + * - (or) insertion of probes failed but we can bail-out. + */ +int mmap_uprobe(struct vm_area_struct *vma) +{ + struct list_head tmp_list; + struct uprobe *uprobe, *u; + struct inode *inode; + int ret = 0; + + if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) + return ret; /* Bail-out */ + + inode = vma->vm_file->f_mapping->host; + if (!inode) + return ret; + + INIT_LIST_HEAD(&tmp_list); + mutex_lock(uprobes_mmap_hash(inode)); + build_probe_list(inode, &tmp_list); + list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { + loff_t vaddr; + + list_del(&uprobe->pending_list); + if (!ret) { + vaddr = vma_address(vma, uprobe->offset); + if (vaddr < vma->vm_start || vaddr >= vma->vm_end) { + put_uprobe(uprobe); + continue; + } + ret = install_breakpoint(vma->vm_mm, uprobe, vma, + vaddr); + if (ret == -EEXIST) + ret = 0; + } + put_uprobe(uprobe); + } + + mutex_unlock(uprobes_mmap_hash(inode)); + + return ret; +} + +static int __init init_uprobes(void) +{ + int i; + + for (i = 0; i < UPROBES_HASH_SZ; i++) { + mutex_init(&uprobes_mutex[i]); + mutex_init(&uprobes_mmap_mutex[i]); + } + return 0; +} + +static void __exit exit_uprobes(void) +{ +} + +module_init(init_uprobes); +module_exit(exit_uprobes); diff --git a/mm/mmap.c b/mm/mmap.c index 3f758c7f4c81..1aed183636d7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -616,6 +617,13 @@ again: remove_next = 1 + (end > next->vm_end); if (mapping) mutex_unlock(&mapping->i_mmap_mutex); + if (root) { + mmap_uprobe(vma); + + if (adjust_next) + mmap_uprobe(next); + } + if (remove_next) { if (file) { fput(file); @@ -637,6 +645,8 @@ again: remove_next = 1 + (end > next->vm_end); goto again; } } + if (insert && file) + mmap_uprobe(insert); validate_mm(mm); @@ -1329,6 +1339,11 @@ out: mm->locked_vm += (len >> PAGE_SHIFT); } else if ((flags & MAP_POPULATE) && !(flags & 
MAP_NONBLOCK)) make_pages_present(addr, addr + len); + + if (file && mmap_uprobe(vma)) + /* matching probes but cannot insert */ + goto unmap_and_free_vma; + return addr; unmap_and_free_vma: @@ -2285,6 +2300,10 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) if ((vma->vm_flags & VM_ACCOUNT) && security_vm_enough_memory_mm(mm, vma_pages(vma))) return -ENOMEM; + + if (vma->vm_file && mmap_uprobe(vma)) + return -EINVAL; + vma_link(mm, vma, prev, rb_link, rb_parent); return 0; } @@ -2354,6 +2373,10 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma->vm_pgoff = pgoff; if (new_vma->vm_file) { get_file(new_vma->vm_file); + + if (mmap_uprobe(new_vma)) + goto out_free_mempol; + if (vma->vm_flags & VM_EXECUTABLE) added_exe_file_vma(mm); } -- cgit v1.2.3 From 7b2d81d48a2d8e37efb6ce7b4d5ef58822b30d89 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 17 Feb 2012 09:27:41 +0100 Subject: uprobes/core: Clean up, refactor and improve the code Make the uprobes code readable to me: - improve the Kconfig text so that a mere mortal gets some idea what CONFIG_UPROBES=y is really about - do trivial renames to standardize around the uprobes_*() namespace - clean up and simplify various code flow details - separate basic blocks of functionality - line break artifact and white space related removal - use standard local variable definition blocks - use vertical spacing to make things more readable - remove unnecessary volatile - restructure comment blocks to make them more uniform and more readable in general Cc: Srikar Dronamraju Cc: Jim Keniston Cc: Peter Zijlstra Cc: Oleg Nesterov Cc: Masami Hiramatsu Cc: Arnaldo Carvalho de Melo Cc: Anton Arapov Cc: Ananth N Mavinakayanahalli Link: http://lkml.kernel.org/n/tip-ewbwhb8o6navvllsauu7k07p@git.kernel.org Signed-off-by: Ingo Molnar --- arch/Kconfig | 14 ++- arch/x86/include/asm/uprobes.h | 17 ++-- arch/x86/kernel/uprobes.c | 129 ++++++++++++------------ include/linux/uprobes.h | 28 +++--- 
kernel/uprobes.c | 219 ++++++++++++++++++++++++----------------- mm/mmap.c | 12 +-- 6 files changed, 233 insertions(+), 186 deletions(-) (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index 284f5898f526..cca5b545d806 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -66,13 +66,19 @@ config OPTPROBES depends on !PREEMPT config UPROBES - bool "User-space probes (EXPERIMENTAL)" + bool "Transparent user-space probes (EXPERIMENTAL)" depends on ARCH_SUPPORTS_UPROBES default n help - Uprobes enables kernel subsystems to establish probepoints - in user applications and execute handler functions when - the probepoints are hit. + Uprobes are the user-space counterpart to kprobes: they + enable instrumentation applications (such as 'perf probe') + to establish unintrusive probes in user-space binaries and + libraries, by executing handler functions when the probes + are hit by user-space applications. + + ( These probes come in the form of single-byte breakpoints, + managed by the kernel and kept transparent to the probed + application. ) If in doubt, say "N". 
diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 8208234391ff..072df3902636 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -1,7 +1,7 @@ #ifndef _ASM_UPROBES_H #define _ASM_UPROBES_H /* - * Userspace Probes (UProbes) for x86 + * User-space Probes (UProbes) for x86 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -24,19 +24,20 @@ */ typedef u8 uprobe_opcode_t; -#define MAX_UINSN_BYTES 16 -#define UPROBES_XOL_SLOT_BYTES 128 /* to keep it cache aligned */ -#define UPROBES_BKPT_INSN 0xcc -#define UPROBES_BKPT_INSN_SIZE 1 +#define MAX_UINSN_BYTES 16 +#define UPROBES_XOL_SLOT_BYTES 128 /* to keep it cache aligned */ + +#define UPROBES_BKPT_INSN 0xcc +#define UPROBES_BKPT_INSN_SIZE 1 struct uprobe_arch_info { - u16 fixups; + u16 fixups; #ifdef CONFIG_X86_64 - unsigned long rip_rela_target_address; + unsigned long rip_rela_target_address; #endif }; struct uprobe; -extern int analyze_insn(struct mm_struct *mm, struct uprobe *uprobe); +extern int arch_uprobes_analyze_insn(struct mm_struct *mm, struct uprobe *uprobe); #endif /* _ASM_UPROBES_H */ diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 2a301bb91bdb..cf2a18498425 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -1,5 +1,5 @@ /* - * Userspace Probes (UProbes) for x86 + * User-space Probes (UProbes) for x86 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,7 +20,6 @@ * Srikar Dronamraju * Jim Keniston */ - #include #include #include @@ -42,10 +41,10 @@ #define UPROBES_FIX_RIP_CX 0x4000 /* Adaptations for mhiramat x86 decoder v14. 
*/ -#define OPCODE1(insn) ((insn)->opcode.bytes[0]) -#define OPCODE2(insn) ((insn)->opcode.bytes[1]) -#define OPCODE3(insn) ((insn)->opcode.bytes[2]) -#define MODRM_REG(insn) X86_MODRM_REG(insn->modrm.value) +#define OPCODE1(insn) ((insn)->opcode.bytes[0]) +#define OPCODE2(insn) ((insn)->opcode.bytes[1]) +#define OPCODE3(insn) ((insn)->opcode.bytes[2]) +#define MODRM_REG(insn) X86_MODRM_REG(insn->modrm.value) #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ @@ -55,7 +54,7 @@ << (row % 32)) #ifdef CONFIG_X86_64 -static volatile u32 good_insns_64[256 / 32] = { +static u32 good_insns_64[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */ @@ -81,7 +80,7 @@ static volatile u32 good_insns_64[256 / 32] = { /* Good-instruction tables for 32-bit apps */ -static volatile u32 good_insns_32[256 / 32] = { +static u32 good_insns_32[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */ @@ -105,7 +104,7 @@ static volatile u32 good_insns_32[256 / 32] = { }; /* Using this for both 64-bit and 32-bit apps */ -static volatile u32 good_2byte_insns[256 / 32] = { +static u32 good_2byte_insns[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */ @@ -132,42 +131,47 @@ static volatile u32 good_2byte_insns[256 / 32] = { /* * opcodes we'll probably never support: - * 6c-6d, e4-e5, ec-ed - in - * 6e-6f, e6-e7, ee-ef - out - * cc, cd - int3, int - * cf - iret - * d6 - illegal instruction - * f1 - int1/icebp - * f4 - hlt - * fa, fb - cli, sti - * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2 + * + * 6c-6d, e4-e5, ec-ed 
- in + * 6e-6f, e6-e7, ee-ef - out + * cc, cd - int3, int + * cf - iret + * d6 - illegal instruction + * f1 - int1/icebp + * f4 - hlt + * fa, fb - cli, sti + * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2 * * invalid opcodes in 64-bit mode: - * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5 * - * 63 - we support this opcode in x86_64 but not in i386. + * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5 + * 63 - we support this opcode in x86_64 but not in i386. * * opcodes we may need to refine support for: - * 0f - 2-byte instructions: For many of these instructions, the validity - * depends on the prefix and/or the reg field. On such instructions, we - * just consider the opcode combination valid if it corresponds to any - * valid instruction. - * 8f - Group 1 - only reg = 0 is OK - * c6-c7 - Group 11 - only reg = 0 is OK - * d9-df - fpu insns with some illegal encodings - * f2, f3 - repnz, repz prefixes. These are also the first byte for - * certain floating-point instructions, such as addsd. - * fe - Group 4 - only reg = 0 or 1 is OK - * ff - Group 5 - only reg = 0-6 is OK + * + * 0f - 2-byte instructions: For many of these instructions, the validity + * depends on the prefix and/or the reg field. On such instructions, we + * just consider the opcode combination valid if it corresponds to any + * valid instruction. + * + * 8f - Group 1 - only reg = 0 is OK + * c6-c7 - Group 11 - only reg = 0 is OK + * d9-df - fpu insns with some illegal encodings + * f2, f3 - repnz, repz prefixes. These are also the first byte for + * certain floating-point instructions, such as addsd. + * + * fe - Group 4 - only reg = 0 or 1 is OK + * ff - Group 5 - only reg = 0-6 is OK * * others -- Do we need to support these? - * 0f - (floating-point?) prefetch instructions - * 07, 17, 1f - pop es, pop ss, pop ds - * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes -- + * + * 0f - (floating-point?) 
prefetch instructions + * 07, 17, 1f - pop es, pop ss, pop ds + * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes -- * but 64 and 65 (fs: and gs:) seem to be used, so we support them - * 67 - addr16 prefix - * ce - into - * f0 - lock prefix + * 67 - addr16 prefix + * ce - into + * f0 - lock prefix */ /* @@ -182,11 +186,11 @@ static bool is_prefix_bad(struct insn *insn) for (i = 0; i < insn->prefixes.nbytes; i++) { switch (insn->prefixes.bytes[i]) { - case 0x26: /*INAT_PFX_ES */ - case 0x2E: /*INAT_PFX_CS */ - case 0x36: /*INAT_PFX_DS */ - case 0x3E: /*INAT_PFX_SS */ - case 0xF0: /*INAT_PFX_LOCK */ + case 0x26: /* INAT_PFX_ES */ + case 0x2E: /* INAT_PFX_CS */ + case 0x36: /* INAT_PFX_DS */ + case 0x3E: /* INAT_PFX_SS */ + case 0xF0: /* INAT_PFX_LOCK */ return true; } } @@ -201,12 +205,15 @@ static int validate_insn_32bits(struct uprobe *uprobe, struct insn *insn) insn_get_opcode(insn); if (is_prefix_bad(insn)) return -ENOTSUPP; + if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_32)) return 0; + if (insn->opcode.nbytes == 2) { if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns)) return 0; } + return -ENOTSUPP; } @@ -282,12 +289,12 @@ static void prepare_fixups(struct uprobe *uprobe, struct insn *insn) * disastrous. * * Some useful facts about rip-relative instructions: - * - There's always a modrm byte. - * - There's never a SIB byte. - * - The displacement is always 4 bytes. + * + * - There's always a modrm byte. + * - There's never a SIB byte. + * - The displacement is always 4 bytes. 
*/ -static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, - struct insn *insn) +static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, struct insn *insn) { u8 *cursor; u8 reg; @@ -342,13 +349,12 @@ static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, } /* Target address = address of next instruction + (signed) offset */ - uprobe->arch_info.rip_rela_target_address = (long)insn->length - + insn->displacement.value; + uprobe->arch_info.rip_rela_target_address = (long)insn->length + insn->displacement.value; + /* Displacement field is gone; slide immediate field (if any) over. */ if (insn->immediate.nbytes) { cursor++; - memmove(cursor, cursor + insn->displacement.nbytes, - insn->immediate.nbytes); + memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes); } return; } @@ -361,8 +367,10 @@ static int validate_insn_64bits(struct uprobe *uprobe, struct insn *insn) insn_get_opcode(insn); if (is_prefix_bad(insn)) return -ENOTSUPP; + if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_64)) return 0; + if (insn->opcode.nbytes == 2) { if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns)) return 0; @@ -370,34 +378,31 @@ static int validate_insn_64bits(struct uprobe *uprobe, struct insn *insn) return -ENOTSUPP; } -static int validate_insn_bits(struct mm_struct *mm, struct uprobe *uprobe, - struct insn *insn) +static int validate_insn_bits(struct mm_struct *mm, struct uprobe *uprobe, struct insn *insn) { if (mm->context.ia32_compat) return validate_insn_32bits(uprobe, insn); return validate_insn_64bits(uprobe, insn); } -#else -static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, - struct insn *insn) +#else /* 32-bit: */ +static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, struct insn *insn) { - return; + /* No RIP-relative addressing on 32-bit */ } -static int validate_insn_bits(struct mm_struct *mm, struct uprobe *uprobe, - struct 
insn *insn) +static int validate_insn_bits(struct mm_struct *mm, struct uprobe *uprobe, struct insn *insn) { return validate_insn_32bits(uprobe, insn); } #endif /* CONFIG_X86_64 */ /** - * analyze_insn - instruction analysis including validity and fixups. + * arch_uprobes_analyze_insn - instruction analysis including validity and fixups. * @mm: the probed address space. * @uprobe: the probepoint information. * Return 0 on success or a -ve number on error. */ -int analyze_insn(struct mm_struct *mm, struct uprobe *uprobe) +int arch_uprobes_analyze_insn(struct mm_struct *mm, struct uprobe *uprobe) { int ret; struct insn insn; @@ -406,7 +411,9 @@ int analyze_insn(struct mm_struct *mm, struct uprobe *uprobe) ret = validate_insn_bits(mm, uprobe, &insn); if (ret != 0) return ret; + handle_riprel_insn(mm, uprobe, &insn); prepare_fixups(uprobe, &insn); + return 0; } diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index f1d13fd140f2..64e45f116b2a 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -1,7 +1,7 @@ #ifndef _LINUX_UPROBES_H #define _LINUX_UPROBES_H /* - * Userspace Probes (UProbes) + * User-space Probes (UProbes) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -40,8 +40,10 @@ struct uprobe_arch_info {}; #define uprobe_opcode_sz sizeof(uprobe_opcode_t) /* flags that denote/change uprobes behaviour */ + /* Have a copy of original instruction */ #define UPROBES_COPY_INSN 0x1 + /* Dont run handlers when first register/ last unregister in progress*/ #define UPROBES_RUN_HANDLER 0x2 @@ -70,27 +72,23 @@ struct uprobe { }; #ifdef CONFIG_UPROBES -extern int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe, - unsigned long vaddr); -extern int __weak set_orig_insn(struct mm_struct *mm, struct uprobe *uprobe, - unsigned long vaddr, bool verify); +extern int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe, unsigned long 
vaddr); +extern int __weak set_orig_insn(struct mm_struct *mm, struct uprobe *uprobe, unsigned long vaddr, bool verify); extern bool __weak is_bkpt_insn(uprobe_opcode_t *insn); -extern int register_uprobe(struct inode *inode, loff_t offset, - struct uprobe_consumer *consumer); -extern void unregister_uprobe(struct inode *inode, loff_t offset, - struct uprobe_consumer *consumer); -extern int mmap_uprobe(struct vm_area_struct *vma); +extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer); +extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer); +extern int uprobe_mmap(struct vm_area_struct *vma); #else /* CONFIG_UPROBES is not defined */ -static inline int register_uprobe(struct inode *inode, loff_t offset, - struct uprobe_consumer *consumer) +static inline int +uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer) { return -ENOSYS; } -static inline void unregister_uprobe(struct inode *inode, loff_t offset, - struct uprobe_consumer *consumer) +static inline void +uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer) { } -static inline int mmap_uprobe(struct vm_area_struct *vma) +static inline int uprobe_mmap(struct vm_area_struct *vma) { return 0; } diff --git a/kernel/uprobes.c b/kernel/uprobes.c index 72e8bb3b52cd..884817f1b0d3 100644 --- a/kernel/uprobes.c +++ b/kernel/uprobes.c @@ -1,5 +1,5 @@ /* - * Userspace Probes (UProbes) + * User-space Probes (UProbes) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -29,24 +29,26 @@ #include /* anon_vma_prepare */ #include /* set_pte_at_notify */ #include /* try_to_free_swap */ + #include static struct rb_root uprobes_tree = RB_ROOT; + static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ #define UPROBES_HASH_SZ 13 + /* serialize (un)register */ static struct 
mutex uprobes_mutex[UPROBES_HASH_SZ]; -#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) %\ - UPROBES_HASH_SZ]) + +#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) /* serialize uprobe->pending_list */ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; -#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) %\ - UPROBES_HASH_SZ]) +#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) /* - * uprobe_events allows us to skip the mmap_uprobe if there are no uprobe + * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe * events active at this time. Probably a fine grained per inode count is * better? */ @@ -58,9 +60,9 @@ static atomic_t uprobe_events = ATOMIC_INIT(0); * vm_area_struct wasnt recommended. */ struct vma_info { - struct list_head probe_list; - struct mm_struct *mm; - loff_t vaddr; + struct list_head probe_list; + struct mm_struct *mm; + loff_t vaddr; }; /* @@ -79,8 +81,7 @@ static bool valid_vma(struct vm_area_struct *vma, bool is_register) if (!is_register) return true; - if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == - (VM_READ|VM_EXEC)) + if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC)) return true; return false; @@ -92,6 +93,7 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) vaddr = vma->vm_start + offset; vaddr -= vma->vm_pgoff << PAGE_SHIFT; + return vaddr; } @@ -105,8 +107,7 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) * * Returns 0 on success, -EFAULT on failure. 
*/ -static int __replace_page(struct vm_area_struct *vma, struct page *page, - struct page *kpage) +static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; @@ -163,7 +164,7 @@ out: */ bool __weak is_bkpt_insn(uprobe_opcode_t *insn) { - return (*insn == UPROBES_BKPT_INSN); + return *insn == UPROBES_BKPT_INSN; } /* @@ -203,6 +204,7 @@ static int write_opcode(struct mm_struct *mm, struct uprobe *uprobe, ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); if (ret <= 0) return ret; + ret = -EINVAL; /* @@ -239,6 +241,7 @@ static int write_opcode(struct mm_struct *mm, struct uprobe *uprobe, vaddr_new = kmap_atomic(new_page); memcpy(vaddr_new, vaddr_old, PAGE_SIZE); + /* poke the new insn in, ASSUMES we don't cross page boundary */ vaddr &= ~PAGE_MASK; BUG_ON(vaddr + uprobe_opcode_sz > PAGE_SIZE); @@ -260,7 +263,8 @@ unlock_out: page_cache_release(new_page); put_out: - put_page(old_page); /* we did a get_page in the beginning */ + put_page(old_page); + return ret; } @@ -276,8 +280,7 @@ put_out: * For mm @mm, read the opcode at @vaddr and store it in @opcode. * Return 0 (success) or a negative errno. 
*/ -static int read_opcode(struct mm_struct *mm, unsigned long vaddr, - uprobe_opcode_t *opcode) +static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t *opcode) { struct page *page; void *vaddr_new; @@ -293,15 +296,18 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, memcpy(opcode, vaddr_new + vaddr, uprobe_opcode_sz); kunmap_atomic(vaddr_new); unlock_page(page); - put_page(page); /* we did a get_user_pages in the beginning */ + + put_page(page); + return 0; } static int is_bkpt_at_addr(struct mm_struct *mm, unsigned long vaddr) { uprobe_opcode_t opcode; - int result = read_opcode(mm, vaddr, &opcode); + int result; + result = read_opcode(mm, vaddr, &opcode); if (result) return result; @@ -320,11 +326,11 @@ static int is_bkpt_at_addr(struct mm_struct *mm, unsigned long vaddr) * For mm @mm, store the breakpoint instruction at @vaddr. * Return 0 (success) or a negative errno. */ -int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe, - unsigned long vaddr) +int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe, unsigned long vaddr) { - int result = is_bkpt_at_addr(mm, vaddr); + int result; + result = is_bkpt_at_addr(mm, vaddr); if (result == 1) return -EEXIST; @@ -344,35 +350,35 @@ int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe, * For mm @mm, restore the original opcode (opcode) at @vaddr. * Return 0 (success) or a negative errno. 
*/ -int __weak set_orig_insn(struct mm_struct *mm, struct uprobe *uprobe, - unsigned long vaddr, bool verify) +int __weak +set_orig_insn(struct mm_struct *mm, struct uprobe *uprobe, unsigned long vaddr, bool verify) { if (verify) { - int result = is_bkpt_at_addr(mm, vaddr); + int result; + result = is_bkpt_at_addr(mm, vaddr); if (!result) return -EINVAL; if (result != 1) return result; } - return write_opcode(mm, uprobe, vaddr, - *(uprobe_opcode_t *)uprobe->insn); + return write_opcode(mm, uprobe, vaddr, *(uprobe_opcode_t *)uprobe->insn); } static int match_uprobe(struct uprobe *l, struct uprobe *r) { if (l->inode < r->inode) return -1; + if (l->inode > r->inode) return 1; - else { - if (l->offset < r->offset) - return -1; - if (l->offset > r->offset) - return 1; - } + if (l->offset < r->offset) + return -1; + + if (l->offset > r->offset) + return 1; return 0; } @@ -391,6 +397,7 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) atomic_inc(&uprobe->ref); return uprobe; } + if (match < 0) n = n->rb_left; else @@ -411,6 +418,7 @@ static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) spin_lock_irqsave(&uprobes_treelock, flags); uprobe = __find_uprobe(inode, offset); spin_unlock_irqrestore(&uprobes_treelock, flags); + return uprobe; } @@ -436,16 +444,18 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) p = &parent->rb_right; } + u = NULL; rb_link_node(&uprobe->rb_node, parent, p); rb_insert_color(&uprobe->rb_node, &uprobes_tree); /* get access + creation ref */ atomic_set(&uprobe->ref, 2); + return u; } /* - * Acquires uprobes_treelock. + * Acquire uprobes_treelock. * Matching uprobe already exists in rbtree; * increment (access refcount) and return the matching uprobe. 
* @@ -460,6 +470,7 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) spin_lock_irqsave(&uprobes_treelock, flags); u = __insert_uprobe(uprobe); spin_unlock_irqrestore(&uprobes_treelock, flags); + return u; } @@ -490,19 +501,22 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) kfree(uprobe); uprobe = cur_uprobe; iput(inode); - } else + } else { atomic_inc(&uprobe_events); + } + return uprobe; } /* Returns the previous consumer */ -static struct uprobe_consumer *add_consumer(struct uprobe *uprobe, - struct uprobe_consumer *consumer) +static struct uprobe_consumer * +consumer_add(struct uprobe *uprobe, struct uprobe_consumer *consumer) { down_write(&uprobe->consumer_rwsem); consumer->next = uprobe->consumers; uprobe->consumers = consumer; up_write(&uprobe->consumer_rwsem); + return consumer->next; } @@ -511,8 +525,7 @@ static struct uprobe_consumer *add_consumer(struct uprobe *uprobe, * Return true if the @consumer is deleted successfully * or return false. 
*/ -static bool del_consumer(struct uprobe *uprobe, - struct uprobe_consumer *consumer) +static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *consumer) { struct uprobe_consumer **con; bool ret = false; @@ -526,6 +539,7 @@ static bool del_consumer(struct uprobe *uprobe, } } up_write(&uprobe->consumer_rwsem); + return ret; } @@ -557,15 +571,15 @@ static int __copy_insn(struct address_space *mapping, memcpy(insn, vaddr + off1, nbytes); kunmap_atomic(vaddr); page_cache_release(page); + return 0; } -static int copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, - unsigned long addr) +static int copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) { struct address_space *mapping; - int bytes; unsigned long nbytes; + int bytes; addr &= ~PAGE_MASK; nbytes = PAGE_SIZE - addr; @@ -605,6 +619,7 @@ static int install_breakpoint(struct mm_struct *mm, struct uprobe *uprobe, return -EEXIST; addr = (unsigned long)vaddr; + if (!(uprobe->flags & UPROBES_COPY_INSN)) { ret = copy_insn(uprobe, vma, addr); if (ret) @@ -613,7 +628,7 @@ static int install_breakpoint(struct mm_struct *mm, struct uprobe *uprobe, if (is_bkpt_insn((uprobe_opcode_t *)uprobe->insn)) return -EEXIST; - ret = analyze_insn(mm, uprobe); + ret = arch_uprobes_analyze_insn(mm, uprobe); if (ret) return ret; @@ -624,8 +639,7 @@ static int install_breakpoint(struct mm_struct *mm, struct uprobe *uprobe, return ret; } -static void remove_breakpoint(struct mm_struct *mm, struct uprobe *uprobe, - loff_t vaddr) +static void remove_breakpoint(struct mm_struct *mm, struct uprobe *uprobe, loff_t vaddr) { set_orig_insn(mm, uprobe, (unsigned long)vaddr, true); } @@ -649,9 +663,11 @@ static struct vma_info *__find_next_vma_info(struct list_head *head, struct prio_tree_iter iter; struct vm_area_struct *vma; struct vma_info *tmpvi; - loff_t vaddr; - unsigned long pgoff = offset >> PAGE_SHIFT; + unsigned long pgoff; int existing_vma; + loff_t vaddr; + + pgoff = offset >> 
PAGE_SHIFT; vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) @@ -659,6 +675,7 @@ static struct vma_info *__find_next_vma_info(struct list_head *head, existing_vma = 0; vaddr = vma_address(vma, offset); + list_for_each_entry(tmpvi, head, probe_list) { if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) { existing_vma = 1; @@ -670,14 +687,15 @@ static struct vma_info *__find_next_vma_info(struct list_head *head, * Another vma needs a probe to be installed. However skip * installing the probe if the vma is about to be unlinked. */ - if (!existing_vma && - atomic_inc_not_zero(&vma->vm_mm->mm_users)) { + if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) { vi->mm = vma->vm_mm; vi->vaddr = vaddr; list_add(&vi->probe_list, head); + return vi; } } + return NULL; } @@ -685,11 +703,12 @@ static struct vma_info *__find_next_vma_info(struct list_head *head, * Iterate in the rmap prio tree and find a vma where a probe has not * yet been inserted. 
*/ -static struct vma_info *find_next_vma_info(struct list_head *head, - loff_t offset, struct address_space *mapping, - bool is_register) +static struct vma_info * +find_next_vma_info(struct list_head *head, loff_t offset, struct address_space *mapping, + bool is_register) { struct vma_info *vi, *retvi; + vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL); if (!vi) return ERR_PTR(-ENOMEM); @@ -700,6 +719,7 @@ static struct vma_info *find_next_vma_info(struct list_head *head, if (!retvi) kfree(vi); + return retvi; } @@ -711,16 +731,23 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) struct vma_info *vi, *tmpvi; struct mm_struct *mm; loff_t vaddr; - int ret = 0; + int ret; mapping = uprobe->inode->i_mapping; INIT_LIST_HEAD(&try_list); - while ((vi = find_next_vma_info(&try_list, uprobe->offset, - mapping, is_register)) != NULL) { + + ret = 0; + + for (;;) { + vi = find_next_vma_info(&try_list, uprobe->offset, mapping, is_register); + if (!vi) + break; + if (IS_ERR(vi)) { ret = PTR_ERR(vi); break; } + mm = vi->mm; down_read(&mm->mmap_sem); vma = find_vma(mm, (unsigned long)vi->vaddr); @@ -755,19 +782,21 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) break; } } + list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) { list_del(&vi->probe_list); kfree(vi); } + return ret; } -static int __register_uprobe(struct uprobe *uprobe) +static int __uprobe_register(struct uprobe *uprobe) { return register_for_each_vma(uprobe, true); } -static void __unregister_uprobe(struct uprobe *uprobe) +static void __uprobe_unregister(struct uprobe *uprobe) { if (!register_for_each_vma(uprobe, false)) delete_uprobe(uprobe); @@ -776,15 +805,15 @@ static void __unregister_uprobe(struct uprobe *uprobe) } /* - * register_uprobe - register a probe + * uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. 
* @consumer: information on howto handle the probe.. * - * Apart from the access refcount, register_uprobe() takes a creation + * Apart from the access refcount, uprobe_register() takes a creation * refcount (thro alloc_uprobe) if and only if this @uprobe is getting * inserted into the rbtree (i.e first consumer for a @inode:@offset - * tuple). Creation refcount stops unregister_uprobe from freeing the + * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @consumer for the @uprobe * unregisters. @@ -792,28 +821,29 @@ static void __unregister_uprobe(struct uprobe *uprobe) * Return errno if it cannot successully install probes * else return 0 (success) */ -int register_uprobe(struct inode *inode, loff_t offset, - struct uprobe_consumer *consumer) +int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer) { struct uprobe *uprobe; - int ret = -EINVAL; + int ret; if (!inode || !consumer || consumer->next) - return ret; + return -EINVAL; if (offset > i_size_read(inode)) - return ret; + return -EINVAL; ret = 0; mutex_lock(uprobes_hash(inode)); uprobe = alloc_uprobe(inode, offset); - if (uprobe && !add_consumer(uprobe, consumer)) { - ret = __register_uprobe(uprobe); + + if (uprobe && !consumer_add(uprobe, consumer)) { + ret = __uprobe_register(uprobe); if (ret) { uprobe->consumers = NULL; - __unregister_uprobe(uprobe); - } else + __uprobe_unregister(uprobe); + } else { uprobe->flags |= UPROBES_RUN_HANDLER; + } } mutex_unlock(uprobes_hash(inode)); @@ -823,15 +853,14 @@ int register_uprobe(struct inode *inode, loff_t offset, } /* - * unregister_uprobe - unregister a already registered probe. + * uprobe_unregister - unregister a already registered probe. * @inode: the file in which the probe has to be removed. * @offset: offset from the start of the file. * @consumer: identify which probe if multiple probes are colocated. 
*/ -void unregister_uprobe(struct inode *inode, loff_t offset, - struct uprobe_consumer *consumer) +void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer) { - struct uprobe *uprobe = NULL; + struct uprobe *uprobe; if (!inode || !consumer) return; @@ -841,15 +870,14 @@ void unregister_uprobe(struct inode *inode, loff_t offset, return; mutex_lock(uprobes_hash(inode)); - if (!del_consumer(uprobe, consumer)) - goto unreg_out; - if (!uprobe->consumers) { - __unregister_uprobe(uprobe); - uprobe->flags &= ~UPROBES_RUN_HANDLER; + if (consumer_del(uprobe, consumer)) { + if (!uprobe->consumers) { + __uprobe_unregister(uprobe); + uprobe->flags &= ~UPROBES_RUN_HANDLER; + } } -unreg_out: mutex_unlock(uprobes_hash(inode)); if (uprobe) put_uprobe(uprobe); @@ -870,6 +898,7 @@ static struct rb_node *find_least_offset_node(struct inode *inode) while (n) { uprobe = rb_entry(n, struct uprobe, rb_node); match = match_uprobe(&u, uprobe); + if (uprobe->inode == inode) close_node = n; @@ -881,6 +910,7 @@ static struct rb_node *find_least_offset_node(struct inode *inode) else n = n->rb_right; } + return close_node; } @@ -890,11 +920,13 @@ static struct rb_node *find_least_offset_node(struct inode *inode) static void build_probe_list(struct inode *inode, struct list_head *head) { struct uprobe *uprobe; - struct rb_node *n; unsigned long flags; + struct rb_node *n; spin_lock_irqsave(&uprobes_treelock, flags); + n = find_least_offset_node(inode); + for (; n; n = rb_next(n)) { uprobe = rb_entry(n, struct uprobe, rb_node); if (uprobe->inode != inode) @@ -903,6 +935,7 @@ static void build_probe_list(struct inode *inode, struct list_head *head) list_add(&uprobe->pending_list, head); atomic_inc(&uprobe->ref); } + spin_unlock_irqrestore(&uprobes_treelock, flags); } @@ -912,42 +945,44 @@ static void build_probe_list(struct inode *inode, struct list_head *head) * * Return -ve no if we fail to insert probes and we cannot * bail-out. - * Return 0 otherwise. 
i.e : + * Return 0 otherwise. i.e: + * * - successful insertion of probes * - (or) no possible probes to be inserted. * - (or) insertion of probes failed but we can bail-out. */ -int mmap_uprobe(struct vm_area_struct *vma) +int uprobe_mmap(struct vm_area_struct *vma) { struct list_head tmp_list; struct uprobe *uprobe, *u; struct inode *inode; - int ret = 0; + int ret; if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) - return ret; /* Bail-out */ + return 0; inode = vma->vm_file->f_mapping->host; if (!inode) - return ret; + return 0; INIT_LIST_HEAD(&tmp_list); mutex_lock(uprobes_mmap_hash(inode)); build_probe_list(inode, &tmp_list); + + ret = 0; + list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { loff_t vaddr; list_del(&uprobe->pending_list); if (!ret) { vaddr = vma_address(vma, uprobe->offset); - if (vaddr < vma->vm_start || vaddr >= vma->vm_end) { - put_uprobe(uprobe); - continue; + if (vaddr >= vma->vm_start && vaddr < vma->vm_end) { + ret = install_breakpoint(vma->vm_mm, uprobe, vma, vaddr); + /* Ignore double add: */ + if (ret == -EEXIST) + ret = 0; } - ret = install_breakpoint(vma->vm_mm, uprobe, vma, - vaddr); - if (ret == -EEXIST) - ret = 0; } put_uprobe(uprobe); } diff --git a/mm/mmap.c b/mm/mmap.c index 1aed183636d7..5a863d328a44 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -618,10 +618,10 @@ again: remove_next = 1 + (end > next->vm_end); mutex_unlock(&mapping->i_mmap_mutex); if (root) { - mmap_uprobe(vma); + uprobe_mmap(vma); if (adjust_next) - mmap_uprobe(next); + uprobe_mmap(next); } if (remove_next) { @@ -646,7 +646,7 @@ again: remove_next = 1 + (end > next->vm_end); } } if (insert && file) - mmap_uprobe(insert); + uprobe_mmap(insert); validate_mm(mm); @@ -1340,7 +1340,7 @@ out: } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) make_pages_present(addr, addr + len); - if (file && mmap_uprobe(vma)) + if (file && uprobe_mmap(vma)) /* matching probes but cannot insert */ goto unmap_and_free_vma; @@ -2301,7 +2301,7 @@ int 
insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) security_vm_enough_memory_mm(mm, vma_pages(vma))) return -ENOMEM; - if (vma->vm_file && mmap_uprobe(vma)) + if (vma->vm_file && uprobe_mmap(vma)) return -EINVAL; vma_link(mm, vma, prev, rb_link, rb_parent); @@ -2374,7 +2374,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma->vm_file) { get_file(new_vma->vm_file); - if (mmap_uprobe(new_vma)) + if (uprobe_mmap(new_vma)) goto out_free_mempol; if (vma->vm_flags & VM_EXECUTABLE) -- cgit v1.2.3 From 04a3d984d32e47983770d314cdb4e4d8f38fccb7 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 22 Feb 2012 14:45:35 +0530 Subject: uprobes/core: Make instruction tables volatile Some versions of gcc spit a warning about the asm operand for test_bit and also cause only the first long of the instruction table to be output. The fix is similar to 7115e3fc on arch/x86/kernel/kprobes.c Signed-off-by: Srikar Dronamraju Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Masami Hiramatsu Cc: Anton Arapov Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Jiri Olsa Cc: Josh Stone Link: http://lkml.kernel.org/r/20120222091535.15880.12502.sendpatchset@srdronam.in.ibm.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/uprobes.c | 61 +++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index cf2a18498425..13d616d6519b 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -53,34 +53,12 @@ (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ << (row % 32)) -#ifdef CONFIG_X86_64 -static u32 good_insns_64[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ---------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */ - W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
0, 0) , /* 10 */ - W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */ - W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */ - W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ - W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ - W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ - W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ - W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ - W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ - W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ - W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ - W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ - W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ - W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ - W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ - /* ---------------------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ -}; -#endif - -/* Good-instruction tables for 32-bit apps */ - -static u32 good_insns_32[256 / 32] = { +/* + * Good-instruction tables for 32-bit apps. This is non-const and volatile + * to keep gcc from statically optimizing it out, as variable_test_bit makes + * some versions of gcc to think only *(unsigned long*) is used. 
+ */ +static volatile u32 good_insns_32[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */ @@ -104,7 +82,7 @@ static u32 good_insns_32[256 / 32] = { }; /* Using this for both 64-bit and 32-bit apps */ -static u32 good_2byte_insns[256 / 32] = { +static volatile u32 good_2byte_insns[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* ---------------------------------------------- */ W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */ @@ -127,6 +105,31 @@ static u32 good_2byte_insns[256 / 32] = { /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; +#ifdef CONFIG_X86_64 +/* Good-instruction tables for 64-bit apps */ +static volatile u32 good_insns_64[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */ + W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */ + W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */ + W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */ + W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ + W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ + W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ + W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ + W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ + W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ + /* 
---------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +#endif #undef W /* -- cgit v1.2.3 From 3ff54efdfaace9e9b2b7c1959a865be6b91de96c Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 22 Feb 2012 14:46:02 +0530 Subject: uprobes/core: Move insn to arch specific structure A few cleanups suggested by Ingo Molnar. - Rename struct uprobe_arch_info to struct arch_uprobe. - Move insn from struct uprobe to struct arch_uprobe. - Make arch-specific uprobe functions accept struct arch_uprobe instead of struct uprobe. - Move struct uprobe to kernel/events/uprobes.c from include/linux/uprobes.h Signed-off-by: Srikar Dronamraju Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Masami Hiramatsu Cc: Anton Arapov Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Jiri Olsa Cc: Josh Stone Link: http://lkml.kernel.org/r/20120222091602.15880.40249.sendpatchset@srdronam.in.ibm.com [ Made various small improvements ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uprobes.h | 6 ++--- arch/x86/kernel/uprobes.c | 60 +++++++++++++++++++++--------------------- include/linux/uprobes.h | 23 ++-------------- kernel/events/uprobes.c | 38 +++++++++++++++++--------- 4 files changed, 61 insertions(+), 66 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 072df3902636..f7ce310a429d 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -31,13 +31,13 @@ typedef u8 uprobe_opcode_t; #define UPROBES_BKPT_INSN 0xcc #define UPROBES_BKPT_INSN_SIZE 1 -struct uprobe_arch_info { +struct arch_uprobe { u16 fixups; + u8 insn[MAX_UINSN_BYTES]; #ifdef CONFIG_X86_64 unsigned long rip_rela_target_address; #endif }; -struct uprobe; -extern int arch_uprobes_analyze_insn(struct mm_struct *mm, struct uprobe *uprobe); +extern int arch_uprobes_analyze_insn(struct mm_struct *mm, struct arch_uprobe *arch_uprobe); #endif /*
_ASM_UPROBES_H */ diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 13d616d6519b..04dfcef2d028 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -200,9 +200,9 @@ static bool is_prefix_bad(struct insn *insn) return false; } -static int validate_insn_32bits(struct uprobe *uprobe, struct insn *insn) +static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn) { - insn_init(insn, uprobe->insn, false); + insn_init(insn, auprobe->insn, false); /* Skip good instruction prefixes; reject "bad" ones. */ insn_get_opcode(insn); @@ -222,11 +222,11 @@ static int validate_insn_32bits(struct uprobe *uprobe, struct insn *insn) /* * Figure out which fixups post_xol() will need to perform, and annotate - * uprobe->arch_info.fixups accordingly. To start with, - * uprobe->arch_info.fixups is either zero or it reflects rip-related + * arch_uprobe->fixups accordingly. To start with, + * arch_uprobe->fixups is either zero or it reflects rip-related * fixups. */ -static void prepare_fixups(struct uprobe *uprobe, struct insn *insn) +static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn) { bool fix_ip = true, fix_call = false; /* defaults */ int reg; @@ -269,17 +269,17 @@ static void prepare_fixups(struct uprobe *uprobe, struct insn *insn) break; } if (fix_ip) - uprobe->arch_info.fixups |= UPROBES_FIX_IP; + auprobe->fixups |= UPROBES_FIX_IP; if (fix_call) - uprobe->arch_info.fixups |= UPROBES_FIX_CALL; + auprobe->fixups |= UPROBES_FIX_CALL; } #ifdef CONFIG_X86_64 /* - * If uprobe->insn doesn't use rip-relative addressing, return + * If arch_uprobe->insn doesn't use rip-relative addressing, return * immediately. Otherwise, rewrite the instruction so that it accesses * its memory operand indirectly through a scratch register. Set - * uprobe->arch_info.fixups and uprobe->arch_info.rip_rela_target_address + * arch_uprobe->fixups and arch_uprobe->rip_rela_target_address * accordingly. 
(The contents of the scratch register will be saved * before we single-step the modified instruction, and restored * afterward.) @@ -297,7 +297,7 @@ static void prepare_fixups(struct uprobe *uprobe, struct insn *insn) * - There's never a SIB byte. * - The displacement is always 4 bytes. */ -static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, struct insn *insn) +static void handle_riprel_insn(struct mm_struct *mm, struct arch_uprobe *auprobe, struct insn *insn) { u8 *cursor; u8 reg; @@ -305,7 +305,7 @@ static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, stru if (mm->context.ia32_compat) return; - uprobe->arch_info.rip_rela_target_address = 0x0; + auprobe->rip_rela_target_address = 0x0; if (!insn_rip_relative(insn)) return; @@ -315,7 +315,7 @@ static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, stru * we want to encode rax/rcx, not r8/r9. */ if (insn->rex_prefix.nbytes) { - cursor = uprobe->insn + insn_offset_rex_prefix(insn); + cursor = auprobe->insn + insn_offset_rex_prefix(insn); *cursor &= 0xfe; /* Clearing REX.B bit */ } @@ -324,7 +324,7 @@ static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, stru * displacement. Beyond the displacement, for some instructions, * is the immediate operand. */ - cursor = uprobe->insn + insn_offset_modrm(insn); + cursor = auprobe->insn + insn_offset_modrm(insn); insn_get_length(insn); /* @@ -341,18 +341,18 @@ static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, stru * is NOT the register operand, so we use %rcx (register * #1) for the scratch register. */ - uprobe->arch_info.fixups = UPROBES_FIX_RIP_CX; + auprobe->fixups = UPROBES_FIX_RIP_CX; /* Change modrm from 00 000 101 to 00 000 001. */ *cursor = 0x1; } else { /* Use %rax (register #0) for the scratch register. 
*/ - uprobe->arch_info.fixups = UPROBES_FIX_RIP_AX; + auprobe->fixups = UPROBES_FIX_RIP_AX; /* Change modrm from 00 xxx 101 to 00 xxx 000 */ *cursor = (reg << 3); } /* Target address = address of next instruction + (signed) offset */ - uprobe->arch_info.rip_rela_target_address = (long)insn->length + insn->displacement.value; + auprobe->rip_rela_target_address = (long)insn->length + insn->displacement.value; /* Displacement field is gone; slide immediate field (if any) over. */ if (insn->immediate.nbytes) { @@ -362,9 +362,9 @@ static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, stru return; } -static int validate_insn_64bits(struct uprobe *uprobe, struct insn *insn) +static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn) { - insn_init(insn, uprobe->insn, true); + insn_init(insn, auprobe->insn, true); /* Skip good instruction prefixes; reject "bad" ones. */ insn_get_opcode(insn); @@ -381,42 +381,42 @@ static int validate_insn_64bits(struct uprobe *uprobe, struct insn *insn) return -ENOTSUPP; } -static int validate_insn_bits(struct mm_struct *mm, struct uprobe *uprobe, struct insn *insn) +static int validate_insn_bits(struct mm_struct *mm, struct arch_uprobe *auprobe, struct insn *insn) { if (mm->context.ia32_compat) - return validate_insn_32bits(uprobe, insn); - return validate_insn_64bits(uprobe, insn); + return validate_insn_32bits(auprobe, insn); + return validate_insn_64bits(auprobe, insn); } #else /* 32-bit: */ -static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe, struct insn *insn) +static void handle_riprel_insn(struct mm_struct *mm, struct arch_uprobe *auprobe, struct insn *insn) { /* No RIP-relative addressing on 32-bit */ } -static int validate_insn_bits(struct mm_struct *mm, struct uprobe *uprobe, struct insn *insn) +static int validate_insn_bits(struct mm_struct *mm, struct arch_uprobe *auprobe, struct insn *insn) { - return validate_insn_32bits(uprobe, insn); + return 
validate_insn_32bits(auprobe, insn); } #endif /* CONFIG_X86_64 */ /** * arch_uprobes_analyze_insn - instruction analysis including validity and fixups. * @mm: the probed address space. - * @uprobe: the probepoint information. + * @arch_uprobe: the probepoint information. * Return 0 on success or a -ve number on error. */ -int arch_uprobes_analyze_insn(struct mm_struct *mm, struct uprobe *uprobe) +int arch_uprobes_analyze_insn(struct mm_struct *mm, struct arch_uprobe *auprobe) { int ret; struct insn insn; - uprobe->arch_info.fixups = 0; - ret = validate_insn_bits(mm, uprobe, &insn); + auprobe->fixups = 0; + ret = validate_insn_bits(mm, auprobe, &insn); if (ret != 0) return ret; - handle_riprel_insn(mm, uprobe, &insn); - prepare_fixups(uprobe, &insn); + handle_riprel_insn(mm, auprobe, &insn); + prepare_fixups(auprobe, &insn); return 0; } diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index fd45b70750d4..9c6be62787ed 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -29,12 +29,6 @@ struct vm_area_struct; #ifdef CONFIG_ARCH_SUPPORTS_UPROBES #include -#else - -typedef u8 uprobe_opcode_t; -struct uprobe_arch_info {}; - -#define MAX_UINSN_BYTES 4 #endif /* flags that denote/change uprobes behaviour */ @@ -56,22 +50,9 @@ struct uprobe_consumer { struct uprobe_consumer *next; }; -struct uprobe { - struct rb_node rb_node; /* node in the rb tree */ - atomic_t ref; - struct rw_semaphore consumer_rwsem; - struct list_head pending_list; - struct uprobe_arch_info arch_info; - struct uprobe_consumer *consumers; - struct inode *inode; /* Also hold a ref to inode */ - loff_t offset; - int flags; - u8 insn[MAX_UINSN_BYTES]; -}; - #ifdef CONFIG_UPROBES -extern int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe, unsigned long vaddr); -extern int __weak set_orig_insn(struct mm_struct *mm, struct uprobe *uprobe, unsigned long vaddr, bool verify); +extern int __weak set_bkpt(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned long 
vaddr); +extern int __weak set_orig_insn(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned long vaddr, bool verify); extern bool __weak is_bkpt_insn(uprobe_opcode_t *insn); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer); extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ee496ad95db3..13f1b5909af4 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -65,6 +65,18 @@ struct vma_info { loff_t vaddr; }; +struct uprobe { + struct rb_node rb_node; /* node in the rb tree */ + atomic_t ref; + struct rw_semaphore consumer_rwsem; + struct list_head pending_list; + struct uprobe_consumer *consumers; + struct inode *inode; /* Also hold a ref to inode */ + loff_t offset; + int flags; + struct arch_uprobe arch; +}; + /* * valid_vma: Verify if the specified vma is an executable vma * Relax restrictions while unregistering: vm_flags might have @@ -180,7 +192,7 @@ bool __weak is_bkpt_insn(uprobe_opcode_t *insn) /* * write_opcode - write the opcode at a given virtual address. * @mm: the probed process address space. - * @uprobe: the breakpointing information. + * @arch_uprobe: the breakpointing information. * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. * @@ -190,13 +202,14 @@ bool __weak is_bkpt_insn(uprobe_opcode_t *insn) * For mm @mm, write the opcode at @vaddr. * Return 0 (success) or a negative errno. 
*/ -static int write_opcode(struct mm_struct *mm, struct uprobe *uprobe, +static int write_opcode(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned long vaddr, uprobe_opcode_t opcode) { struct page *old_page, *new_page; struct address_space *mapping; void *vaddr_old, *vaddr_new; struct vm_area_struct *vma; + struct uprobe *uprobe; loff_t addr; int ret; @@ -216,6 +229,7 @@ static int write_opcode(struct mm_struct *mm, struct uprobe *uprobe, if (!valid_vma(vma, is_bkpt_insn(&opcode))) goto put_out; + uprobe = container_of(auprobe, struct uprobe, arch); mapping = uprobe->inode->i_mapping; if (mapping != vma->vm_file->f_mapping) goto put_out; @@ -326,7 +340,7 @@ static int is_bkpt_at_addr(struct mm_struct *mm, unsigned long vaddr) * For mm @mm, store the breakpoint instruction at @vaddr. * Return 0 (success) or a negative errno. */ -int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe, unsigned long vaddr) +int __weak set_bkpt(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned long vaddr) { int result; @@ -337,7 +351,7 @@ int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe, unsigned long v if (result) return result; - return write_opcode(mm, uprobe, vaddr, UPROBES_BKPT_INSN); + return write_opcode(mm, auprobe, vaddr, UPROBES_BKPT_INSN); } /** @@ -351,7 +365,7 @@ int __weak set_bkpt(struct mm_struct *mm, struct uprobe *uprobe, unsigned long v * Return 0 (success) or a negative errno. 
*/ int __weak -set_orig_insn(struct mm_struct *mm, struct uprobe *uprobe, unsigned long vaddr, bool verify) +set_orig_insn(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned long vaddr, bool verify) { if (verify) { int result; @@ -363,7 +377,7 @@ set_orig_insn(struct mm_struct *mm, struct uprobe *uprobe, unsigned long vaddr, if (result != 1) return result; } - return write_opcode(mm, uprobe, vaddr, *(uprobe_opcode_t *)uprobe->insn); + return write_opcode(mm, auprobe, vaddr, *(uprobe_opcode_t *)auprobe->insn); } static int match_uprobe(struct uprobe *l, struct uprobe *r) @@ -593,13 +607,13 @@ static int copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned /* Instruction at the page-boundary; copy bytes in second page */ if (nbytes < bytes) { - if (__copy_insn(mapping, vma, uprobe->insn + nbytes, + if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes, bytes - nbytes, uprobe->offset + nbytes)) return -ENOMEM; bytes = nbytes; } - return __copy_insn(mapping, vma, uprobe->insn, bytes, uprobe->offset); + return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset); } static int install_breakpoint(struct mm_struct *mm, struct uprobe *uprobe, @@ -625,23 +639,23 @@ static int install_breakpoint(struct mm_struct *mm, struct uprobe *uprobe, if (ret) return ret; - if (is_bkpt_insn((uprobe_opcode_t *)uprobe->insn)) + if (is_bkpt_insn((uprobe_opcode_t *)uprobe->arch.insn)) return -EEXIST; - ret = arch_uprobes_analyze_insn(mm, uprobe); + ret = arch_uprobes_analyze_insn(mm, &uprobe->arch); if (ret) return ret; uprobe->flags |= UPROBES_COPY_INSN; } - ret = set_bkpt(mm, uprobe, addr); + ret = set_bkpt(mm, &uprobe->arch, addr); return ret; } static void remove_breakpoint(struct mm_struct *mm, struct uprobe *uprobe, loff_t vaddr) { - set_orig_insn(mm, uprobe, (unsigned long)vaddr, true); + set_orig_insn(mm, &uprobe->arch, (unsigned long)vaddr, true); } static void delete_uprobe(struct uprobe *uprobe) -- cgit v1.2.3 From 
722bc6b16771ed80871e1fd81c86d3627dda2ac8 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 5 Mar 2012 15:05:13 -0800 Subject: x86/mm: Fix the size calculation of mapping tables For machines that enable PSE, the first 2/4M memory region still uses 4K pages, so more PTEs are needed in this case, but find_early_table_space() doesn't account for them. This patch fixes it. The bug was found via code review; no misbehavior of the kernel was observed. Signed-off-by: WANG Cong Cc: Yinghai Lu Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/n/tip-kq6a00qe33h7c7ais2xsywnh@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6cabf6570d64..2e92fdcbea86 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -30,8 +30,14 @@ int direct_gbpages #endif ; -static void __init find_early_table_space(unsigned long end, int use_pse, - int use_gbpages) +struct map_range { + unsigned long start; + unsigned long end; + unsigned page_size_mask; +}; + +static void __init find_early_table_space(struct map_range *mr, unsigned long end, + int use_pse, int use_gbpages) { unsigned long puds, pmds, ptes, tables, start = 0, good_end = end; phys_addr_t base; @@ -56,6 +62,9 @@ static void __init find_early_table_space(unsigned long end, int use_pse, #ifdef CONFIG_X86_32 extra += PMD_SIZE; #endif + /* The first 2/4M doesn't use large pages.
*/ + extra += mr->end - mr->start; + ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; } else ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -85,12 +94,6 @@ void __init native_pagetable_reserve(u64 start, u64 end) memblock_reserve(start, end - start); } -struct map_range { - unsigned long start; - unsigned long end; - unsigned page_size_mask; -}; - #ifdef CONFIG_X86_32 #define NR_RANGE_MR 3 #else /* CONFIG_X86_64 */ @@ -262,7 +265,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * nodes are discovered. */ if (!after_bootmem) - find_early_table_space(end, use_pse, use_gbpages); + find_early_table_space(&mr[0], end, use_pse, use_gbpages); for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, -- cgit v1.2.3 From 510419435c6948fb32959d691bf84eaba41ca474 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 15 Dec 2011 17:56:36 +0100 Subject: perf/x86: Implement IBS event configuration This patch implements perf configuration for AMD IBS. The IBS pmu is selected using the type attribute in sysfs. There are two types of ibs pmus, for instruction fetch (IBS_FETCH) and for instruction execution (IBS_OP): /sys/bus/event_source/devices/ibs_fetch/type /sys/bus/event_source/devices/ibs_op/type Except for the sample period IBS can only be set up with raw config values and raw data samples. The event attributes for the syscall should be programmed like this (IBS_FETCH): type = get_pmu_type("/sys/bus/event_source/devices/ibs_fetch/type"); memset(&attr, 0, sizeof(attr)); attr.type = type; attr.sample_type = PERF_SAMPLE_CPU | PERF_SAMPLE_RAW; attr.config = IBS_FETCH_CONFIG_DEFAULT; This implementation does not yet support 64 bit counters. It is limited to the hardware counter bit width which is 20 bits. 64 bit support can be added later. 
Signed-off-by: Robert Richter Acked-by: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1323968199-9326-2-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 92 +++++++++++++++++++++++++++++--- 1 file changed, 85 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 3b8a2d30d14e..36684eb248de 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -16,12 +16,67 @@ static u32 ibs_caps; #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) -static struct pmu perf_ibs; +#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) +#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT + +struct perf_ibs { + struct pmu pmu; + unsigned int msr; + u64 config_mask; + u64 cnt_mask; + u64 enable_mask; +}; + +static struct perf_ibs perf_ibs_fetch; +static struct perf_ibs perf_ibs_op; + +static struct perf_ibs *get_ibs_pmu(int type) +{ + if (perf_ibs_fetch.pmu.type == type) + return &perf_ibs_fetch; + if (perf_ibs_op.pmu.type == type) + return &perf_ibs_op; + return NULL; +} static int perf_ibs_init(struct perf_event *event) { - if (perf_ibs.type != event->attr.type) + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs; + u64 max_cnt, config; + + perf_ibs = get_ibs_pmu(event->attr.type); + if (!perf_ibs) return -ENOENT; + + config = event->attr.config; + if (config & ~perf_ibs->config_mask) + return -EINVAL; + + if (hwc->sample_period) { + if (config & perf_ibs->cnt_mask) + /* raw max_cnt may not be set */ + return -EINVAL; + if (hwc->sample_period & 0x0f) + /* lower 4 bits can not be set in ibs max cnt */ + return -EINVAL; + max_cnt = hwc->sample_period >> 4; + if (max_cnt & ~perf_ibs->cnt_mask) + /* out of range */ + return -EINVAL; + config |= max_cnt; + } else { + max_cnt = config & perf_ibs->cnt_mask; + 
event->attr.sample_period = max_cnt << 4; + hwc->sample_period = event->attr.sample_period; + } + + if (!max_cnt) + return -EINVAL; + + hwc->config_base = perf_ibs->msr; + hwc->config = config; + return 0; } @@ -34,10 +89,32 @@ static void perf_ibs_del(struct perf_event *event, int flags) { } -static struct pmu perf_ibs = { - .event_init= perf_ibs_init, - .add= perf_ibs_add, - .del= perf_ibs_del, +static struct perf_ibs perf_ibs_fetch = { + .pmu = { + .task_ctx_nr = perf_invalid_context, + + .event_init = perf_ibs_init, + .add = perf_ibs_add, + .del = perf_ibs_del, + }, + .msr = MSR_AMD64_IBSFETCHCTL, + .config_mask = IBS_FETCH_CONFIG_MASK, + .cnt_mask = IBS_FETCH_MAX_CNT, + .enable_mask = IBS_FETCH_ENABLE, +}; + +static struct perf_ibs perf_ibs_op = { + .pmu = { + .task_ctx_nr = perf_invalid_context, + + .event_init = perf_ibs_init, + .add = perf_ibs_add, + .del = perf_ibs_del, + }, + .msr = MSR_AMD64_IBSOPCTL, + .config_mask = IBS_OP_CONFIG_MASK, + .cnt_mask = IBS_OP_MAX_CNT, + .enable_mask = IBS_OP_ENABLE, }; static __init int perf_event_ibs_init(void) @@ -45,7 +122,8 @@ static __init int perf_event_ibs_init(void) if (!ibs_caps) return -ENODEV; /* ibs not supported by the cpu */ - perf_pmu_register(&perf_ibs, "ibs", -1); + perf_pmu_register(&perf_ibs_fetch.pmu, "ibs_fetch", -1); + perf_pmu_register(&perf_ibs_op.pmu, "ibs_op", -1); printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); return 0; -- cgit v1.2.3 From b7074f1fbd6149eac1ec25063e4a364c39a85473 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 15 Dec 2011 17:56:37 +0100 Subject: perf/x86: Implement IBS interrupt handler This patch implements code to handle ibs interrupts. If ibs data is available a raw perf_event data sample is created and sent back to the userland. This patch only implements the storage of ibs data in the raw sample, but this could be extended in a later patch by generating generic event data such as the rip from the ibs sampling data. 
Signed-off-by: Robert Richter Acked-by: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1323968199-9326-3-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/msr-index.h | 5 ++ arch/x86/kernel/cpu/perf_event_amd_ibs.c | 84 ++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index a6962d9161a0..4e3cd382a06f 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -127,6 +127,8 @@ #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 +#define MSR_AMD64_IBSFETCH_REG_COUNT 3 +#define MSR_AMD64_IBSFETCH_REG_MASK ((1UL< +#include + +#include + #define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) #define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT @@ -25,6 +30,18 @@ struct perf_ibs { u64 config_mask; u64 cnt_mask; u64 enable_mask; + u64 valid_mask; + unsigned long offset_mask[1]; + int offset_max; +}; + +struct perf_ibs_data { + u32 size; + union { + u32 data[0]; /* data buffer starts here */ + u32 caps; + }; + u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; }; static struct perf_ibs perf_ibs_fetch; @@ -101,6 +118,9 @@ static struct perf_ibs perf_ibs_fetch = { .config_mask = IBS_FETCH_CONFIG_MASK, .cnt_mask = IBS_FETCH_MAX_CNT, .enable_mask = IBS_FETCH_ENABLE, + .valid_mask = IBS_FETCH_VAL, + .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, + .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, }; static struct perf_ibs perf_ibs_op = { @@ -115,8 +135,71 @@ static struct perf_ibs perf_ibs_op = { .config_mask = IBS_OP_CONFIG_MASK, .cnt_mask = IBS_OP_MAX_CNT, .enable_mask = IBS_OP_ENABLE, + .valid_mask = IBS_OP_VAL, + .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, + .offset_max = MSR_AMD64_IBSOP_REG_COUNT, }; +static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) +{ + struct perf_event 
*event = NULL; + struct hw_perf_event *hwc = &event->hw; + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + struct perf_ibs_data ibs_data; + int offset, size; + unsigned int msr; + u64 *buf; + + msr = hwc->config_base; + buf = ibs_data.regs; + rdmsrl(msr, *buf); + if (!(*buf++ & perf_ibs->valid_mask)) + return 0; + + perf_sample_data_init(&data, 0); + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + ibs_data.caps = ibs_caps; + size = 1; + offset = 1; + do { + rdmsrl(msr + offset, *buf++); + size++; + offset = find_next_bit(perf_ibs->offset_mask, + perf_ibs->offset_max, + offset + 1); + } while (offset < perf_ibs->offset_max); + raw.size = sizeof(u32) + sizeof(u64) * size; + raw.data = ibs_data.data; + data.raw = &raw; + } + + regs = *iregs; /* XXX: update ip from ibs sample */ + + if (perf_event_overflow(event, &data, &regs)) + ; /* stop */ + else + /* reenable */ + wrmsrl(hwc->config_base, hwc->config | perf_ibs->enable_mask); + + return 1; +} + +static int __kprobes +perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) +{ + int handled = 0; + + handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs); + handled += perf_ibs_handle_irq(&perf_ibs_op, regs); + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} + static __init int perf_event_ibs_init(void) { if (!ibs_caps) @@ -124,6 +207,7 @@ static __init int perf_event_ibs_init(void) perf_pmu_register(&perf_ibs_fetch.pmu, "ibs_fetch", -1); perf_pmu_register(&perf_ibs_op.pmu, "ibs_op", -1); + register_nmi_handler(NMI_LOCAL, &perf_ibs_nmi_handler, 0, "perf_ibs"); printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); return 0; -- cgit v1.2.3 From 4db2e8e6500d9ba6406f2714fa3968b39a325274 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 15 Dec 2011 17:56:38 +0100 Subject: perf/x86: Implement IBS pmu control ops Add code to control the IBS pmu. We need to maintain per-cpu states.
Since some states are used and changed by the nmi handler, access to these states must be atomic. Signed-off-by: Robert Richter Acked-by: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1323968199-9326-4-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 106 ++++++++++++++++++++++++++++++- 1 file changed, 103 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index a7ec6bdf0a63..40a6d9d5dd23 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -24,6 +24,19 @@ static u32 ibs_caps; #define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) #define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT +enum ibs_states { + IBS_ENABLED = 0, + IBS_STARTED = 1, + IBS_STOPPING = 2, + + IBS_MAX_STATES, +}; + +struct cpu_perf_ibs { + struct perf_event *event; + unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)]; +}; + struct perf_ibs { struct pmu pmu; unsigned int msr; @@ -33,6 +46,7 @@ struct perf_ibs { u64 valid_mask; unsigned long offset_mask[1]; int offset_max; + struct cpu_perf_ibs __percpu *pcpu; }; struct perf_ibs_data { @@ -97,15 +111,66 @@ static int perf_ibs_init(struct perf_event *event) return 0; } +static void perf_ibs_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + + if (test_and_set_bit(IBS_STARTED, pcpu->state)) + return; + + wrmsrl(hwc->config_base, hwc->config | perf_ibs->enable_mask); +} + +static void perf_ibs_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + u64 val; + + if 
(!test_and_clear_bit(IBS_STARTED, pcpu->state)) + return; + + set_bit(IBS_STOPPING, pcpu->state); + + rdmsrl(hwc->config_base, val); + val &= ~perf_ibs->enable_mask; + wrmsrl(hwc->config_base, val); +} + static int perf_ibs_add(struct perf_event *event, int flags) { + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + + if (test_and_set_bit(IBS_ENABLED, pcpu->state)) + return -ENOSPC; + + pcpu->event = event; + + if (flags & PERF_EF_START) + perf_ibs_start(event, PERF_EF_RELOAD); + return 0; } static void perf_ibs_del(struct perf_event *event, int flags) { + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + + if (!test_and_clear_bit(IBS_ENABLED, pcpu->state)) + return; + + perf_ibs_stop(event, 0); + + pcpu->event = NULL; } +static void perf_ibs_read(struct perf_event *event) { } + static struct perf_ibs perf_ibs_fetch = { .pmu = { .task_ctx_nr = perf_invalid_context, @@ -113,6 +178,9 @@ static struct perf_ibs perf_ibs_fetch = { .event_init = perf_ibs_init, .add = perf_ibs_add, .del = perf_ibs_del, + .start = perf_ibs_start, + .stop = perf_ibs_stop, + .read = perf_ibs_read, }, .msr = MSR_AMD64_IBSFETCHCTL, .config_mask = IBS_FETCH_CONFIG_MASK, @@ -130,6 +198,9 @@ static struct perf_ibs perf_ibs_op = { .event_init = perf_ibs_init, .add = perf_ibs_add, .del = perf_ibs_del, + .start = perf_ibs_start, + .stop = perf_ibs_stop, + .read = perf_ibs_read, }, .msr = MSR_AMD64_IBSOPCTL, .config_mask = IBS_OP_CONFIG_MASK, @@ -142,7 +213,8 @@ static struct perf_ibs perf_ibs_op = { static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) { - struct perf_event *event = NULL; + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + struct perf_event *event = pcpu->event; struct hw_perf_event *hwc = &event->hw; struct perf_sample_data data; struct perf_raw_record raw; @@ -152,6 
+224,14 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) unsigned int msr; u64 *buf; + if (!test_bit(IBS_STARTED, pcpu->state)) { + /* Catch spurious interrupts after stopping IBS: */ + if (!test_and_clear_bit(IBS_STOPPING, pcpu->state)) + return 0; + rdmsrl(perf_ibs->msr, *ibs_data.regs); + return (*ibs_data.regs & perf_ibs->valid_mask) ? 1 : 0; + } + msr = hwc->config_base; buf = ibs_data.regs; rdmsrl(msr, *buf); @@ -200,13 +280,33 @@ perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) return handled; } +static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) +{ + struct cpu_perf_ibs __percpu *pcpu; + int ret; + + pcpu = alloc_percpu(struct cpu_perf_ibs); + if (!pcpu) + return -ENOMEM; + + perf_ibs->pcpu = pcpu; + + ret = perf_pmu_register(&perf_ibs->pmu, name, -1); + if (ret) { + perf_ibs->pcpu = NULL; + free_percpu(pcpu); + } + + return ret; +} + static __init int perf_event_ibs_init(void) { if (!ibs_caps) return -ENODEV; /* ibs not supported by the cpu */ - perf_pmu_register(&perf_ibs_fetch.pmu, "ibs_fetch", -1); - perf_pmu_register(&perf_ibs_op.pmu, "ibs_op", -1); + perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); + perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); register_nmi_handler(NMI_LOCAL, &perf_ibs_nmi_handler, 0, "perf_ibs"); printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); -- cgit v1.2.3 From db98c5faf8cb350212ea3af786cb3ba0d4e7a01e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 15 Dec 2011 17:56:39 +0100 Subject: perf/x86: Implement 64-bit counter support for IBS This patch implements 64 bit counter support for IBS. The sampling period is no longer limited to the hw counter width. The functions perf_event_set_period() and perf_event_try_update() can be used as generic functions. They can replace similar code that is duplicate across architectures. 
Signed-off-by: Robert Richter Acked-by: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1323968199-9326-5-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 2 + arch/x86/kernel/cpu/perf_event_amd_ibs.c | 204 +++++++++++++++++++++++++++---- 2 files changed, 185 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index e8fb2c7a5f4f..9cf66965141d 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -177,6 +177,8 @@ struct x86_pmu_capability { #define IBS_FETCH_MAX_CNT 0x0000FFFFULL /* IbsOpCtl bits */ +/* lower 4 bits of the current count are ignored: */ +#define IBS_OP_CUR_CNT (0xFFFF0ULL<<32) #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 40a6d9d5dd23..573d24873459 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -44,9 +44,11 @@ struct perf_ibs { u64 cnt_mask; u64 enable_mask; u64 valid_mask; + u64 max_period; unsigned long offset_mask[1]; int offset_max; struct cpu_perf_ibs __percpu *pcpu; + u64 (*get_count)(u64 config); }; struct perf_ibs_data { @@ -58,6 +60,78 @@ struct perf_ibs_data { u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; }; +static int +perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *count) +{ + s64 left = local64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int overflow = 0; + + /* + * If we are way outside a reasonable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } + + if (unlikely(left <= 0)) { + left += period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 
1; + } + + if (unlikely(left < min)) + left = min; + + if (left > max) + left = max; + + *count = (u64)left; + + return overflow; +} + +static int +perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width) +{ + struct hw_perf_event *hwc = &event->hw; + int shift = 64 - width; + u64 prev_raw_count; + u64 delta; + + /* + * Careful: an NMI might modify the previous event value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic event atomically: + */ + prev_raw_count = local64_read(&hwc->prev_count); + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + return 0; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); + + return 1; +} + static struct perf_ibs perf_ibs_fetch; static struct perf_ibs perf_ibs_op; @@ -91,18 +165,14 @@ static int perf_ibs_init(struct perf_event *event) if (hwc->sample_period & 0x0f) /* lower 4 bits can not be set in ibs max cnt */ return -EINVAL; - max_cnt = hwc->sample_period >> 4; - if (max_cnt & ~perf_ibs->cnt_mask) - /* out of range */ - return -EINVAL; - config |= max_cnt; } else { max_cnt = config & perf_ibs->cnt_mask; + config &= ~perf_ibs->cnt_mask; event->attr.sample_period = max_cnt << 4; hwc->sample_period = event->attr.sample_period; } - if (!max_cnt) + if (!hwc->sample_period) return -EINVAL; hwc->config_base = perf_ibs->msr; @@ -111,16 +181,71 @@ static int perf_ibs_init(struct perf_event *event) return 0; } +static int perf_ibs_set_period(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 
*period) +{ + int ret; + + /* ignore lower 4 bits in min count: */ + ret = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); + local64_set(&hwc->prev_count, 0); + + return ret; +} + +static u64 get_ibs_fetch_count(u64 config) +{ + return (config & IBS_FETCH_CNT) >> 12; +} + +static u64 get_ibs_op_count(u64 config) +{ + return (config & IBS_OP_CUR_CNT) >> 32; +} + +static void +perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, + u64 config) +{ + u64 count = perf_ibs->get_count(config); + + while (!perf_event_try_update(event, count, 20)) { + rdmsrl(event->hw.config_base, config); + count = perf_ibs->get_count(config); + } +} + +/* Note: The enable mask must be encoded in the config argument. */ +static inline void perf_ibs_enable_event(struct hw_perf_event *hwc, u64 config) +{ + wrmsrl(hwc->config_base, hwc->config | config); +} + +/* + * We cannot restore the ibs pmu state, so we always needs to update + * the event while stopping it and then reset the state when starting + * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in + * perf_ibs_start()/perf_ibs_stop() and instead always do it. 
+ */ static void perf_ibs_start(struct perf_event *event, int flags) { struct hw_perf_event *hwc = &event->hw; struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + u64 config; - if (test_and_set_bit(IBS_STARTED, pcpu->state)) + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) return; - wrmsrl(hwc->config_base, hwc->config | perf_ibs->enable_mask); + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + hwc->state = 0; + + perf_ibs_set_period(perf_ibs, hwc, &config); + config = (config >> 4) | perf_ibs->enable_mask; + set_bit(IBS_STARTED, pcpu->state); + perf_ibs_enable_event(hwc, config); + + perf_event_update_userpage(event); } static void perf_ibs_stop(struct perf_event *event, int flags) @@ -129,15 +254,28 @@ static void perf_ibs_stop(struct perf_event *event, int flags) struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); u64 val; + int stopping; - if (!test_and_clear_bit(IBS_STARTED, pcpu->state)) - return; + stopping = test_and_clear_bit(IBS_STARTED, pcpu->state); - set_bit(IBS_STOPPING, pcpu->state); + if (!stopping && (hwc->state & PERF_HES_UPTODATE)) + return; rdmsrl(hwc->config_base, val); - val &= ~perf_ibs->enable_mask; - wrmsrl(hwc->config_base, val); + + if (stopping) { + set_bit(IBS_STOPPING, pcpu->state); + val &= ~perf_ibs->enable_mask; + wrmsrl(hwc->config_base, val); + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + if (hwc->state & PERF_HES_UPTODATE) + return; + + perf_ibs_event_update(perf_ibs, event, val); + hwc->state |= PERF_HES_UPTODATE; } static int perf_ibs_add(struct perf_event *event, int flags) @@ -148,6 +286,8 @@ static int perf_ibs_add(struct perf_event *event, int flags) if (test_and_set_bit(IBS_ENABLED, pcpu->state)) return -ENOSPC; + event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + pcpu->event = event; if (flags & 
PERF_EF_START) @@ -164,9 +304,11 @@ static void perf_ibs_del(struct perf_event *event, int flags) if (!test_and_clear_bit(IBS_ENABLED, pcpu->state)) return; - perf_ibs_stop(event, 0); + perf_ibs_stop(event, PERF_EF_UPDATE); pcpu->event = NULL; + + perf_event_update_userpage(event); } static void perf_ibs_read(struct perf_event *event) { } @@ -187,8 +329,11 @@ static struct perf_ibs perf_ibs_fetch = { .cnt_mask = IBS_FETCH_MAX_CNT, .enable_mask = IBS_FETCH_ENABLE, .valid_mask = IBS_FETCH_VAL, + .max_period = IBS_FETCH_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, + + .get_count = get_ibs_fetch_count, }; static struct perf_ibs perf_ibs_op = { @@ -207,8 +352,11 @@ static struct perf_ibs perf_ibs_op = { .cnt_mask = IBS_OP_MAX_CNT, .enable_mask = IBS_OP_ENABLE, .valid_mask = IBS_OP_VAL, + .max_period = IBS_OP_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, .offset_max = MSR_AMD64_IBSOP_REG_COUNT, + + .get_count = get_ibs_op_count, }; static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) @@ -220,9 +368,9 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) struct perf_raw_record raw; struct pt_regs regs; struct perf_ibs_data ibs_data; - int offset, size; + int offset, size, overflow, reenable; unsigned int msr; - u64 *buf; + u64 *buf, config; if (!test_bit(IBS_STARTED, pcpu->state)) { /* Catch spurious interrupts after stopping IBS: */ @@ -257,11 +405,25 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) regs = *iregs; /* XXX: update ip from ibs sample */ - if (perf_event_overflow(event, &data, &regs)) - ; /* stop */ - else - /* reenable */ - wrmsrl(hwc->config_base, hwc->config | perf_ibs->enable_mask); + /* + * Emulate IbsOpCurCnt in MSRC001_1033 (IbsOpCtl), not + * supported in all cpus. As this triggered an interrupt, we + * set the current count to the max count.
+ */ + config = ibs_data.regs[0]; + if (perf_ibs == &perf_ibs_op && !(ibs_caps & IBS_CAPS_RDWROPCNT)) { + config &= ~IBS_OP_CUR_CNT; + config |= (config & IBS_OP_MAX_CNT) << 36; + } + + perf_ibs_event_update(perf_ibs, event, config); + + overflow = perf_ibs_set_period(perf_ibs, hwc, &config); + reenable = !(overflow && perf_event_overflow(event, &data, &regs)); + config = (config >> 4) | (reenable ? perf_ibs->enable_mask : 0); + perf_ibs_enable_event(hwc, config); + + perf_event_update_userpage(event); return 1; } -- cgit v1.2.3 From 900771a483ef28915a48066d7895d8252315607a Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Mon, 12 Mar 2012 14:55:14 +0530 Subject: uprobes/core: Make macro names consistent Rename macros that refer to individual uprobe to start with UPROBE_ instead of UPROBES_. This is pure cleanup, no functional change intended. Signed-off-by: Srikar Dronamraju Cc: Linus Torvalds Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Linux-mm Cc: Oleg Nesterov Cc: Andi Kleen Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120312092514.5379.36595.sendpatchset@srdronam.in.ibm.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uprobes.h | 6 +++--- arch/x86/kernel/uprobes.c | 18 +++++++++--------- include/linux/uprobes.h | 4 ++-- kernel/events/uprobes.c | 18 +++++++++--------- 4 files changed, 23 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index f7ce310a429d..5c399e446512 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -26,10 +26,10 @@ typedef u8 uprobe_opcode_t; #define MAX_UINSN_BYTES 16 -#define UPROBES_XOL_SLOT_BYTES 128 /* to keep it cache aligned */ +#define UPROBE_XOL_SLOT_BYTES 128 /* to keep it cache aligned */ -#define UPROBES_BKPT_INSN 0xcc -#define UPROBES_BKPT_INSN_SIZE 1 +#define UPROBE_BKPT_INSN 0xcc +#define
UPROBE_BKPT_INSN_SIZE 1 struct arch_uprobe { u16 fixups; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 04dfcef2d028..6dfa89e6f24a 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -31,14 +31,14 @@ /* Post-execution fixups. */ /* No fixup needed */ -#define UPROBES_FIX_NONE 0x0 +#define UPROBE_FIX_NONE 0x0 /* Adjust IP back to vicinity of actual insn */ -#define UPROBES_FIX_IP 0x1 +#define UPROBE_FIX_IP 0x1 /* Adjust the return address of a call insn */ -#define UPROBES_FIX_CALL 0x2 +#define UPROBE_FIX_CALL 0x2 -#define UPROBES_FIX_RIP_AX 0x8000 -#define UPROBES_FIX_RIP_CX 0x4000 +#define UPROBE_FIX_RIP_AX 0x8000 +#define UPROBE_FIX_RIP_CX 0x4000 /* Adaptations for mhiramat x86 decoder v14. */ #define OPCODE1(insn) ((insn)->opcode.bytes[0]) @@ -269,9 +269,9 @@ static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn) break; } if (fix_ip) - auprobe->fixups |= UPROBES_FIX_IP; + auprobe->fixups |= UPROBE_FIX_IP; if (fix_call) - auprobe->fixups |= UPROBES_FIX_CALL; + auprobe->fixups |= UPROBE_FIX_CALL; } #ifdef CONFIG_X86_64 @@ -341,12 +341,12 @@ static void handle_riprel_insn(struct mm_struct *mm, struct arch_uprobe *auprobe * is NOT the register operand, so we use %rcx (register * #1) for the scratch register. */ - auprobe->fixups = UPROBES_FIX_RIP_CX; + auprobe->fixups = UPROBE_FIX_RIP_CX; /* Change modrm from 00 000 101 to 00 000 001. */ *cursor = 0x1; } else { /* Use %rax (register #0) for the scratch register. 
*/ - auprobe->fixups = UPROBES_FIX_RIP_AX; + auprobe->fixups = UPROBE_FIX_RIP_AX; /* Change modrm from 00 xxx 101 to 00 xxx 000 */ *cursor = (reg << 3); } diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index f85797e1ccd4..838fb312926a 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -35,10 +35,10 @@ struct vm_area_struct; /* flags that denote/change uprobes behaviour */ /* Have a copy of original instruction */ -#define UPROBES_COPY_INSN 0x1 +#define UPROBE_COPY_INSN 0x1 /* Dont run handlers when first register/ last unregister in progress*/ -#define UPROBES_RUN_HANDLER 0x2 +#define UPROBE_RUN_HANDLER 0x2 struct uprobe_consumer { int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 5ce32e3ae9e9..0d36bf3920ba 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -177,7 +177,7 @@ out: */ bool __weak is_bkpt_insn(uprobe_opcode_t *insn) { - return *insn == UPROBES_BKPT_INSN; + return *insn == UPROBE_BKPT_INSN; } /* @@ -259,8 +259,8 @@ static int write_opcode(struct mm_struct *mm, struct arch_uprobe *auprobe, /* poke the new insn in, ASSUMES we don't cross page boundary */ vaddr &= ~PAGE_MASK; - BUG_ON(vaddr + UPROBES_BKPT_INSN_SIZE > PAGE_SIZE); - memcpy(vaddr_new + vaddr, &opcode, UPROBES_BKPT_INSN_SIZE); + BUG_ON(vaddr + UPROBE_BKPT_INSN_SIZE > PAGE_SIZE); + memcpy(vaddr_new + vaddr, &opcode, UPROBE_BKPT_INSN_SIZE); kunmap_atomic(vaddr_new); kunmap_atomic(vaddr_old); @@ -308,7 +308,7 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_ lock_page(page); vaddr_new = kmap_atomic(page); vaddr &= ~PAGE_MASK; - memcpy(opcode, vaddr_new + vaddr, UPROBES_BKPT_INSN_SIZE); + memcpy(opcode, vaddr_new + vaddr, UPROBE_BKPT_INSN_SIZE); kunmap_atomic(vaddr_new); unlock_page(page); @@ -352,7 +352,7 @@ int __weak set_bkpt(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned if (result) return result; - return 
write_opcode(mm, auprobe, vaddr, UPROBES_BKPT_INSN); + return write_opcode(mm, auprobe, vaddr, UPROBE_BKPT_INSN); } /** @@ -635,7 +635,7 @@ static int install_breakpoint(struct mm_struct *mm, struct uprobe *uprobe, addr = (unsigned long)vaddr; - if (!(uprobe->flags & UPROBES_COPY_INSN)) { + if (!(uprobe->flags & UPROBE_COPY_INSN)) { ret = copy_insn(uprobe, vma, addr); if (ret) return ret; @@ -647,7 +647,7 @@ static int install_breakpoint(struct mm_struct *mm, struct uprobe *uprobe, if (ret) return ret; - uprobe->flags |= UPROBES_COPY_INSN; + uprobe->flags |= UPROBE_COPY_INSN; } ret = set_bkpt(mm, &uprobe->arch, addr); @@ -857,7 +857,7 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * uprobe->consumers = NULL; __uprobe_unregister(uprobe); } else { - uprobe->flags |= UPROBES_RUN_HANDLER; + uprobe->flags |= UPROBE_RUN_HANDLER; } } @@ -889,7 +889,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume if (consumer_del(uprobe, consumer)) { if (!uprobe->consumers) { __uprobe_unregister(uprobe); - uprobe->flags &= ~UPROBES_RUN_HANDLER; + uprobe->flags &= ~UPROBE_RUN_HANDLER; } } -- cgit v1.2.3 From e3343e6a2819ff5d0dfc4bb5c9fb7f9a4d04da73 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Mon, 12 Mar 2012 14:55:30 +0530 Subject: uprobes/core: Make order of function parameters consistent across functions If a function takes struct uprobe or struct arch_uprobe, then it is passed as the first parameter. This is pure cleanup, no functional change intended. 
Signed-off-by: Srikar Dronamraju Cc: Linus Torvalds Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Linux-mm Cc: Oleg Nesterov Cc: Andi Kleen Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120312092530.5379.18394.sendpatchset@srdronam.in.ibm.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uprobes.h | 2 +- arch/x86/kernel/uprobes.c | 15 +++---- include/linux/uprobes.h | 12 +++--- kernel/events/uprobes.c | 93 ++++++++++++++++++++++-------------------- 4 files changed, 63 insertions(+), 59 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 5c399e446512..384f1bebf884 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -39,5 +39,5 @@ struct arch_uprobe { #endif }; -extern int arch_uprobes_analyze_insn(struct mm_struct *mm, struct arch_uprobe *arch_uprobe); +extern int arch_uprobes_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm); #endif /* _ASM_UPROBES_H */ diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6dfa89e6f24a..851a11b0d38c 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -297,7 +297,8 @@ static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn) * - There's never a SIB byte. * - The displacement is always 4 bytes. 
*/ -static void handle_riprel_insn(struct mm_struct *mm, struct arch_uprobe *auprobe, struct insn *insn) +static void +handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) { u8 *cursor; u8 reg; @@ -381,19 +382,19 @@ static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn) return -ENOTSUPP; } -static int validate_insn_bits(struct mm_struct *mm, struct arch_uprobe *auprobe, struct insn *insn) +static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) { if (mm->context.ia32_compat) return validate_insn_32bits(auprobe, insn); return validate_insn_64bits(auprobe, insn); } #else /* 32-bit: */ -static void handle_riprel_insn(struct mm_struct *mm, struct arch_uprobe *auprobe, struct insn *insn) +static void handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) { /* No RIP-relative addressing on 32-bit */ } -static int validate_insn_bits(struct mm_struct *mm, struct arch_uprobe *auprobe, struct insn *insn) +static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) { return validate_insn_32bits(auprobe, insn); } @@ -405,17 +406,17 @@ static int validate_insn_bits(struct mm_struct *mm, struct arch_uprobe *auprobe, * @arch_uprobe: the probepoint information. * Return 0 on success or a -ve number on error. 
*/ -int arch_uprobes_analyze_insn(struct mm_struct *mm, struct arch_uprobe *auprobe) +int arch_uprobes_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm) { int ret; struct insn insn; auprobe->fixups = 0; - ret = validate_insn_bits(mm, auprobe, &insn); + ret = validate_insn_bits(auprobe, mm, &insn); if (ret != 0) return ret; - handle_riprel_insn(mm, auprobe, &insn); + handle_riprel_insn(auprobe, mm, &insn); prepare_fixups(auprobe, &insn); return 0; diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 838fb312926a..58699182e9a7 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -52,20 +52,20 @@ struct uprobe_consumer { }; #ifdef CONFIG_UPROBES -extern int __weak set_bkpt(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned long vaddr); -extern int __weak set_orig_insn(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned long vaddr, bool verify); +extern int __weak set_bkpt(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); +extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr, bool verify); extern bool __weak is_bkpt_insn(uprobe_opcode_t *insn); -extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer); -extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer); +extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); +extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_mmap(struct vm_area_struct *vma); #else /* CONFIG_UPROBES is not defined */ static inline int -uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer) +uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) { return -ENOSYS; } static inline void -uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer) 
+uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) { } static inline int uprobe_mmap(struct vm_area_struct *vma) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0d36bf3920ba..9c5ddff1c8da 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -192,8 +192,8 @@ bool __weak is_bkpt_insn(uprobe_opcode_t *insn) /* * write_opcode - write the opcode at a given virtual address. + * @auprobe: arch breakpointing information. * @mm: the probed process address space. - * @arch_uprobe: the breakpointing information. * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. * @@ -203,7 +203,7 @@ bool __weak is_bkpt_insn(uprobe_opcode_t *insn) * For mm @mm, write the opcode at @vaddr. * Return 0 (success) or a negative errno. */ -static int write_opcode(struct mm_struct *mm, struct arch_uprobe *auprobe, +static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t opcode) { struct page *old_page, *new_page; @@ -334,14 +334,14 @@ static int is_bkpt_at_addr(struct mm_struct *mm, unsigned long vaddr) /** * set_bkpt - store breakpoint at a given address. + * @auprobe: arch specific probepoint information. * @mm: the probed process address space. - * @uprobe: the probepoint information. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, store the breakpoint instruction at @vaddr. * Return 0 (success) or a negative errno. 
*/ -int __weak set_bkpt(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned long vaddr) +int __weak set_bkpt(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { int result; @@ -352,13 +352,13 @@ int __weak set_bkpt(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned if (result) return result; - return write_opcode(mm, auprobe, vaddr, UPROBE_BKPT_INSN); + return write_opcode(auprobe, mm, vaddr, UPROBE_BKPT_INSN); } /** * set_orig_insn - Restore the original instruction. * @mm: the probed process address space. - * @uprobe: the probepoint information. + * @auprobe: arch specific probepoint information. * @vaddr: the virtual address to insert the opcode. * @verify: if true, verify existance of breakpoint instruction. * @@ -366,7 +366,7 @@ int __weak set_bkpt(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned * Return 0 (success) or a negative errno. */ int __weak -set_orig_insn(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned long vaddr, bool verify) +set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, bool verify) { if (verify) { int result; @@ -378,7 +378,7 @@ set_orig_insn(struct mm_struct *mm, struct arch_uprobe *auprobe, unsigned long v if (result != 1) return result; } - return write_opcode(mm, auprobe, vaddr, *(uprobe_opcode_t *)auprobe->insn); + return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); } static int match_uprobe(struct uprobe *l, struct uprobe *r) @@ -525,30 +525,30 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) /* Returns the previous consumer */ static struct uprobe_consumer * -consumer_add(struct uprobe *uprobe, struct uprobe_consumer *consumer) +consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { down_write(&uprobe->consumer_rwsem); - consumer->next = uprobe->consumers; - uprobe->consumers = consumer; + uc->next = uprobe->consumers; + uprobe->consumers = uc; up_write(&uprobe->consumer_rwsem); 
- return consumer->next; + return uc->next; } /* - * For uprobe @uprobe, delete the consumer @consumer. - * Return true if the @consumer is deleted successfully + * For uprobe @uprobe, delete the consumer @uc. + * Return true if the @uc is deleted successfully * or return false. */ -static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *consumer) +static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) { struct uprobe_consumer **con; bool ret = false; down_write(&uprobe->consumer_rwsem); for (con = &uprobe->consumers; *con; con = &(*con)->next) { - if (*con == consumer) { - *con = consumer->next; + if (*con == uc) { + *con = uc->next; ret = true; break; } @@ -558,8 +558,8 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *consumer return ret; } -static int __copy_insn(struct address_space *mapping, - struct vm_area_struct *vma, char *insn, +static int +__copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn, unsigned long nbytes, unsigned long offset) { struct file *filp = vma->vm_file; @@ -590,7 +590,8 @@ static int __copy_insn(struct address_space *mapping, return 0; } -static int copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) +static int +copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) { struct address_space *mapping; unsigned long nbytes; @@ -617,8 +618,9 @@ static int copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset); } -static int install_breakpoint(struct mm_struct *mm, struct uprobe *uprobe, - struct vm_area_struct *vma, loff_t vaddr) +static int +install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, + struct vm_area_struct *vma, loff_t vaddr) { unsigned long addr; int ret; @@ -643,20 +645,21 @@ static int install_breakpoint(struct mm_struct *mm, struct uprobe *uprobe, if (is_bkpt_insn((uprobe_opcode_t 
*)uprobe->arch.insn)) return -EEXIST; - ret = arch_uprobes_analyze_insn(mm, &uprobe->arch); + ret = arch_uprobes_analyze_insn(&uprobe->arch, mm); if (ret) return ret; uprobe->flags |= UPROBE_COPY_INSN; } - ret = set_bkpt(mm, &uprobe->arch, addr); + ret = set_bkpt(&uprobe->arch, mm, addr); return ret; } -static void remove_breakpoint(struct mm_struct *mm, struct uprobe *uprobe, loff_t vaddr) +static void +remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) { - set_orig_insn(mm, &uprobe->arch, (unsigned long)vaddr, true); + set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true); } static void delete_uprobe(struct uprobe *uprobe) @@ -671,9 +674,9 @@ static void delete_uprobe(struct uprobe *uprobe) atomic_dec(&uprobe_events); } -static struct vma_info *__find_next_vma_info(struct list_head *head, - loff_t offset, struct address_space *mapping, - struct vma_info *vi, bool is_register) +static struct vma_info * +__find_next_vma_info(struct address_space *mapping, struct list_head *head, + struct vma_info *vi, loff_t offset, bool is_register) { struct prio_tree_iter iter; struct vm_area_struct *vma; @@ -719,8 +722,8 @@ static struct vma_info *__find_next_vma_info(struct list_head *head, * yet been inserted. 
*/ static struct vma_info * -find_next_vma_info(struct list_head *head, loff_t offset, struct address_space *mapping, - bool is_register) +find_next_vma_info(struct address_space *mapping, struct list_head *head, + loff_t offset, bool is_register) { struct vma_info *vi, *retvi; @@ -729,7 +732,7 @@ find_next_vma_info(struct list_head *head, loff_t offset, struct address_space * return ERR_PTR(-ENOMEM); mutex_lock(&mapping->i_mmap_mutex); - retvi = __find_next_vma_info(head, offset, mapping, vi, is_register); + retvi = __find_next_vma_info(mapping, head, vi, offset, is_register); mutex_unlock(&mapping->i_mmap_mutex); if (!retvi) @@ -754,7 +757,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) ret = 0; for (;;) { - vi = find_next_vma_info(&try_list, uprobe->offset, mapping, is_register); + vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register); if (!vi) break; @@ -784,9 +787,9 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) } if (is_register) - ret = install_breakpoint(mm, uprobe, vma, vi->vaddr); + ret = install_breakpoint(uprobe, mm, vma, vi->vaddr); else - remove_breakpoint(mm, uprobe, vi->vaddr); + remove_breakpoint(uprobe, mm, vi->vaddr); up_read(&mm->mmap_sem); mmput(mm); @@ -823,25 +826,25 @@ static void __uprobe_unregister(struct uprobe *uprobe) * uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. - * @consumer: information on howto handle the probe.. + * @uc: information on howto handle the probe.. * * Apart from the access refcount, uprobe_register() takes a creation * refcount (thro alloc_uprobe) if and only if this @uprobe is getting * inserted into the rbtree (i.e first consumer for a @inode:@offset * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. 
Creation - * refcount is released when the last @consumer for the @uprobe + * refcount is released when the last @uc for the @uprobe * unregisters. * * Return errno if it cannot successully install probes * else return 0 (success) */ -int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer) +int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; - if (!inode || !consumer || consumer->next) + if (!inode || !uc || uc->next) return -EINVAL; if (offset > i_size_read(inode)) @@ -851,7 +854,7 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * mutex_lock(uprobes_hash(inode)); uprobe = alloc_uprobe(inode, offset); - if (uprobe && !consumer_add(uprobe, consumer)) { + if (uprobe && !consumer_add(uprobe, uc)) { ret = __uprobe_register(uprobe); if (ret) { uprobe->consumers = NULL; @@ -871,13 +874,13 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * * uprobe_unregister - unregister a already registered probe. * @inode: the file in which the probe has to be removed. * @offset: offset from the start of the file. - * @consumer: identify which probe if multiple probes are colocated. + * @uc: identify which probe if multiple probes are colocated. 
*/ -void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *consumer) +void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) { struct uprobe *uprobe; - if (!inode || !consumer) + if (!inode || !uc) return; uprobe = find_uprobe(inode, offset); @@ -886,7 +889,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume mutex_lock(uprobes_hash(inode)); - if (consumer_del(uprobe, consumer)) { + if (consumer_del(uprobe, uc)) { if (!uprobe->consumers) { __uprobe_unregister(uprobe); uprobe->flags &= ~UPROBE_RUN_HANDLER; @@ -993,7 +996,7 @@ int uprobe_mmap(struct vm_area_struct *vma) if (!ret) { vaddr = vma_address(vma, uprobe->offset); if (vaddr >= vma->vm_start && vaddr < vma->vm_end) { - ret = install_breakpoint(vma->vm_mm, uprobe, vma, vaddr); + ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); /* Ignore double add: */ if (ret == -EEXIST) ret = 0; -- cgit v1.2.3 From 5cb4ac3a583d4ee18c8682ab857e093c4a0d0895 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Mon, 12 Mar 2012 14:55:45 +0530 Subject: uprobes/core: Rename bkpt to swbp bkpt doesn't seem to be a correct abbreviation for breakpoint. Choice was between bp and breakpoint. Since bp can refer to things other than breakpoint, use swbp to refer to breakpoints. This is pure cleanup, no functional change intended.
Signed-off-by: Srikar Dronamraju Cc: Linus Torvalds Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Linux-mm Cc: Oleg Nesterov Cc: Andi Kleen Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120312092545.5379.91251.sendpatchset@srdronam.in.ibm.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uprobes.h | 4 ++-- include/linux/uprobes.h | 4 ++-- kernel/events/uprobes.c | 34 +++++++++++++++++----------------- 3 files changed, 21 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 384f1bebf884..0500391f57d0 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -28,8 +28,8 @@ typedef u8 uprobe_opcode_t; #define MAX_UINSN_BYTES 16 #define UPROBE_XOL_SLOT_BYTES 128 /* to keep it cache aligned */ -#define UPROBE_BKPT_INSN 0xcc -#define UPROBE_BKPT_INSN_SIZE 1 +#define UPROBE_SWBP_INSN 0xcc +#define UPROBE_SWBP_INSN_SIZE 1 struct arch_uprobe { u16 fixups; diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 58699182e9a7..eac525f41b94 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -52,9 +52,9 @@ struct uprobe_consumer { }; #ifdef CONFIG_UPROBES -extern int __weak set_bkpt(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); +extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr, bool verify); -extern bool __weak is_bkpt_insn(uprobe_opcode_t *insn); +extern bool __weak is_swbp_insn(uprobe_opcode_t *insn); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_mmap(struct vm_area_struct *vma); diff --git 
a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 9c5ddff1c8da..e56e56aa7535 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -170,14 +170,14 @@ out: } /** - * is_bkpt_insn - check if instruction is breakpoint instruction. + * is_swbp_insn - check if instruction is breakpoint instruction. * @insn: instruction to be checked. - * Default implementation of is_bkpt_insn + * Default implementation of is_swbp_insn * Returns true if @insn is a breakpoint instruction. */ -bool __weak is_bkpt_insn(uprobe_opcode_t *insn) +bool __weak is_swbp_insn(uprobe_opcode_t *insn) { - return *insn == UPROBE_BKPT_INSN; + return *insn == UPROBE_SWBP_INSN; } /* @@ -227,7 +227,7 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, * adding probes in write mapped pages since the breakpoints * might end up in the file copy. */ - if (!valid_vma(vma, is_bkpt_insn(&opcode))) + if (!valid_vma(vma, is_swbp_insn(&opcode))) goto put_out; uprobe = container_of(auprobe, struct uprobe, arch); @@ -259,8 +259,8 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, /* poke the new insn in, ASSUMES we don't cross page boundary */ vaddr &= ~PAGE_MASK; - BUG_ON(vaddr + UPROBE_BKPT_INSN_SIZE > PAGE_SIZE); - memcpy(vaddr_new + vaddr, &opcode, UPROBE_BKPT_INSN_SIZE); + BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); + memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); kunmap_atomic(vaddr_new); kunmap_atomic(vaddr_old); @@ -308,7 +308,7 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_ lock_page(page); vaddr_new = kmap_atomic(page); vaddr &= ~PAGE_MASK; - memcpy(opcode, vaddr_new + vaddr, UPROBE_BKPT_INSN_SIZE); + memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE); kunmap_atomic(vaddr_new); unlock_page(page); @@ -317,7 +317,7 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_ return 0; } -static int is_bkpt_at_addr(struct mm_struct *mm, unsigned long 
vaddr) +static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) { uprobe_opcode_t opcode; int result; @@ -326,14 +326,14 @@ static int is_bkpt_at_addr(struct mm_struct *mm, unsigned long vaddr) if (result) return result; - if (is_bkpt_insn(&opcode)) + if (is_swbp_insn(&opcode)) return 1; return 0; } /** - * set_bkpt - store breakpoint at a given address. + * set_swbp - store breakpoint at a given address. * @auprobe: arch specific probepoint information. * @mm: the probed process address space. * @vaddr: the virtual address to insert the opcode. @@ -341,18 +341,18 @@ static int is_bkpt_at_addr(struct mm_struct *mm, unsigned long vaddr) * For mm @mm, store the breakpoint instruction at @vaddr. * Return 0 (success) or a negative errno. */ -int __weak set_bkpt(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) +int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { int result; - result = is_bkpt_at_addr(mm, vaddr); + result = is_swbp_at_addr(mm, vaddr); if (result == 1) return -EEXIST; if (result) return result; - return write_opcode(auprobe, mm, vaddr, UPROBE_BKPT_INSN); + return write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); } /** @@ -371,7 +371,7 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v if (verify) { int result; - result = is_bkpt_at_addr(mm, vaddr); + result = is_swbp_at_addr(mm, vaddr); if (!result) return -EINVAL; @@ -642,7 +642,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (ret) return ret; - if (is_bkpt_insn((uprobe_opcode_t *)uprobe->arch.insn)) + if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) return -EEXIST; ret = arch_uprobes_analyze_insn(&uprobe->arch, mm); @@ -651,7 +651,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, uprobe->flags |= UPROBE_COPY_INSN; } - ret = set_bkpt(&uprobe->arch, mm, addr); + ret = set_swbp(&uprobe->arch, mm, addr); return ret; } -- cgit v1.2.3 From 
0326f5a94ddea33fa331b2519f4172f4fb387baa Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 13 Mar 2012 23:30:11 +0530 Subject: uprobes/core: Handle breakpoint and singlestep exceptions Uprobes uses exception notifiers to get to know if a thread hit a breakpoint or a singlestep exception. When a thread hits a uprobe or is singlestepping post a uprobe hit, the uprobe exception notifier sets its TIF_UPROBE bit, which will then be checked on its return to userspace path (do_notify_resume() -> uprobe_notify_resume()), where the consumers' handlers are run (in task context) based on the defined filters. Uprobe hits are thread specific and hence we need to maintain information about if a task hit a uprobe, what uprobe was hit, the slot where the original instruction was copied for xol so that it can be singlestepped with appropriate fixups. In some cases, special care is needed for instructions that are executed out of line (xol). These are architecture specific artefacts, such as handling RIP relative instructions on x86_64. Since the instruction at which the uprobe was inserted is executed out of line, architecture specific fixups are added so that the thread continues normal execution in the presence of a uprobe. Postpone the signals until we execute the probed insn. post_xol() path does a recalc_sigpending() before return to user-mode, this ensures the signal can't be lost. Uprobes relies on DIE_DEBUG notification to notify if a singlestep is complete. Adds x86 specific uprobe exception notifiers and appropriate hooks needed to determine a uprobe hit and subsequent post processing. Add requisite x86 fixups for xol for uprobes. Specific cases needing fixups include relative jumps (x86_64), calls, etc. Where possible, we check and skip singlestepping the breakpointed instructions. For now we skip single byte as well as a few multibyte nop instructions. However this can be extended to other instructions too.
Credits to Oleg Nesterov for suggestions/patches related to signal, breakpoint, singlestep handling code. Signed-off-by: Srikar Dronamraju Cc: Linus Torvalds Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Linux-mm Cc: Oleg Nesterov Cc: Andi Kleen Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120313180011.29771.89027.sendpatchset@srdronam.in.ibm.com [ Performed various cleanliness edits ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/thread_info.h | 2 + arch/x86/include/asm/uprobes.h | 16 +- arch/x86/kernel/signal.c | 6 + arch/x86/kernel/uprobes.c | 265 +++++++++++++++++++++++++++++- include/linux/sched.h | 4 + include/linux/uprobes.h | 55 ++++++- kernel/events/uprobes.c | 323 ++++++++++++++++++++++++++++++++++++- kernel/fork.c | 4 + kernel/signal.c | 4 + 9 files changed, 664 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index ad6df8ccd715..0710c11305d4 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -85,6 +85,7 @@ struct thread_info { #define TIF_SECCOMP 8 /* secure computing */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ +#define TIF_UPROBE 12 /* breakpointed or singlestepping */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* IA32 compatibility process */ #define TIF_FORK 18 /* ret_from_fork */ @@ -109,6 +110,7 @@ struct thread_info { #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) +#define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h 
index 0500391f57d0..1e9bed14f7ae 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -23,6 +23,8 @@ * Jim Keniston */ +#include + typedef u8 uprobe_opcode_t; #define MAX_UINSN_BYTES 16 @@ -39,5 +41,17 @@ struct arch_uprobe { #endif }; -extern int arch_uprobes_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm); +struct arch_uprobe_task { + unsigned long saved_trap_nr; +#ifdef CONFIG_X86_64 + unsigned long saved_scratch_register; +#endif +}; + +extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm); +extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs); +extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs); +extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); +extern int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data); +extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs); #endif /* _ASM_UPROBES_H */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 9c73acc1c860..b3cd6913ceea 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -823,6 +824,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) mce_notify_process(); #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ + if (thread_info_flags & _TIF_UPROBE) { + clear_thread_flag(TIF_UPROBE); + uprobe_notify_resume(regs); + } + /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 851a11b0d38c..dc4e910a7d96 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -24,22 +24,28 @@ #include #include #include +#include #include +#include #include /* Post-execution fixups. 
*/ /* No fixup needed */ -#define UPROBE_FIX_NONE 0x0 +#define UPROBE_FIX_NONE 0x0 + /* Adjust IP back to vicinity of actual insn */ #define UPROBE_FIX_IP 0x1 + /* Adjust the return address of a call insn */ #define UPROBE_FIX_CALL 0x2 #define UPROBE_FIX_RIP_AX 0x8000 #define UPROBE_FIX_RIP_CX 0x4000 +#define UPROBE_TRAP_NR UINT_MAX + /* Adaptations for mhiramat x86 decoder v14. */ #define OPCODE1(insn) ((insn)->opcode.bytes[0]) #define OPCODE2(insn) ((insn)->opcode.bytes[1]) @@ -221,10 +227,9 @@ static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn) } /* - * Figure out which fixups post_xol() will need to perform, and annotate - * arch_uprobe->fixups accordingly. To start with, - * arch_uprobe->fixups is either zero or it reflects rip-related - * fixups. + * Figure out which fixups arch_uprobe_post_xol() will need to perform, and + * annotate arch_uprobe->fixups accordingly. To start with, + * arch_uprobe->fixups is either zero or it reflects rip-related fixups. */ static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn) { @@ -401,12 +406,12 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, #endif /* CONFIG_X86_64 */ /** - * arch_uprobes_analyze_insn - instruction analysis including validity and fixups. + * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. * @mm: the probed address space. * @arch_uprobe: the probepoint information. * Return 0 on success or a -ve number on error. */ -int arch_uprobes_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm) +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm) { int ret; struct insn insn; @@ -421,3 +426,249 @@ int arch_uprobes_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm) return 0; } + +#ifdef CONFIG_X86_64 +/* + * If we're emulating a rip-relative instruction, save the contents + * of the scratch register and store the target address in that register. 
+ */ +static void +pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, + struct arch_uprobe_task *autask) +{ + if (auprobe->fixups & UPROBE_FIX_RIP_AX) { + autask->saved_scratch_register = regs->ax; + regs->ax = current->utask->vaddr; + regs->ax += auprobe->rip_rela_target_address; + } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) { + autask->saved_scratch_register = regs->cx; + regs->cx = current->utask->vaddr; + regs->cx += auprobe->rip_rela_target_address; + } +} +#else +static void +pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, + struct arch_uprobe_task *autask) +{ + /* No RIP-relative addressing on 32-bit */ +} +#endif + +/* + * arch_uprobe_pre_xol - prepare to execute out of line. + * @auprobe: the probepoint information. + * @regs: reflects the saved user state of current task. + */ +int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct arch_uprobe_task *autask; + + autask = &current->utask->autask; + autask->saved_trap_nr = current->thread.trap_nr; + current->thread.trap_nr = UPROBE_TRAP_NR; + regs->ip = current->utask->xol_vaddr; + pre_xol_rip_insn(auprobe, regs, autask); + + return 0; +} + +/* + * This function is called by arch_uprobe_post_xol() to adjust the return + * address pushed by a call instruction executed out of line.
+ */ +static int adjust_ret_addr(unsigned long sp, long correction) +{ + int rasize, ncopied; + long ra = 0; + + if (is_ia32_task()) + rasize = 4; + else + rasize = 8; + + ncopied = copy_from_user(&ra, (void __user *)sp, rasize); + if (unlikely(ncopied)) + return -EFAULT; + + ra += correction; + ncopied = copy_to_user((void __user *)sp, &ra, rasize); + if (unlikely(ncopied)) + return -EFAULT; + + return 0; +} + +#ifdef CONFIG_X86_64 +static bool is_riprel_insn(struct arch_uprobe *auprobe) +{ + return ((auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) != 0); +} + +static void +handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) +{ + if (is_riprel_insn(auprobe)) { + struct arch_uprobe_task *autask; + + autask = &current->utask->autask; + if (auprobe->fixups & UPROBE_FIX_RIP_AX) + regs->ax = autask->saved_scratch_register; + else + regs->cx = autask->saved_scratch_register; + + /* + * The original instruction includes a displacement, and so + * is 4 bytes longer than what we've just single-stepped. + * Fall through to handle stuff like "jmpq *...(%rip)" and + * "callq *...(%rip)". + */ + if (correction) + *correction += 4; + } +} +#else +static void +handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) +{ + /* No RIP-relative addressing on 32-bit */ +} +#endif + +/* + * If xol insn itself traps and generates a signal(Say, + * SIGILL/SIGSEGV/etc), then detect the case where a singlestepped + * instruction jumps back to its own address. It is assumed that anything + * like do_page_fault/do_trap/etc sets thread.trap_nr != -1. + * + * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr, + * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to + * UPROBE_TRAP_NR == -1 set by arch_uprobe_pre_xol().
+ */ +bool arch_uprobe_xol_was_trapped(struct task_struct *t) +{ + if (t->thread.trap_nr != UPROBE_TRAP_NR) + return true; + + return false; +} + +/* + * Called after single-stepping. To avoid the SMP problems that can + * occur when we temporarily put back the original opcode to + * single-step, we single-stepped a copy of the instruction. + * + * This function prepares to resume execution after the single-step. + * We have to fix things up as follows: + * + * Typically, the new ip is relative to the copied instruction. We need + * to make it relative to the original instruction (FIX_IP). Exceptions + * are return instructions and absolute or indirect jump or call instructions. + * + * If the single-stepped instruction was a call, the return address that + * is atop the stack is the address following the copied instruction. We + * need to make it the address following the original instruction (FIX_CALL). + * + * If the original instruction was a rip-relative instruction such as + * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent + * instruction using a scratch register -- e.g., "movl %edx,(%rax)". + * We need to restore the contents of the scratch register and adjust + * the ip, keeping in mind that the instruction we executed is 4 bytes + * shorter than the original instruction (since we squeezed out the offset + * field). 
(FIX_RIP_AX or FIX_RIP_CX) + */ +int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask; + long correction; + int result = 0; + + WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); + + utask = current->utask; + current->thread.trap_nr = utask->autask.saved_trap_nr; + correction = (long)(utask->vaddr - utask->xol_vaddr); + handle_riprel_post_xol(auprobe, regs, &correction); + if (auprobe->fixups & UPROBE_FIX_IP) + regs->ip += correction; + + if (auprobe->fixups & UPROBE_FIX_CALL) + result = adjust_ret_addr(regs->sp, correction); + + return result; +} + +/* callback routine for handling exceptions. */ +int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data) +{ + struct die_args *args = data; + struct pt_regs *regs = args->regs; + int ret = NOTIFY_DONE; + + /* We are only interested in userspace traps */ + if (regs && !user_mode_vm(regs)) + return NOTIFY_DONE; + + switch (val) { + case DIE_INT3: + if (uprobe_pre_sstep_notifier(regs)) + ret = NOTIFY_STOP; + + break; + + case DIE_DEBUG: + if (uprobe_post_sstep_notifier(regs)) + ret = NOTIFY_STOP; + + default: + break; + } + + return ret; +} + +/* + * This function gets called when XOL instruction either gets trapped or + * the thread has a fatal signal, so reset the instruction pointer to its + * probed address. + */ +void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + current->thread.trap_nr = utask->autask.saved_trap_nr; + handle_riprel_post_xol(auprobe, regs, NULL); + instruction_pointer_set(regs, utask->vaddr); +} + +/* + * Skip these instructions as per the currently known x86 ISA. 
+ * 0x66* { 0x90 | 0x0f 0x1f | 0x0f 0x19 | 0x87 0xc0 } + */ +bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + int i; + + for (i = 0; i < MAX_UINSN_BYTES; i++) { + if ((auprobe->insn[i] == 0x66)) + continue; + + if (auprobe->insn[i] == 0x90) + return true; + + if (i == (MAX_UINSN_BYTES - 1)) + break; + + if ((auprobe->insn[i] == 0x0f) && (auprobe->insn[i+1] == 0x1f)) + return true; + + if ((auprobe->insn[i] == 0x0f) && (auprobe->insn[i+1] == 0x19)) + return true; + + if ((auprobe->insn[i] == 0x87) && (auprobe->insn[i+1] == 0xc0)) + return true; + + break; + } + return false; +} diff --git a/include/linux/sched.h b/include/linux/sched.h index 7d379a6bfd88..8379e3771690 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1590,6 +1590,10 @@ struct task_struct { #ifdef CONFIG_HAVE_HW_BREAKPOINT atomic_t ptrace_bp_refcnt; #endif +#ifdef CONFIG_UPROBES + struct uprobe_task *utask; + int uprobe_srcu_id; +#endif }; /* Future-safe accessor for struct task_struct's cpus_allowed. */ diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index eac525f41b94..5ec778fdce6f 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -28,8 +28,9 @@ #include struct vm_area_struct; + #ifdef CONFIG_ARCH_SUPPORTS_UPROBES -#include +# include #endif /* flags that denote/change uprobes behaviour */ @@ -39,6 +40,8 @@ struct vm_area_struct; /* Dont run handlers when first register/ last unregister in progress*/ #define UPROBE_RUN_HANDLER 0x2 +/* Can skip singlestep */ +#define UPROBE_SKIP_SSTEP 0x4 struct uprobe_consumer { int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); @@ -52,13 +55,42 @@ struct uprobe_consumer { }; #ifdef CONFIG_UPROBES +enum uprobe_task_state { + UTASK_RUNNING, + UTASK_BP_HIT, + UTASK_SSTEP, + UTASK_SSTEP_ACK, + UTASK_SSTEP_TRAPPED, +}; + +/* + * uprobe_task: Metadata of a task while it singlesteps. 
+ */ +struct uprobe_task { + enum uprobe_task_state state; + struct arch_uprobe_task autask; + + struct uprobe *active_uprobe; + + unsigned long xol_vaddr; + unsigned long vaddr; +}; + extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr, bool verify); extern bool __weak is_swbp_insn(uprobe_opcode_t *insn); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_mmap(struct vm_area_struct *vma); -#else /* CONFIG_UPROBES is not defined */ +extern void uprobe_free_utask(struct task_struct *t); +extern void uprobe_copy_process(struct task_struct *t); +extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs); +extern int uprobe_post_sstep_notifier(struct pt_regs *regs); +extern int uprobe_pre_sstep_notifier(struct pt_regs *regs); +extern void uprobe_notify_resume(struct pt_regs *regs); +extern bool uprobe_deny_signal(void); +extern bool __weak arch_uprobe_skip_sstep(struct arch_uprobe *aup, struct pt_regs *regs); +#else /* !CONFIG_UPROBES */ static inline int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) { @@ -72,5 +104,22 @@ static inline int uprobe_mmap(struct vm_area_struct *vma) { return 0; } -#endif /* CONFIG_UPROBES */ +static inline void uprobe_notify_resume(struct pt_regs *regs) +{ +} +static inline bool uprobe_deny_signal(void) +{ + return false; +} +static inline unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) +{ + return 0; +} +static inline void uprobe_free_utask(struct task_struct *t) +{ +} +static inline void uprobe_copy_process(struct task_struct *t) +{ +} +#endif /* !CONFIG_UPROBES */ #endif /* _LINUX_UPROBES_H */ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e56e56aa7535..b807d1566b64 
100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -30,9 +30,12 @@ #include <linux/rmap.h> /* anon_vma_prepare */ #include <linux/mmu_notifier.h> /* set_pte_at_notify */ #include <linux/swap.h> /* try_to_free_swap */ +#include <linux/ptrace.h> /* user_enable_single_step */ +#include <linux/kdebug.h> /* notifier mechanism */ #include <linux/uprobes.h> +static struct srcu_struct uprobes_srcu; static struct rb_root uprobes_tree = RB_ROOT; static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ @@ -486,6 +489,9 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) u = __insert_uprobe(uprobe); spin_unlock_irqrestore(&uprobes_treelock, flags); + /* For now assume that the instruction need not be single-stepped */ + uprobe->flags |= UPROBE_SKIP_SSTEP; + return u; } @@ -523,6 +529,21 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) return uprobe; } +static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) +{ + struct uprobe_consumer *uc; + + if (!(uprobe->flags & UPROBE_RUN_HANDLER)) + return; + + down_read(&uprobe->consumer_rwsem); + for (uc = uprobe->consumers; uc; uc = uc->next) { + if (!uc->filter || uc->filter(uc, current)) + uc->handler(uc, regs); + } + up_read(&uprobe->consumer_rwsem); +} + /* Returns the previous consumer */ static struct uprobe_consumer * consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) @@ -645,7 +666,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) return -EEXIST; - ret = arch_uprobes_analyze_insn(&uprobe->arch, mm); + ret = arch_uprobe_analyze_insn(&uprobe->arch, mm); if (ret) return ret; @@ -662,10 +683,21 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true); } +/* + * There could be threads that have hit the breakpoint and are entering the + * notifier code and trying to acquire the uprobes_treelock.
The thread + * calling delete_uprobe() that is removing the uprobe from the rb_tree can + * race with these threads and might acquire the uprobes_treelock compared + * to some of the breakpoint hit threads. In such a case, the breakpoint + * hit threads will not find the uprobe. The current unregistering thread + * waits till all other threads have hit a breakpoint, to acquire the + * uprobes_treelock before the uprobe is removed from the rbtree. + */ static void delete_uprobe(struct uprobe *uprobe) { unsigned long flags; + synchronize_srcu(&uprobes_srcu); spin_lock_irqsave(&uprobes_treelock, flags); rb_erase(&uprobe->rb_node, &uprobes_tree); spin_unlock_irqrestore(&uprobes_treelock, flags); @@ -1010,6 +1042,288 @@ int uprobe_mmap(struct vm_area_struct *vma) return ret; } +/** + * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs + * @regs: Reflects the saved state of the task after it has hit a breakpoint + * instruction. + * Return the address of the breakpoint instruction. + */ +unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) +{ + return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; +} + +/* + * Called with no locks held. + * Called in context of a exiting or a exec-ing thread. + */ +void uprobe_free_utask(struct task_struct *t) +{ + struct uprobe_task *utask = t->utask; + + if (t->uprobe_srcu_id != -1) + srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id); + + if (!utask) + return; + + if (utask->active_uprobe) + put_uprobe(utask->active_uprobe); + + kfree(utask); + t->utask = NULL; +} + +/* + * Called in context of a new clone/fork from copy_process. + */ +void uprobe_copy_process(struct task_struct *t) +{ + t->utask = NULL; + t->uprobe_srcu_id = -1; +} + +/* + * Allocate a uprobe_task object for the task. + * Called when the thread hits a breakpoint for the first time. 
+ * + * Returns: + * - pointer to new uprobe_task on success + * - NULL otherwise + */ +static struct uprobe_task *add_utask(void) +{ + struct uprobe_task *utask; + + utask = kzalloc(sizeof *utask, GFP_KERNEL); + if (unlikely(!utask)) + return NULL; + + utask->active_uprobe = NULL; + current->utask = utask; + return utask; +} + +/* Prepare to single-step probed instruction out of line. */ +static int +pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr) +{ + return -EFAULT; +} + +/* + * If we are singlestepping, then ensure this thread is not connected to + * non-fatal signals until completion of singlestep. When xol insn itself + * triggers the signal, restart the original insn even if the task is + * already SIGKILL'ed (since coredump should report the correct ip). This + * is even more important if the task has a handler for SIGSEGV/etc, The + * _same_ instruction should be repeated again after return from the signal + * handler, and SSTEP can never finish in this case. + */ +bool uprobe_deny_signal(void) +{ + struct task_struct *t = current; + struct uprobe_task *utask = t->utask; + + if (likely(!utask || !utask->active_uprobe)) + return false; + + WARN_ON_ONCE(utask->state != UTASK_SSTEP); + + if (signal_pending(t)) { + spin_lock_irq(&t->sighand->siglock); + clear_tsk_thread_flag(t, TIF_SIGPENDING); + spin_unlock_irq(&t->sighand->siglock); + + if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { + utask->state = UTASK_SSTEP_TRAPPED; + set_tsk_thread_flag(t, TIF_UPROBE); + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); + } + } + + return true; +} + +/* + * Avoid singlestepping the original instruction if the original instruction + * is a NOP or can be emulated. + */ +static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) +{ + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) + return true; + + uprobe->flags &= ~UPROBE_SKIP_SSTEP; + return false; +} + +/* + * Run handler and ask thread to singlestep. 
+ * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. + */ +static void handle_swbp(struct pt_regs *regs) +{ + struct vm_area_struct *vma; + struct uprobe_task *utask; + struct uprobe *uprobe; + struct mm_struct *mm; + unsigned long bp_vaddr; + + uprobe = NULL; + bp_vaddr = uprobe_get_swbp_addr(regs); + mm = current->mm; + down_read(&mm->mmap_sem); + vma = find_vma(mm, bp_vaddr); + + if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) { + struct inode *inode; + loff_t offset; + + inode = vma->vm_file->f_mapping->host; + offset = bp_vaddr - vma->vm_start; + offset += (vma->vm_pgoff << PAGE_SHIFT); + uprobe = find_uprobe(inode, offset); + } + + srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); + current->uprobe_srcu_id = -1; + up_read(&mm->mmap_sem); + + if (!uprobe) { + /* No matching uprobe; signal SIGTRAP. */ + send_sig(SIGTRAP, current, 0); + return; + } + + utask = current->utask; + if (!utask) { + utask = add_utask(); + /* Cannot allocate; re-execute the instruction. */ + if (!utask) + goto cleanup_ret; + } + utask->active_uprobe = uprobe; + handler_chain(uprobe, regs); + if (uprobe->flags & UPROBE_SKIP_SSTEP && can_skip_sstep(uprobe, regs)) + goto cleanup_ret; + + utask->state = UTASK_SSTEP; + if (!pre_ssout(uprobe, regs, bp_vaddr)) { + user_enable_single_step(current); + return; + } + +cleanup_ret: + if (utask) { + utask->active_uprobe = NULL; + utask->state = UTASK_RUNNING; + } + if (uprobe) { + if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) + + /* + * cannot singlestep; cannot skip instruction; + * re-execute the instruction. + */ + instruction_pointer_set(regs, bp_vaddr); + + put_uprobe(uprobe); + } +} + +/* + * Perform required fix-ups and disable singlestep. + * Allow pending signals to take effect. 
+ */ +static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) +{ + struct uprobe *uprobe; + + uprobe = utask->active_uprobe; + if (utask->state == UTASK_SSTEP_ACK) + arch_uprobe_post_xol(&uprobe->arch, regs); + else if (utask->state == UTASK_SSTEP_TRAPPED) + arch_uprobe_abort_xol(&uprobe->arch, regs); + else + WARN_ON_ONCE(1); + + put_uprobe(uprobe); + utask->active_uprobe = NULL; + utask->state = UTASK_RUNNING; + user_disable_single_step(current); + + spin_lock_irq(&current->sighand->siglock); + recalc_sigpending(); /* see uprobe_deny_signal() */ + spin_unlock_irq(&current->sighand->siglock); +} + +/* + * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag. (and on + * subsequent probe hits on the thread sets the state to UTASK_BP_HIT) and + * allows the thread to return from interrupt. + * + * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag and + * also sets the state to UTASK_SSTEP_ACK and allows the thread to return from + * interrupt. + * + * While returning to userspace, thread notices the TIF_UPROBE flag and calls + * uprobe_notify_resume(). + */ +void uprobe_notify_resume(struct pt_regs *regs) +{ + struct uprobe_task *utask; + + utask = current->utask; + if (!utask || utask->state == UTASK_BP_HIT) + handle_swbp(regs); + else + handle_singlestep(utask, regs); +} + +/* + * uprobe_pre_sstep_notifier gets called from interrupt context as part of + * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit. + */ +int uprobe_pre_sstep_notifier(struct pt_regs *regs) +{ + struct uprobe_task *utask; + + if (!current->mm) + return 0; + + utask = current->utask; + if (utask) + utask->state = UTASK_BP_HIT; + + set_thread_flag(TIF_UPROBE); + current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu); + + return 1; +} + +/* + * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier + * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep.
+ */ +int uprobe_post_sstep_notifier(struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + if (!current->mm || !utask || !utask->active_uprobe) + /* task is currently not uprobed */ + return 0; + + utask->state = UTASK_SSTEP_ACK; + set_thread_flag(TIF_UPROBE); + return 1; +} + +static struct notifier_block uprobe_exception_nb = { + .notifier_call = arch_uprobe_exception_notify, + .priority = INT_MAX-1, /* notified after kprobes, kgdb */ +}; + static int __init init_uprobes(void) { int i; @@ -1018,12 +1332,13 @@ static int __init init_uprobes(void) mutex_init(&uprobes_mutex[i]); mutex_init(&uprobes_mmap_mutex[i]); } - return 0; + init_srcu_struct(&uprobes_srcu); + + return register_die_notifier(&uprobe_exception_nb); } +module_init(init_uprobes); static void __exit exit_uprobes(void) { } - -module_init(init_uprobes); module_exit(exit_uprobes); diff --git a/kernel/fork.c b/kernel/fork.c index e2cd3e2a5ae8..eb7b63334009 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include @@ -701,6 +702,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) exit_pi_state_list(tsk); #endif + uprobe_free_utask(tsk); + /* Get rid of any cached register state */ deactivate_mm(tsk, mm); @@ -1295,6 +1298,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->pi_state_list); p->pi_state_cache = NULL; #endif + uprobe_copy_process(p); /* * sigaltstack should be cleared when sharing the same VM */ diff --git a/kernel/signal.c b/kernel/signal.c index 8511e39813c7..e93ff0a719a0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -29,6 +29,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -2192,6 +2193,9 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct signal_struct *signal = current->signal; int signr; + if (unlikely(uprobe_deny_signal())) + return 0; + relock: /* * We'll jump back here after 
any time we were stopped in TASK_STOPPED. -- cgit v1.2.3 From a6fca40f1d7f3e232c9de27c1cebbb9f787fbc4f Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 22 Mar 2012 17:01:25 -0700 Subject: x86, tlb: Switch cr3 in leave_mm() only when needed Currently leave_mm() unconditionally switches the cr3 to swapper_pg_dir. But there is no need to change the cr3, if we already left that mm. intel_idle() for example calls leave_mm() on every deep c-state entry where the CPU flushes the TLB for us. Similarly flush_tlb_all() was also calling leave_mm() whenever the TLB is in LAZY state. Both these paths will be improved with this change. Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1332460885.16101.147.camel@sbsiddha-desk.sc.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/mm/tlb.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index d6c0418c3e47..125bcad1b757 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -61,11 +61,13 @@ static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset); */ void leave_mm(int cpu) { + struct mm_struct *active_mm = percpu_read(cpu_tlbstate.active_mm); if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) BUG(); - cpumask_clear_cpu(cpu, - mm_cpumask(percpu_read(cpu_tlbstate.active_mm))); - load_cr3(swapper_pg_dir); + if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { + cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); + load_cr3(swapper_pg_dir); + } } EXPORT_SYMBOL_GPL(leave_mm); -- cgit v1.2.3 From 2e064b1e131eba262c0ba4268cb79dbc72edeece Mon Sep 17 00:00:00 2001 From: Jordan Justen Date: Fri, 23 Mar 2012 09:35:04 -0700 Subject: x86, efi: Fix issue of overlapping .reloc section for EFI_STUB Previously the .reloc section was embedded in the .text section. No relocations are required during the PE/COFF loading phase for the kernel using the EFI_STUB UEFI loader. 
To fix the issue of overlapping sections, create a .reloc section with a zero length. The .reloc section header must exist to make sure the image will be loaded by the UEFI firmware, but a zero-length section header seems to be sufficient. Signed-off-by: Jordan Justen Link: http://lkml.kernel.org/r/1332520506-6472-2-git-send-email-jordan.l.justen@intel.com Acked-by: Matt Fleming Signed-off-by: H. Peter Anvin --- arch/x86/boot/header.S | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index f1bbeeb09148..4e9124b148c2 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -217,18 +217,17 @@ section_table: # # The EFI application loader requires a relocation section - # because EFI applications are relocatable and not having - # this section seems to confuse it. But since we don't need - # the loader to fixup any relocs for us just fill it with a - # single dummy reloc. + # because EFI applications must be relocatable. But since + # we don't need the loader to fixup any relocs for us, we + # just create an empty (zero-length) .reloc section header. # .ascii ".reloc" .byte 0 .byte 0 - .long reloc_end - reloc_start - .long reloc_start - .long reloc_end - reloc_start # SizeOfRawData - .long reloc_start # PointerToRawData + .long 0 + .long 0 + .long 0 # SizeOfRawData + .long 0 # PointerToRawData .long 0 # PointerToRelocations .long 0 # PointerToLineNumbers .word 0 # NumberOfRelocations @@ -469,10 +468,3 @@ setup_corrupt: .data dummy: .long 0 - - .section .reloc -reloc_start: - .long dummy - reloc_start - .long 10 - .word 0 -reloc_end: -- cgit v1.2.3 From e31be363df3092821bf179cf4baa076f501b8ae6 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 23 Mar 2012 09:35:05 -0700 Subject: x86, efi: Fix .text section overlapping image header for EFI_STUB This change modifes the PE .text section to start after the first sector of the kernel image. 
The header may be modified by the UEFI secure boot signing, so it is not appropriate for it to be included in one of the image sections. Since the sections are part of the secure boot hash, this modification to the .text section contents would invalidate the secure boot signed hash. Note: UEFI secure boot does hash the image header, but fields that are changed by the signing process are excluded from the hash calculation. This exclusion process is only handled for the image header, and not image sections. Luckily, we can still easily boot without the first sector by initializing a few fields in arch/x86/boot/compressed/eboot.c. Signed-off-by: Matt Fleming Link: http://lkml.kernel.org/r/1332520506-6472-3-git-send-email-jordan.l.justen@intel.com [jordan.l.justen@intel.com: set .text vma & file offset] Signed-off-by: Jordan Justen Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/eboot.c | 14 +++++++++++--- arch/x86/boot/header.S | 2 +- arch/x86/boot/tools/build.c | 25 ++++++++++++++++++++++--- 3 files changed, 34 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index fec216f4fbc3..01cbb8707a31 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -904,11 +904,19 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table) memset(boot_params, 0x0, 0x4000); - /* Copy first two sectors to boot_params */ - memcpy(boot_params, image->image_base, 1024); - hdr = &boot_params->hdr; + /* Copy the second sector to boot_params */ + memcpy(&hdr->jump, image->image_base + 512, 512); + + /* + * Fill out some of the header fields ourselves because the + * EFI firmware loader doesn't load the first sector. 
+ */ + hdr->root_flags = 1; + hdr->vid_mode = 0xffff; + hdr->boot_flag = 0xAA55; + /* * The EFI firmware loader could have placed the kernel image * anywhere in memory, but the kernel has various restrictions diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 4e9124b148c2..4ceb56e9a4ce 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -147,7 +147,7 @@ optional_header: # Filled in by build.c .long 0x0000 # AddressOfEntryPoint - .long 0x0000 # BaseOfCode + .long 0x0200 # BaseOfCode #ifdef CONFIG_X86_32 .long 0 # data #endif diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index 4e9bd6bcafa6..2aeab3dc9e5f 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -202,12 +202,19 @@ int main(int argc, char ** argv) pe_header = *(unsigned int *)&buf[0x3c]; - /* Size of code */ - *(unsigned int *)&buf[pe_header + 0x1c] = file_sz; - /* Size of image */ *(unsigned int *)&buf[pe_header + 0x50] = file_sz; + /* + * Subtract the size of the first section (512 bytes) which + * includes the header and .reloc section. The remaining size + * is that of the .text section. + */ + file_sz -= 512; + + /* Size of code */ + *(unsigned int *)&buf[pe_header + 0x1c] = file_sz; + #ifdef CONFIG_X86_32 /* Address of entry point */ *(unsigned int *)&buf[pe_header + 0x28] = i; @@ -215,8 +222,14 @@ int main(int argc, char ** argv) /* .text size */ *(unsigned int *)&buf[pe_header + 0xb0] = file_sz; + /* .text vma */ + *(unsigned int *)&buf[pe_header + 0xb4] = 0x200; + /* .text size of initialised data */ *(unsigned int *)&buf[pe_header + 0xb8] = file_sz; + + /* .text file offset */ + *(unsigned int *)&buf[pe_header + 0xbc] = 0x200; #else /* * Address of entry point. 
startup_32 is at the beginning and @@ -228,8 +241,14 @@ int main(int argc, char ** argv) /* .text size */ *(unsigned int *)&buf[pe_header + 0xc0] = file_sz; + /* .text vma */ + *(unsigned int *)&buf[pe_header + 0xc4] = 0x200; + /* .text size of initialised data */ *(unsigned int *)&buf[pe_header + 0xc8] = file_sz; + + /* .text file offset */ + *(unsigned int *)&buf[pe_header + 0xcc] = 0x200; #endif /* CONFIG_X86_32 */ #endif /* CONFIG_EFI_STUB */ -- cgit v1.2.3 From e47bb0bda46bf50f81671db502d0c903e0a32604 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 23 Mar 2012 09:35:06 -0700 Subject: x86, efi: Fix NumberOfRvaAndSizes field in PE32 header for EFI_STUB We've actually got six data directories in the header, not one. Even though the firmware loader doesn't seem to mind, when we come to sign the kernel image the signing tool thinks that there is no Certificate Table data directory, even though we've allocated space for one. Signed-off-by: Matt Fleming Link: http://lkml.kernel.org/r/1332520506-6472-4-git-send-email-jordan.l.justen@intel.com Reviewed-by: Jordan Justen Signed-off-by: H. Peter Anvin --- arch/x86/boot/header.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 4ceb56e9a4ce..8bbea6aa40d9 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -189,7 +189,7 @@ extra_header_fields: .quad 0 # SizeOfHeapCommit #endif .long 0 # LoaderFlags - .long 0x1 # NumberOfRvaAndSizes + .long 0x6 # NumberOfRvaAndSizes .quad 0 # ExportTable .quad 0 # ImportTable -- cgit v1.2.3 From 35372a7d45291140a97518a8d1c8cb0e31ee2bb7 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Fri, 30 Mar 2012 01:38:03 +0200 Subject: x86: spinlock.h: Remove REG_PTR_MODE REG_PTR_MODE has no users at all. Signed-off-by: Richard Weinberger Link: http://lkml.kernel.org/r/1333064283-3109-1-git-send-email-richard@nod.at Acked-by: Acked-by: Jan Beulich Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/spinlock.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 76bfa2cf301d..b315a33867f2 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -20,10 +20,8 @@ #ifdef CONFIG_X86_32 # define LOCK_PTR_REG "a" -# define REG_PTR_MODE "k" #else # define LOCK_PTR_REG "D" -# define REG_PTR_MODE "q" #endif #if defined(CONFIG_X86_32) && \ -- cgit v1.2.3 From 3f3aaea29ff7ee2d43b430338427f30ba7f60ff9 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 30 Mar 2012 11:45:01 -0400 Subject: xen/p2m: Move code around to allow for better re-usage. We are going to be using the early_alloc_p2m (and early_alloc_p2m_middle) code in follow up patches which are not related to setting identity pages. Hence lets move the code out in its own function and rename them as appropiate. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 62 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 1b267e75158d..3cc3afeb09a1 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -499,7 +499,7 @@ static bool alloc_p2m(unsigned long pfn) return true; } -static bool __init __early_alloc_p2m(unsigned long pfn) +static bool __init early_alloc_p2m_middle(unsigned long pfn) { unsigned topidx, mididx, idx; @@ -541,6 +541,36 @@ static bool __init __early_alloc_p2m(unsigned long pfn) } return idx != 0; } + +static bool __init early_alloc_p2m(unsigned long pfn) +{ + unsigned topidx = p2m_top_index(pfn); + unsigned long *mid_mfn_p; + unsigned long **mid; + + mid = p2m_top[topidx]; + mid_mfn_p = p2m_top_mfn_p[topidx]; + if (mid == p2m_mid_missing) { + mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + + p2m_mid_init(mid); + + p2m_top[topidx] = mid; + + BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); + } + /* 
And the save/restore P2M tables.. */ + if (mid_mfn_p == p2m_mid_missing_mfn) { + mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(mid_mfn_p); + + p2m_top_mfn_p[topidx] = mid_mfn_p; + p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); + /* Note: we don't set mid_mfn_p[midix] here, + * look in early_alloc_p2m_middle */ + } + return true; +} unsigned long __init set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e) { @@ -559,35 +589,11 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s, pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE)); pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); - unsigned long *mid_mfn_p; - unsigned long **mid; - - mid = p2m_top[topidx]; - mid_mfn_p = p2m_top_mfn_p[topidx]; - if (mid == p2m_mid_missing) { - mid = extend_brk(PAGE_SIZE, PAGE_SIZE); - - p2m_mid_init(mid); - - p2m_top[topidx] = mid; - - BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); - } - /* And the save/restore P2M tables.. */ - if (mid_mfn_p == p2m_mid_missing_mfn) { - mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(mid_mfn_p); - - p2m_top_mfn_p[topidx] = mid_mfn_p; - p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); - /* Note: we don't set mid_mfn_p[midix] here, - * look in __early_alloc_p2m */ - } + WARN_ON(!early_alloc_p2m(pfn)); } - __early_alloc_p2m(pfn_s); - __early_alloc_p2m(pfn_e); + early_alloc_p2m_middle(pfn_s); + early_alloc_p2m_middle(pfn_e); for (pfn = pfn_s; pfn < pfn_e; pfn++) if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn))) -- cgit v1.2.3 From cef4cca551d652b7f69c9d76337c5fae24e069dc Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 30 Mar 2012 14:15:14 -0400 Subject: xen/p2m: Allow alloc_p2m_middle to call reserve_brk depending on argument For identity cases we want to call reserve_brk only on the boundary conditions of the middle P2M (so P2M[x][y][0] = extend_brk). 
This is to work around identify regions (PCI spaces, gaps in E820) which are not aligned on 2MB regions. However for the case were we want to allocate P2M middle leafs at the early bootup stage, irregardless of this alignment check we need some means of doing that. For that we provide the new argument. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 3cc3afeb09a1..8b3a3958d120 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -499,7 +499,7 @@ static bool alloc_p2m(unsigned long pfn) return true; } -static bool __init early_alloc_p2m_middle(unsigned long pfn) +static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary) { unsigned topidx, mididx, idx; @@ -508,7 +508,7 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn) idx = p2m_index(pfn); /* Pfff.. No boundary cross-over, lets get out. */ - if (!idx) + if (!idx && check_boundary) return false; WARN(p2m_top[topidx][mididx] == p2m_identity, @@ -531,7 +531,7 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn) p2m_top[topidx][mididx] = p2m; /* For save/restore we need to MFN of the P2M saved */ - + mid_mfn_p = p2m_top_mfn_p[topidx]; WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", @@ -592,8 +592,8 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s, WARN_ON(!early_alloc_p2m(pfn)); } - early_alloc_p2m_middle(pfn_s); - early_alloc_p2m_middle(pfn_e); + early_alloc_p2m_middle(pfn_s, true); + early_alloc_p2m_middle(pfn_e, true); for (pfn = pfn_s; pfn < pfn_e; pfn++) if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn))) -- cgit v1.2.3 From d5096850b47424fb0f1c6a75b8f7184f7169319a Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 30 Mar 2012 14:16:49 -0400 Subject: xen/p2m: Collapse early_alloc_p2m_middle redundant checks. 
At the start of the function we were checking for idx != 0 and bailing out. And later calling extend_brk if idx != 0. That is unnecessary so remove that checks. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 8b3a3958d120..952edefcedb3 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -502,6 +502,8 @@ static bool alloc_p2m(unsigned long pfn) static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary) { unsigned topidx, mididx, idx; + unsigned long *p2m; + unsigned long *mid_mfn_p; topidx = p2m_top_index(pfn); mididx = p2m_mid_index(pfn); @@ -522,24 +524,21 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary return false; /* Boundary cross-over for the edges: */ - if (idx) { - unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); - unsigned long *mid_mfn_p; + p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_init(p2m); + p2m_init(p2m); - p2m_top[topidx][mididx] = p2m; + p2m_top[topidx][mididx] = p2m; - /* For save/restore we need to MFN of the P2M saved */ + /* For save/restore we need to MFN of the P2M saved */ - mid_mfn_p = p2m_top_mfn_p[topidx]; - WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), - "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", - topidx, mididx); - mid_mfn_p[mididx] = virt_to_mfn(p2m); + mid_mfn_p = p2m_top_mfn_p[topidx]; + WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), + "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", + topidx, mididx); + mid_mfn_p[mididx] = virt_to_mfn(p2m); - } - return idx != 0; + return true; } static bool __init early_alloc_p2m(unsigned long pfn) -- cgit v1.2.3 From 940713bb2ce3033f468a220094a07250a2f69bdd Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 30 Mar 2012 14:33:14 -0400 Subject: xen/p2m: An early bootup variant of set_phys_to_machine During early bootup we can't 
use alloc_page, so to allocate leaf pages in the P2M we need to use extend_brk. For that we are utilizing the early_alloc_p2m and early_alloc_p2m_middle functions to do the job for us. This function follows the same logic as set_phys_to_machine. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/page.h | 1 + arch/x86/xen/p2m.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index c34f96c2f7a0..93971e841dd5 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -44,6 +44,7 @@ extern unsigned long machine_to_phys_nr; extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); +extern bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern unsigned long set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 952edefcedb3..ffd08c414e91 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -570,6 +570,21 @@ static bool __init early_alloc_p2m(unsigned long pfn) } return true; } +bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + if (unlikely(!__set_phys_to_machine(pfn, mfn))) { + if (!early_alloc_p2m(pfn)) + return false; + + if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/)) + return false; + + if (!__set_phys_to_machine(pfn, mfn)) + return false; + } + + return true; +} unsigned long __init set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e) { -- cgit v1.2.3 From 83c529151ab0d4a813e3f6a3e293fff75d468519 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Tue, 28 Feb 2012 05:15:46 +0000 Subject: KVM: x86: expose Intel cpu new features (HLE, RTM) to guest Intel recently release 2 new features, HLE and RTM. 
Refer to http://software.intel.com/file/41417. This patch exposes them to the guest. Signed-off-by: Liu, Jinsong Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/cpuid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 9fed5bedaad6..c2134b881033 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -247,7 +247,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = - F(FSGSBASE) | F(BMI1) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS); + F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | + F(BMI2) | F(ERMS) | F(RTM); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); -- cgit v1.2.3 From 675acb758ab2381c72fe3ceb5c091cbd0879d4dd Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 8 Mar 2012 18:07:56 +0800 Subject: KVM: SVM: count all irq windows exit Also count the exits of fast-path.
Signed-off-by: Jason Wang Acked-by: Joerg Roedel Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e334389e1c75..f3167208562e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3240,6 +3240,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm) svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; mark_dirty(svm->vmcb, VMCB_INTR); + ++svm->vcpu.stat.irq_window_exits; /* * If the user space waits to inject interrupts, exit as soon as * possible @@ -3247,7 +3248,6 @@ static int interrupt_window_interception(struct vcpu_svm *svm) if (!irqchip_in_kernel(svm->vcpu.kvm) && kvm_run->request_interrupt_window && !kvm_cpu_has_interrupt(&svm->vcpu)) { - ++svm->vcpu.stat.irq_window_exits; kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; return 0; } -- cgit v1.2.3 From b6d33834bd4e8bdf4a199812e31b3e36da53c794 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 8 Mar 2012 16:44:24 -0500 Subject: KVM: Factor out kvm_vcpu_kick to arch-generic code The kvm_vcpu_kick function performs roughly the same functionality on most all architectures, so we shouldn't have separate copies. PowerPC keeps a pointer to interchanging waitqueues on the vcpu_arch structure and to accommodate this special need a __KVM_HAVE_ARCH_VCPU_GET_WQ define and accompanying function kvm_arch_vcpu_wq have been defined.
For all other architectures this is a generic inline that just returns &vcpu->wq; Acked-by: Scott Wood Signed-off-by: Christoffer Dall Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm_host.h | 1 + arch/ia64/kvm/kvm-ia64.c | 20 +++++--------------- arch/powerpc/include/asm/kvm_host.h | 6 ++++++ arch/powerpc/kvm/powerpc.c | 21 ++++++--------------- arch/s390/kvm/kvm-s390.c | 8 ++++++++ arch/x86/kvm/x86.c | 16 ++-------------- include/linux/kvm_host.h | 9 +++++++++ virt/kvm/kvm_main.c | 22 ++++++++++++++++++++++ 8 files changed, 59 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index e35b3a84a40b..c4b4bac3d09e 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -365,6 +365,7 @@ struct thash_cb { }; struct kvm_vcpu_stat { + u32 halt_wakeup; }; struct kvm_vcpu_arch { diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index f5104b7c52cd..9d80ff8d9eff 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1872,21 +1872,6 @@ void kvm_arch_hardware_unsetup(void) { } -void kvm_vcpu_kick(struct kvm_vcpu *vcpu) -{ - int me; - int cpu = vcpu->cpu; - - if (waitqueue_active(&vcpu->wq)) - wake_up_interruptible(&vcpu->wq); - - me = get_cpu(); - if (cpu != me && (unsigned) cpu < nr_cpu_ids && cpu_online(cpu)) - if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) - smp_send_reschedule(cpu); - put_cpu(); -} - int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) { return __apic_accept_irq(vcpu, irq->vector); @@ -1956,6 +1941,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) (kvm_highest_pending_irq(vcpu) != -1); } +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) +{ + return (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)); +} + int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { diff --git 
a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 52eb9c1f4fe0..889383735e73 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -498,4 +498,10 @@ struct kvm_vcpu_arch { #define KVM_MMIO_REG_QPR 0x0040 #define KVM_MMIO_REG_FQPR 0x0060 +#define __KVM_HAVE_ARCH_VCPU_GET_WQ 1 +static inline wait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.wqp; +} + #endif /* __POWERPC_KVM_HOST_H__ */ diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 00d7e345b3fe..b5e9046462fd 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -43,6 +43,11 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) v->requests; } +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) +{ + return 1; +} + int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) { int nr = kvmppc_get_gpr(vcpu, 11); @@ -588,21 +593,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) return r; } -void kvm_vcpu_kick(struct kvm_vcpu *vcpu) -{ - int me; - int cpu = vcpu->cpu; - - me = get_cpu(); - if (waitqueue_active(vcpu->arch.wqp)) { - wake_up_interruptible(vcpu->arch.wqp); - vcpu->stat.halt_wakeup++; - } else if (cpu != me && cpu != -1) { - smp_send_reschedule(vcpu->cpu); - } - put_cpu(); -} - int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { if (irq->irq == KVM_INTERRUPT_UNSET) { @@ -611,6 +601,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) } kvmppc_core_queue_external(vcpu, irq); + kvm_vcpu_kick(vcpu); return 0; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 217ce44395a4..d30c8350b949 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -423,6 +423,14 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) return 0; } +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) +{ + /* kvm common code refers to this, but never calls it */ + BUG(); + return 0; +} + + 
static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) { kvm_s390_vcpu_initial_reset(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4044ce0bf7c1..511031dcb9cc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6403,21 +6403,9 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) kvm_cpu_has_interrupt(vcpu)); } -void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { - int me; - int cpu = vcpu->cpu; - - if (waitqueue_active(&vcpu->wq)) { - wake_up_interruptible(&vcpu->wq); - ++vcpu->stat.halt_wakeup; - } - - me = get_cpu(); - if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE) - smp_send_reschedule(cpu); - put_cpu(); + return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; } int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3a2cea616283..5b624e1ff814 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -439,6 +439,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); +void kvm_vcpu_kick(struct kvm_vcpu *vcpu); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); void kvm_resched(struct kvm_vcpu *vcpu); void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); @@ -507,6 +508,7 @@ int kvm_arch_hardware_setup(void); void kvm_arch_hardware_unsetup(void); void kvm_arch_check_processor_compat(void *rtn); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); void kvm_free_physmem(struct kvm *kvm); @@ -522,6 +524,13 @@ static inline void kvm_arch_free_vm(struct kvm *kvm) } #endif +#ifndef __KVM_HAVE_ARCH_VCPU_GET_WQ +static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) +{ + return &vcpu->wq; +} +#endif + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type); void 
kvm_arch_destroy_vm(struct kvm *kvm); void kvm_free_all_assigned_devices(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a9565e240636..7149a2e65524 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1514,6 +1514,28 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) finish_wait(&vcpu->wq, &wait); } +/* + * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. + */ +void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +{ + int me; + int cpu = vcpu->cpu; + wait_queue_head_t *wqp; + + wqp = kvm_arch_vcpu_wq(vcpu); + if (waitqueue_active(wqp)) { + wake_up_interruptible(wqp); + ++vcpu->stat.halt_wakeup; + } + + me = get_cpu(); + if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) + if (kvm_arch_vcpu_should_kick(vcpu)) + smp_send_reschedule(cpu); + put_cpu(); +} + void kvm_resched(struct kvm_vcpu *vcpu) { if (!need_resched()) -- cgit v1.2.3 From eae3ee7d8a7c59cf63441dedf28674889f5fc477 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Sat, 10 Mar 2012 14:37:25 -0500 Subject: x86: pvclock: Add flag to indicate that a vm was stopped by the host This flag will be used to check if the vm was stopped by the host when a soft lockup was detected. The host will set the flag when it stops the guest. On resume, the guest will check this flag if a soft lockup is detected and skip issuing the warning. 
Signed-off-by: Eric B Munson Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/pvclock-abi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h index 35f2d1948ada..6167fd798188 100644 --- a/arch/x86/include/asm/pvclock-abi.h +++ b/arch/x86/include/asm/pvclock-abi.h @@ -40,5 +40,6 @@ struct pvclock_wall_clock { } __attribute__((__packed__)); #define PVCLOCK_TSC_STABLE_BIT (1 << 0) +#define PVCLOCK_GUEST_STOPPED (1 << 1) #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PVCLOCK_ABI_H */ -- cgit v1.2.3 From 3b5d56b9317fa7b5407dff1aa7b115bf6cdbd494 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Sat, 10 Mar 2012 14:37:26 -0500 Subject: kvmclock: Add functions to check if the host has stopped the vm When a host stops or suspends a VM it will set a flag to show this. The watchdog will use these functions to determine if a softlockup is real, or the result of a suspended VM. 
Signed-off-by: Eric B Munson asm-generic changes Acked-by: Arnd Bergmann Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/alpha/include/asm/kvm_para.h | 1 + arch/arm/include/asm/kvm_para.h | 1 + arch/avr32/include/asm/kvm_para.h | 1 + arch/blackfin/include/asm/kvm_para.h | 1 + arch/c6x/include/asm/kvm_para.h | 1 + arch/frv/include/asm/kvm_para.h | 1 + arch/h8300/include/asm/kvm_para.h | 1 + arch/hexagon/include/asm/kvm_para.h | 1 + arch/ia64/include/asm/kvm_para.h | 5 +++++ arch/m68k/include/asm/kvm_para.h | 1 + arch/microblaze/include/asm/kvm_para.h | 1 + arch/mips/include/asm/kvm_para.h | 1 + arch/mn10300/include/asm/kvm_para.h | 1 + arch/openrisc/include/asm/kvm_para.h | 1 + arch/parisc/include/asm/kvm_para.h | 1 + arch/powerpc/include/asm/kvm_para.h | 5 +++++ arch/s390/include/asm/kvm_para.h | 5 +++++ arch/score/include/asm/kvm_para.h | 1 + arch/sh/include/asm/kvm_para.h | 1 + arch/sparc/include/asm/kvm_para.h | 1 + arch/tile/include/asm/kvm_para.h | 1 + arch/um/include/asm/kvm_para.h | 1 + arch/unicore32/include/asm/kvm_para.h | 1 + arch/x86/include/asm/kvm_para.h | 8 ++++++++ arch/x86/kernel/kvmclock.c | 21 +++++++++++++++++++++ arch/xtensa/include/asm/kvm_para.h | 1 + include/asm-generic/kvm_para.h | 14 ++++++++++++++ 27 files changed, 79 insertions(+) create mode 100644 arch/alpha/include/asm/kvm_para.h create mode 100644 arch/arm/include/asm/kvm_para.h create mode 100644 arch/avr32/include/asm/kvm_para.h create mode 100644 arch/blackfin/include/asm/kvm_para.h create mode 100644 arch/c6x/include/asm/kvm_para.h create mode 100644 arch/frv/include/asm/kvm_para.h create mode 100644 arch/h8300/include/asm/kvm_para.h create mode 100644 arch/hexagon/include/asm/kvm_para.h create mode 100644 arch/m68k/include/asm/kvm_para.h create mode 100644 arch/microblaze/include/asm/kvm_para.h create mode 100644 arch/mips/include/asm/kvm_para.h create mode 100644 arch/mn10300/include/asm/kvm_para.h create mode 100644 arch/openrisc/include/asm/kvm_para.h create 
mode 100644 arch/parisc/include/asm/kvm_para.h create mode 100644 arch/score/include/asm/kvm_para.h create mode 100644 arch/sh/include/asm/kvm_para.h create mode 100644 arch/sparc/include/asm/kvm_para.h create mode 100644 arch/tile/include/asm/kvm_para.h create mode 100644 arch/um/include/asm/kvm_para.h create mode 100644 arch/unicore32/include/asm/kvm_para.h create mode 100644 arch/xtensa/include/asm/kvm_para.h create mode 100644 include/asm-generic/kvm_para.h (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/kvm_para.h b/arch/alpha/include/asm/kvm_para.h new file mode 100644 index 000000000000..14fab8f0b957 --- /dev/null +++ b/arch/alpha/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include diff --git a/arch/arm/include/asm/kvm_para.h b/arch/arm/include/asm/kvm_para.h new file mode 100644 index 000000000000..14fab8f0b957 --- /dev/null +++ b/arch/arm/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include diff --git a/arch/avr32/include/asm/kvm_para.h b/arch/avr32/include/asm/kvm_para.h new file mode 100644 index 000000000000..14fab8f0b957 --- /dev/null +++ b/arch/avr32/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include diff --git a/arch/blackfin/include/asm/kvm_para.h b/arch/blackfin/include/asm/kvm_para.h new file mode 100644 index 000000000000..14fab8f0b957 --- /dev/null +++ b/arch/blackfin/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include diff --git a/arch/c6x/include/asm/kvm_para.h b/arch/c6x/include/asm/kvm_para.h new file mode 100644 index 000000000000..14fab8f0b957 --- /dev/null +++ b/arch/c6x/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include diff --git a/arch/frv/include/asm/kvm_para.h b/arch/frv/include/asm/kvm_para.h new file mode 100644 index 000000000000..14fab8f0b957 --- /dev/null +++ b/arch/frv/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include diff --git a/arch/h8300/include/asm/kvm_para.h b/arch/h8300/include/asm/kvm_para.h new file mode 100644 index 000000000000..14fab8f0b957 --- /dev/null +++ b/arch/h8300/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include diff --git 
a/arch/hexagon/include/asm/kvm_para.h b/arch/hexagon/include/asm/kvm_para.h new file mode 100644 index 000000000000..14fab8f0b957 --- /dev/null +++ b/arch/hexagon/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include diff --git a/arch/ia64/include/asm/kvm_para.h b/arch/ia64/include/asm/kvm_para.h index 1588aee781a2..2019cb99335e 100644 --- a/arch/ia64/include/asm/kvm_para.h +++ b/arch/ia64/include/asm/kvm_para.h @@ -26,6 +26,11 @@ static inline unsigned int kvm_arch_para_features(void) return 0; } +static inline bool kvm_check_and_clear_guest_paused(void) +{ + return false; +} + #endif #endif diff --git a/arch/m68k/include/asm/kvm_para.h b/arch/m68k/include/asm/kvm_para.h new file mode 100644 index 000000000000..14fab8f0b957 --- /dev/null +++ b/arch/m68k/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include diff --git a/arch/microblaze/include/asm/kvm_para.h b/arch/microblaze/include/asm/kvm_para.h new file mode 100644 index 000000000000..14fab8f0b957 --- /dev/null +++ b/arch/microblaze/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include diff --git a/arch/mips/include/asm/kvm_para.h b/arch/mips/include/asm/kvm_para.h new file mode 100644 index 000000000000..14fab8f0b957 --- /dev/null +++ b/arch/mips/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include diff --git a/arch/mn10300/include/asm/kvm_para.h b/arch/mn10300/include/asm/kvm_para.h new file mode 100644 index 000000000000..14fab8f0b957 --- /dev/null +++ b/arch/mn10300/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include diff --git a/arch/openrisc/include/asm/kvm_para.h b/arch/openrisc/include/asm/kvm_para.h new file mode 100644 index 000000000000..14fab8f0b957 --- /dev/null +++ b/arch/openrisc/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include diff --git a/arch/parisc/include/asm/kvm_para.h b/arch/parisc/include/asm/kvm_para.h new file mode 100644 index 000000000000..14fab8f0b957 --- /dev/null +++ b/arch/parisc/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h 
index 7b754e743003..c18916bff689 100644 --- a/arch/powerpc/include/asm/kvm_para.h +++ b/arch/powerpc/include/asm/kvm_para.h @@ -206,6 +206,11 @@ static inline unsigned int kvm_arch_para_features(void) return r; } +static inline bool kvm_check_and_clear_guest_paused(void) +{ + return false; +} + #endif /* __KERNEL__ */ #endif /* __POWERPC_KVM_PARA_H__ */ diff --git a/arch/s390/include/asm/kvm_para.h b/arch/s390/include/asm/kvm_para.h index 6964db226f83..a98832961035 100644 --- a/arch/s390/include/asm/kvm_para.h +++ b/arch/s390/include/asm/kvm_para.h @@ -149,6 +149,11 @@ static inline unsigned int kvm_arch_para_features(void) return 0; } +static inline bool kvm_check_and_clear_guest_paused(void) +{ + return false; +} + #endif #endif /* __S390_KVM_PARA_H */ diff --git a/arch/score/include/asm/kvm_para.h b/arch/score/include/asm/kvm_para.h new file mode 100644 index 000000000000..14fab8f0b957 --- /dev/null +++ b/arch/score/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include diff --git a/arch/sh/include/asm/kvm_para.h b/arch/sh/include/asm/kvm_para.h new file mode 100644 index 000000000000..14fab8f0b957 --- /dev/null +++ b/arch/sh/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include diff --git a/arch/sparc/include/asm/kvm_para.h b/arch/sparc/include/asm/kvm_para.h new file mode 100644 index 000000000000..14fab8f0b957 --- /dev/null +++ b/arch/sparc/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include diff --git a/arch/tile/include/asm/kvm_para.h b/arch/tile/include/asm/kvm_para.h new file mode 100644 index 000000000000..14fab8f0b957 --- /dev/null +++ b/arch/tile/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include diff --git a/arch/um/include/asm/kvm_para.h b/arch/um/include/asm/kvm_para.h new file mode 100644 index 000000000000..14fab8f0b957 --- /dev/null +++ b/arch/um/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include diff --git a/arch/unicore32/include/asm/kvm_para.h b/arch/unicore32/include/asm/kvm_para.h new file mode 100644 index 000000000000..14fab8f0b957 --- /dev/null +++ 
b/arch/unicore32/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 734c3767cfac..99c4bbe0cca2 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -95,6 +95,14 @@ struct kvm_vcpu_pv_apf_data { extern void kvmclock_init(void); extern int kvm_register_clock(char *txt); +#ifdef CONFIG_KVM_CLOCK +bool kvm_check_and_clear_guest_paused(void); +#else +static inline bool kvm_check_and_clear_guest_paused(void) +{ + return false; +} +#endif /* CONFIG_KVMCLOCK */ /* This instruction is vmcall. On non-VT architectures, it will generate a * trap that we will then rewrite to the appropriate instruction. diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index f8492da65bfc..4ba090ca689d 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -114,6 +115,26 @@ static void kvm_get_preset_lpj(void) preset_lpj = lpj; } +bool kvm_check_and_clear_guest_paused(void) +{ + bool ret = false; + struct pvclock_vcpu_time_info *src; + + /* + * per_cpu() is safe here because this function is only called from + * timer functions where preemption is already disabled. 
+ */ + WARN_ON(!in_atomic()); + src = &__get_cpu_var(hv_clock); + if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { + __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED); + ret = true; + } + + return ret; +} +EXPORT_SYMBOL_GPL(kvm_check_and_clear_guest_paused); + static struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_get_cycles, diff --git a/arch/xtensa/include/asm/kvm_para.h b/arch/xtensa/include/asm/kvm_para.h new file mode 100644 index 000000000000..14fab8f0b957 --- /dev/null +++ b/arch/xtensa/include/asm/kvm_para.h @@ -0,0 +1 @@ +#include diff --git a/include/asm-generic/kvm_para.h b/include/asm-generic/kvm_para.h new file mode 100644 index 000000000000..05ef7e705939 --- /dev/null +++ b/include/asm-generic/kvm_para.h @@ -0,0 +1,14 @@ +#ifndef _ASM_GENERIC_KVM_PARA_H +#define _ASM_GENERIC_KVM_PARA_H + + +/* + * This function is used by architectures that support kvm to avoid issuing + * false soft lockup messages. + */ +static inline bool kvm_check_and_clear_guest_paused(void) +{ + return false; +} + +#endif -- cgit v1.2.3 From 1c0b28c2a46d98cd258d96b8c222144b22876c46 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Sat, 10 Mar 2012 14:37:27 -0500 Subject: KVM: x86: Add ioctl for KVM_KVMCLOCK_CTRL Now that we have a flag that will tell the guest it was suspended, create an interface for that communication using a KVM ioctl. Signed-off-by: Eric B Munson Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 20 ++++++++++++++++++++ Documentation/virtual/kvm/msr.txt | 4 ++++ arch/x86/kvm/x86.c | 22 ++++++++++++++++++++++ include/linux/kvm.h | 3 +++ 4 files changed, 49 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 6386f8c0482e..81ff39f6248d 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1669,6 +1669,26 @@ at the memory location pointed to by "addr". 
The list of registers accessible using this interface is identical to the list in 4.64. +4.70 KVM_KVMCLOCK_CTRL + +Capability: KVM_CAP_KVMCLOCK_CTRL +Architectures: Any that implement pvclocks (currently x86 only) +Type: vcpu ioctl +Parameters: None +Returns: 0 on success, -1 on error + +This signals to the host kernel that the specified guest is being paused by +userspace. The host will set a flag in the pvclock structure that is checked +from the soft lockup watchdog. The flag is part of the pvclock structure that +is shared between guest and host, specifically the second bit of the flags +field of the pvclock_vcpu_time_info structure. It will be set exclusively by +the host and read/cleared exclusively by the guest. The guest operation of +checking and clearing the flag must be an atomic operation so +load-link/store-conditional, or equivalent must be used. There are two cases +where the guest will clear the flag: when the soft lockup watchdog timer resets +itself or when a soft lockup is detected. This ioctl can be called any time +after pausing the vcpu, but before it is resumed. + 5.
The kvm_run structure Application code obtains a pointer to the kvm_run structure by diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt index 50317809113d..96b41bd97523 100644 --- a/Documentation/virtual/kvm/msr.txt +++ b/Documentation/virtual/kvm/msr.txt @@ -108,6 +108,10 @@ MSR_KVM_SYSTEM_TIME_NEW: 0x4b564d01 | | time measures taken across 0 | 24 | multiple cpus are guaranteed to | | be monotonic + ------------------------------------------------------------- + | | guest vcpu has been paused by + 1 | N/A | the host + | | See 4.70 in api.txt ------------------------------------------------------------- Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 511031dcb9cc..99b738028fc0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2147,6 +2147,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_ASYNC_PF: case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_PCI_2_3: + case KVM_CAP_KVMCLOCK_CTRL: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -2597,6 +2598,23 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, return r; } +/* + * kvm_set_guest_paused() indicates to the guest kernel that it has been + * stopped by the hypervisor. This function will be called from the host only. + * EINVAL is returned when the host attempts to set the flag for a guest that + * does not support pv clocks. 
+ */ +static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) +{ + struct pvclock_vcpu_time_info *src = &vcpu->arch.hv_clock; + if (!vcpu->arch.time_page) + return -EINVAL; + src->flags |= PVCLOCK_GUEST_STOPPED; + mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + return 0; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2873,6 +2891,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = vcpu->arch.virtual_tsc_khz; goto out; } + case KVM_KVMCLOCK_CTRL: { + r = kvm_set_guest_paused(vcpu); + goto out; + } default: r = -EINVAL; } diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 6c322a90b92f..7a9dd4b3dede 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -589,6 +589,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_S390_UCONTROL 73 #define KVM_CAP_SYNC_REGS 74 #define KVM_CAP_PCI_2_3 75 +#define KVM_CAP_KVMCLOCK_CTRL 76 #ifdef KVM_CAP_IRQ_ROUTING @@ -859,6 +860,8 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_ONE_REG */ #define KVM_GET_ONE_REG _IOW(KVMIO, 0xab, struct kvm_one_reg) #define KVM_SET_ONE_REG _IOW(KVMIO, 0xac, struct kvm_one_reg) +/* VM is being stopped by host */ +#define KVM_KVMCLOCK_CTRL _IO(KVMIO, 0xad) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) -- cgit v1.2.3 From 248997095d652576f1213028a95ca5fff85d089f Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 15 Mar 2012 18:16:49 -0400 Subject: kvmclock: remove unneeded EXPORT macro check_and_clear_guest_paused does not need to be exported as it isn't used by any modules, remove the export. 
Signed-off-by: Eric B Munson Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kernel/kvmclock.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 4ba090ca689d..086eb58c6e80 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -133,7 +133,6 @@ bool kvm_check_and_clear_guest_paused(void) return ret; } -EXPORT_SYMBOL_GPL(kvm_check_and_clear_guest_paused); static struct clocksource kvm_clock = { .name = "kvm-clock", -- cgit v1.2.3 From a0ed46073c14f66dbf0707aaa7588b78da83d7c6 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Thu, 1 Mar 2012 19:31:22 +0900 Subject: KVM: MMU: Split the main body of rmap_write_protect() off from others We will use this in the following patch to implement another function which needs to write protect pages using the rmap information. Note that there is a small change in debug printing for large pages: we do not differentiate them from others to avoid duplicating code. 
Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 53 +++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4cb164268846..c8b5694d1a48 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1010,42 +1010,43 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } -int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, - struct kvm_memory_slot *slot) +static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) { - unsigned long *rmapp; - u64 *spte; - int i, write_protected = 0; + u64 *spte = NULL; + int write_protected = 0; - rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot); - spte = rmap_next(rmapp, NULL); - while (spte) { + while ((spte = rmap_next(rmapp, spte))) { BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); - if (is_writable_pte(*spte)) { + + if (!is_writable_pte(*spte)) + continue; + + if (level == PT_PAGE_TABLE_LEVEL) { mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK); - write_protected = 1; + } else { + BUG_ON(!is_large_pte(*spte)); + drop_spte(kvm, spte); + --kvm->stat.lpages; + spte = NULL; } - spte = rmap_next(rmapp, spte); + + write_protected = 1; } - /* check for huge page mappings */ - for (i = PT_DIRECTORY_LEVEL; + return write_protected; +} + +int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, + struct kvm_memory_slot *slot) +{ + unsigned long *rmapp; + int i, write_protected = 0; + + for (i = PT_PAGE_TABLE_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { rmapp = __gfn_to_rmap(gfn, i, slot); - spte = rmap_next(rmapp, NULL); - while (spte) { - BUG_ON(!(*spte & PT_PRESENT_MASK)); - BUG_ON(!is_large_pte(*spte)); - pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); - if (is_writable_pte(*spte)) { - drop_spte(kvm, spte); - 
--kvm->stat.lpages; - spte = NULL; - write_protected = 1; - } - spte = rmap_next(rmapp, spte); - } + write_protected |= __rmap_write_protect(kvm, rmapp, i); } return write_protected; -- cgit v1.2.3 From 5dc99b2380d59b8aeafa98791f92b96400ed3187 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Thu, 1 Mar 2012 19:32:16 +0900 Subject: KVM: Avoid checking huge page mappings in get_dirty_log() Dropped such mappings when we enabled dirty logging and we will never create new ones until we stop the logging. For this we introduce a new function which can be used to write protect a range of PT level pages: although we do not need to care about a range of pages at this point, the following patch will need this feature to optimize the write protection of many pages. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 5 +++-- arch/x86/kvm/mmu.c | 40 ++++++++++++++++++++++++++++++---------- arch/x86/kvm/x86.c | 8 +++----- 3 files changed, 36 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e216ba066e79..f624ca72ea24 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -712,8 +712,9 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); -int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, - struct kvm_memory_slot *slot); +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c8b5694d1a48..dc5f2459db6c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1037,27 +1037,47 @@ 
static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level return write_protected; } -int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, - struct kvm_memory_slot *slot) +/** + * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages + * @kvm: kvm instance + * @slot: slot to protect + * @gfn_offset: start of the BITS_PER_LONG pages we care about + * @mask: indicates which pages we should protect + * + * Used when we do not need to care about huge page mappings: e.g. during dirty + * logging we do not have any such mappings. + */ +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask) { unsigned long *rmapp; - int i, write_protected = 0; - for (i = PT_PAGE_TABLE_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - rmapp = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmapp, i); - } + while (mask) { + rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; + __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL); - return write_protected; + /* clear the first set bit */ + mask &= mask - 1; + } } static int rmap_write_protect(struct kvm *kvm, u64 gfn) { struct kvm_memory_slot *slot; + unsigned long *rmapp; + int i; + int write_protected = 0; slot = gfn_to_memslot(kvm, gfn); - return kvm_mmu_rmap_write_protect(kvm, gfn, slot); + + for (i = PT_PAGE_TABLE_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + rmapp = __gfn_to_rmap(gfn, i, slot); + write_protected |= __rmap_write_protect(kvm, rmapp, i); + } + + return write_protected; } static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 99b738028fc0..813ebf1e55a0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3095,13 +3095,11 @@ static void write_protect_slot(struct kvm *kvm, /* Not many dirty pages compared to # of shadow pages. 
*/ if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { - unsigned long gfn_offset; + gfn_t offset; - for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) { - unsigned long gfn = memslot->base_gfn + gfn_offset; + for_each_set_bit(offset, dirty_bitmap, memslot->npages) + kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, 1); - kvm_mmu_rmap_write_protect(kvm, gfn, memslot); - } kvm_flush_remote_tlbs(kvm); } else kvm_mmu_slot_remove_write_access(kvm, memslot->id); -- cgit v1.2.3 From 60c34612b70711fb14a8dcbc6a79509902450d2e Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sat, 3 Mar 2012 14:21:48 +0900 Subject: KVM: Switch to srcu-less get_dirty_log() We have seen some problems of the current implementation of get_dirty_log() which uses synchronize_srcu_expedited() for updating dirty bitmaps; e.g. it is noticeable that this sometimes gives us ms order of latency when we use VGA displays. Furthermore the recent discussion on the following thread "srcu: Implement call_srcu()" http://lkml.org/lkml/2012/1/31/211 also motivated us to implement get_dirty_log() without SRCU. This patch achieves this goal without sacrificing the performance of both VGA and live migration: in practice the new code is much faster than the old one unless we have too many dirty pages. Implementation: The key part of the implementation is the use of xchg() operation for clearing dirty bits atomically. Since this allows us to update only BITS_PER_LONG pages at once, we need to iterate over the dirty bitmap until every dirty bit is cleared again for the next call. Although some people may worry about the problem of using the atomic memory instruction many times to the concurrently accessible bitmap, it is usually accessed with mmu_lock held and we rarely see concurrent accesses: so what we need to care about is the pure xchg() overheads. Another point to note is that we do not use for_each_set_bit() to check which ones in each BITS_PER_LONG pages are actually dirty. 
Instead we simply use __ffs() in a loop. This is much faster than repeatedly calling find_next_bit(). Performance: The dirty-log-perf unit test showed nice improvements, some times faster than before, except for some extreme cases; for such cases the speed of getting dirty page information is much faster than we process it in the userspace. For real workloads, both VGA and live migration, we have observed pure improvements: when the guest was reading a file during live migration, we originally saw a few ms of latency, but with the new method the latency was less than 200us. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 116 ++++++++++++++++++++--------------------------------- 1 file changed, 43 insertions(+), 73 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 813ebf1e55a0..0d9a57875f0b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3067,55 +3067,32 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, } /** - * write_protect_slot - write protect a slot for dirty logging - * @kvm: the kvm instance - * @memslot: the slot we protect - * @dirty_bitmap: the bitmap indicating which pages are dirty - * @nr_dirty_pages: the number of dirty pages + * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot + * @kvm: kvm instance + * @log: slot id and address to which we copy the log * - * We have two ways to find all sptes to protect: - * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and - * checks ones that have a spte mapping a page in the slot. - * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap. + * We need to keep it in mind that VCPU threads can write to the bitmap + * concurrently. So, to avoid losing data, we keep the following order for + * each bit: * - * Generally speaking, if there are not so many dirty pages compared to the - * number of shadow pages, we should use the latter. + * 1.
Take a snapshot of the bit and clear it if needed. + * 2. Write protect the corresponding page. + * 3. Flush TLB's if needed. + * 4. Copy the snapshot to the userspace. * - * Note that letting others write into a page marked dirty in the old bitmap - * by using the remaining tlb entry is not a problem. That page will become - * write protected again when we flush the tlb and then be reported dirty to - * the user space by copying the old bitmap. + * Between 2 and 3, the guest may write to the page using the remaining TLB + * entry. This is not a problem because the page will be reported dirty at + * step 4 using the snapshot taken before and step 3 ensures that successive + * writes will be logged for the next call. */ -static void write_protect_slot(struct kvm *kvm, - struct kvm_memory_slot *memslot, - unsigned long *dirty_bitmap, - unsigned long nr_dirty_pages) -{ - spin_lock(&kvm->mmu_lock); - - /* Not many dirty pages compared to # of shadow pages. */ - if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { - gfn_t offset; - - for_each_set_bit(offset, dirty_bitmap, memslot->npages) - kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, 1); - - kvm_flush_remote_tlbs(kvm); - } else - kvm_mmu_slot_remove_write_access(kvm, memslot->id); - - spin_unlock(&kvm->mmu_lock); -} - -/* - * Get (and clear) the dirty memory log for a memory slot. 
- */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log) +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { int r; struct kvm_memory_slot *memslot; - unsigned long n, nr_dirty_pages; + unsigned long n, i; + unsigned long *dirty_bitmap; + unsigned long *dirty_bitmap_buffer; + bool is_dirty = false; mutex_lock(&kvm->slots_lock); @@ -3124,49 +3101,42 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, goto out; memslot = id_to_memslot(kvm->memslots, log->slot); + + dirty_bitmap = memslot->dirty_bitmap; r = -ENOENT; - if (!memslot->dirty_bitmap) + if (!dirty_bitmap) goto out; n = kvm_dirty_bitmap_bytes(memslot); - nr_dirty_pages = memslot->nr_dirty_pages; - /* If nothing is dirty, don't bother messing with page tables. */ - if (nr_dirty_pages) { - struct kvm_memslots *slots, *old_slots; - unsigned long *dirty_bitmap, *dirty_bitmap_head; + dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); + memset(dirty_bitmap_buffer, 0, n); - dirty_bitmap = memslot->dirty_bitmap; - dirty_bitmap_head = memslot->dirty_bitmap_head; - if (dirty_bitmap == dirty_bitmap_head) - dirty_bitmap_head += n / sizeof(long); - memset(dirty_bitmap_head, 0, n); + spin_lock(&kvm->mmu_lock); - r = -ENOMEM; - slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL); - if (!slots) - goto out; + for (i = 0; i < n / sizeof(long); i++) { + unsigned long mask; + gfn_t offset; - memslot = id_to_memslot(slots, log->slot); - memslot->nr_dirty_pages = 0; - memslot->dirty_bitmap = dirty_bitmap_head; - update_memslots(slots, NULL); + if (!dirty_bitmap[i]) + continue; - old_slots = kvm->memslots; - rcu_assign_pointer(kvm->memslots, slots); - synchronize_srcu_expedited(&kvm->srcu); - kfree(old_slots); + is_dirty = true; - write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages); + mask = xchg(&dirty_bitmap[i], 0); + dirty_bitmap_buffer[i] = mask; - r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) - goto out; - } else { - r = 
-EFAULT; - if (clear_user(log->dirty_bitmap, n)) - goto out; + offset = i * BITS_PER_LONG; + kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask); } + if (is_dirty) + kvm_flush_remote_tlbs(kvm); + + spin_unlock(&kvm->mmu_lock); + + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) + goto out; r = 0; out: -- cgit v1.2.3 From e9bda3b3d0ce775afe15eaf71922d342cc74991c Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Tue, 20 Mar 2012 23:33:51 -0700 Subject: KVM: VMX: Auto-load on CPUs with VMX Enable x86 feature-based autoloading for the kvm-intel module on CPUs with X86_FEATURE_VMX. Signed-off-by: Josh Triplett Acked-By: Kay Sievers Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ad85adfef843..52f685635766 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,12 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); +static const struct x86_cpu_id vmx_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_VMX), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); + static bool __read_mostly enable_vpid = 1; module_param_named(vpid, enable_vpid, bool, 0444); -- cgit v1.2.3 From c36fc04ef558c95cff46a8c89d2f804f217335f5 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 8 Mar 2012 12:45:54 +0100 Subject: KVM: x86: add paging gcc optimization Since most guests will have paging enabled for memory management, add likely() optimization around CR0.PG checks. 
Signed-off-by: Davidlohr Bueso Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index cb80c293cdd8..3d1134ddb885 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -64,7 +64,7 @@ static inline int is_pse(struct kvm_vcpu *vcpu) static inline int is_paging(struct kvm_vcpu *vcpu) { - return kvm_read_cr0_bits(vcpu, X86_CR0_PG); + return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG)); } static inline u32 bit(int bitno) -- cgit v1.2.3 From 220f773a0013bf6fe2eefd9718ac7471f368fd8e Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 21 Mar 2012 23:49:39 +0900 Subject: KVM: MMU: Make pte_list_desc fit cache lines well We have PTE_LIST_EXT + 1 pointers in this structure and these 40/20 bytes do not fit cache lines well. Furthermore, some allocators may use 64/32-byte objects for the pte_list_desc cache. This patch solves this problem by changing PTE_LIST_EXT from 4 to 3. For shadow paging, the new size is still large enough to hold both the kernel and process mappings for usual anonymous pages. For file mappings, there may be a slight change in the cache usage. Note: with EPT/NPT we almost always have a single spte in each reverse mapping and we will not see any change by this. 
Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dc5f2459db6c..3213348e3a93 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -135,8 +135,6 @@ module_param(dbg, bool, 0644); #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | PT64_NX_MASK) -#define PTE_LIST_EXT 4 - #define ACC_EXEC_MASK 1 #define ACC_WRITE_MASK PT_WRITABLE_MASK #define ACC_USER_MASK PT_USER_MASK @@ -151,6 +149,9 @@ module_param(dbg, bool, 0644); #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) +/* make pte_list_desc fit well in cache line */ +#define PTE_LIST_EXT 3 + struct pte_list_desc { u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more; -- cgit v1.2.3 From 1e3f42f03c38c29c1814199a6f0a2f01b919ea3f Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 21 Mar 2012 23:50:34 +0900 Subject: KVM: MMU: Improve iteration through sptes from rmap Iteration using rmap_next(), whose actual body is pte_list_next(), is inefficient: every time we call it we start from checking whether rmap holds a single spte or points to a descriptor which links more sptes. In the case of shadow paging, this quadratic total iteration cost is a problem. Even for two dimensional paging, with EPT/NPT on, in which we almost always have a single mapping, the extra checks at the end of the iteration should be eliminated. This patch fixes this by introducing rmap_iterator which keeps the iteration context for the next search. Furthermore the implementation of rmap_next() is split into two functions, rmap_get_first() and rmap_get_next(), to avoid repeatedly checking whether the rmap being iterated on has only one spte. Although there seemed to be only a slight change for EPT/NPT, the actual improvement was significant: we observed that GET_DIRTY_LOG for 1GB dirty memory became 15% faster than before.
This is probably because the new code is easy to make branch predictions. Note: we just remove pte_list_next() because we can think of parent_ptes as a reverse mapping. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 196 ++++++++++++++++++++++++++++------------------- arch/x86/kvm/mmu_audit.c | 10 +-- 2 files changed, 124 insertions(+), 82 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3213348e3a93..29ad6f9c58a5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -842,32 +842,6 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, return count; } -static u64 *pte_list_next(unsigned long *pte_list, u64 *spte) -{ - struct pte_list_desc *desc; - u64 *prev_spte; - int i; - - if (!*pte_list) - return NULL; - else if (!(*pte_list & 1)) { - if (!spte) - return (u64 *)*pte_list; - return NULL; - } - desc = (struct pte_list_desc *)(*pte_list & ~1ul); - prev_spte = NULL; - while (desc) { - for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) { - if (prev_spte == spte) - return desc->sptes[i]; - prev_spte = desc->sptes[i]; - } - desc = desc->more; - } - return NULL; -} - static void pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc, int i, struct pte_list_desc *prev_desc) @@ -988,11 +962,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) return pte_list_add(vcpu, spte, rmapp); } -static u64 *rmap_next(unsigned long *rmapp, u64 *spte) -{ - return pte_list_next(rmapp, spte); -} - static void rmap_remove(struct kvm *kvm, u64 *spte) { struct kvm_mmu_page *sp; @@ -1005,6 +974,67 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) pte_list_remove(spte, rmapp); } +/* + * Used by the following functions to iterate through the sptes linked by a + * rmap. All fields are private and not assumed to be used outside. 
+ */ +struct rmap_iterator { + /* private fields */ + struct pte_list_desc *desc; /* holds the sptep if not NULL */ + int pos; /* index of the sptep */ +}; + +/* + * Iteration must be started by this function. This should also be used after + * removing/dropping sptes from the rmap link because in such cases the + * information in the itererator may not be valid. + * + * Returns sptep if found, NULL otherwise. + */ +static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter) +{ + if (!rmap) + return NULL; + + if (!(rmap & 1)) { + iter->desc = NULL; + return (u64 *)rmap; + } + + iter->desc = (struct pte_list_desc *)(rmap & ~1ul); + iter->pos = 0; + return iter->desc->sptes[iter->pos]; +} + +/* + * Must be used with a valid iterator: e.g. after rmap_get_first(). + * + * Returns sptep if found, NULL otherwise. + */ +static u64 *rmap_get_next(struct rmap_iterator *iter) +{ + if (iter->desc) { + if (iter->pos < PTE_LIST_EXT - 1) { + u64 *sptep; + + ++iter->pos; + sptep = iter->desc->sptes[iter->pos]; + if (sptep) + return sptep; + } + + iter->desc = iter->desc->more; + + if (iter->desc) { + iter->pos = 0; + /* desc->sptes[0] cannot be NULL */ + return iter->desc->sptes[iter->pos]; + } + } + + return NULL; +} + static void drop_spte(struct kvm *kvm, u64 *sptep) { if (mmu_spte_clear_track_bits(sptep)) @@ -1013,23 +1043,27 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) { - u64 *spte = NULL; + u64 *sptep; + struct rmap_iterator iter; int write_protected = 0; - while ((spte = rmap_next(rmapp, spte))) { - BUG_ON(!(*spte & PT_PRESENT_MASK)); - rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); + for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); - if (!is_writable_pte(*spte)) + if (!is_writable_pte(*sptep)) { + sptep = rmap_get_next(&iter); continue; + } 
if (level == PT_PAGE_TABLE_LEVEL) { - mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK); + mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK); + sptep = rmap_get_next(&iter); } else { - BUG_ON(!is_large_pte(*spte)); - drop_spte(kvm, spte); + BUG_ON(!is_large_pte(*sptep)); + drop_spte(kvm, sptep); --kvm->stat.lpages; - spte = NULL; + sptep = rmap_get_first(*rmapp, &iter); } write_protected = 1; @@ -1084,48 +1118,57 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { - u64 *spte; + u64 *sptep; + struct rmap_iterator iter; int need_tlb_flush = 0; - while ((spte = rmap_next(rmapp, NULL))) { - BUG_ON(!(*spte & PT_PRESENT_MASK)); - rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); - drop_spte(kvm, spte); + while ((sptep = rmap_get_first(*rmapp, &iter))) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep); + + drop_spte(kvm, sptep); need_tlb_flush = 1; } + return need_tlb_flush; } static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { + u64 *sptep; + struct rmap_iterator iter; int need_flush = 0; - u64 *spte, new_spte; + u64 new_spte; pte_t *ptep = (pte_t *)data; pfn_t new_pfn; WARN_ON(pte_huge(*ptep)); new_pfn = pte_pfn(*ptep); - spte = rmap_next(rmapp, NULL); - while (spte) { - BUG_ON(!is_shadow_present_pte(*spte)); - rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); + + for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { + BUG_ON(!is_shadow_present_pte(*sptep)); + rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep); + need_flush = 1; + if (pte_write(*ptep)) { - drop_spte(kvm, spte); - spte = rmap_next(rmapp, NULL); + drop_spte(kvm, sptep); + sptep = rmap_get_first(*rmapp, &iter); } else { - new_spte = *spte &~ (PT64_BASE_ADDR_MASK); + new_spte = *sptep & ~PT64_BASE_ADDR_MASK; new_spte |= (u64)new_pfn << PAGE_SHIFT; new_spte &= ~PT_WRITABLE_MASK; 
new_spte &= ~SPTE_HOST_WRITEABLE; new_spte &= ~shadow_accessed_mask; - mmu_spte_clear_track_bits(spte); - mmu_spte_set(spte, new_spte); - spte = rmap_next(rmapp, spte); + + mmu_spte_clear_track_bits(sptep); + mmu_spte_set(sptep, new_spte); + sptep = rmap_get_next(&iter); } } + if (need_flush) kvm_flush_remote_tlbs(kvm); @@ -1184,7 +1227,8 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { - u64 *spte; + u64 *sptep; + struct rmap_iterator iter; int young = 0; /* @@ -1197,25 +1241,24 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, if (!shadow_accessed_mask) return kvm_unmap_rmapp(kvm, rmapp, data); - spte = rmap_next(rmapp, NULL); - while (spte) { - int _young; - u64 _spte = *spte; - BUG_ON(!(_spte & PT_PRESENT_MASK)); - _young = _spte & PT_ACCESSED_MASK; - if (_young) { + for (sptep = rmap_get_first(*rmapp, &iter); sptep; + sptep = rmap_get_next(&iter)) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + + if (*sptep & PT_ACCESSED_MASK) { young = 1; - clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); + clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep); } - spte = rmap_next(rmapp, spte); } + return young; } static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { - u64 *spte; + u64 *sptep; + struct rmap_iterator iter; int young = 0; /* @@ -1226,16 +1269,14 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, if (!shadow_accessed_mask) goto out; - spte = rmap_next(rmapp, NULL); - while (spte) { - u64 _spte = *spte; - BUG_ON(!(_spte & PT_PRESENT_MASK)); - young = _spte & PT_ACCESSED_MASK; - if (young) { + for (sptep = rmap_get_first(*rmapp, &iter); sptep; + sptep = rmap_get_next(&iter)) { + BUG_ON(!(*sptep & PT_PRESENT_MASK)); + + if (*sptep & PT_ACCESSED_MASK) { young = 1; break; } - spte = rmap_next(rmapp, spte); } out: return young; @@ -1887,10 +1928,11 @@ static void kvm_mmu_put_page(struct 
kvm_mmu_page *sp, u64 *parent_pte) static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { - u64 *parent_pte; + u64 *sptep; + struct rmap_iterator iter; - while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL))) - drop_parent_pte(sp, parent_pte); + while ((sptep = rmap_get_first(sp->parent_ptes, &iter))) + drop_parent_pte(sp, sptep); } static int mmu_zap_unsync_children(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 715da5a19a5b..7d7d0b9e23eb 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -192,7 +192,8 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memory_slot *slot; unsigned long *rmapp; - u64 *spte; + u64 *sptep; + struct rmap_iterator iter; if (sp->role.direct || sp->unsync || sp->role.invalid) return; @@ -200,13 +201,12 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) slot = gfn_to_memslot(kvm, sp->gfn); rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; - spte = rmap_next(rmapp, NULL); - while (spte) { - if (is_writable_pte(*spte)) + for (sptep = rmap_get_first(*rmapp, &iter); sptep; + sptep = rmap_get_next(&iter)) { + if (is_writable_pte(*sptep)) audit_printk(kvm, "shadow page has writable " "mappings: gfn %llx role %x\n", sp->gfn, sp->role.word); - spte = rmap_next(rmapp, spte); } } -- cgit v1.2.3 From 4692d77fc3c8978a36406a3cf9e8b899f86f68f1 Mon Sep 17 00:00:00 2001 From: Alessandro Rubini Date: Wed, 4 Apr 2012 19:39:58 +0200 Subject: x86-32: Introduce CONFIG_X86_DEV_DMA_OPS 32-bit x86 systems may need their own DMA operations, so add a new config option, which is turned on for 64-bit systems. This patch has no functional effect but it paves the way for supporting the STA2x11 I/O Hub and possibly other chips. 
Signed-off-by: Alessandro Rubini Link: http://lkml.kernel.org/r/f79fcc1a2e17ef942e1b798b92aac43a80202532.1333560789.git.rubini@gnudd.com Acked-by: Giancarlo Asnaghi Cc: Alan Cox Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 5 +++++ arch/x86/include/asm/device.h | 4 ++-- arch/x86/include/asm/dma-mapping.h | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1d14cc6b79ad..07b412aed38b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -12,6 +12,7 @@ config X86_32 config X86_64 def_bool 64BIT + select X86_DEV_DMA_OPS ### Arch settings config X86 @@ -2215,6 +2216,10 @@ config HAVE_TEXT_POKE_SMP bool select STOP_MACHINE if SMP +config X86_DEV_DMA_OPS + bool + depends on X86_64 + source "net/Kconfig" source "drivers/Kconfig" diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index 63a2a03d7d51..93e1c55f14ab 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -5,8 +5,8 @@ struct dev_archdata { #ifdef CONFIG_ACPI void *acpi_handle; #endif -#ifdef CONFIG_X86_64 -struct dma_map_ops *dma_ops; +#ifdef CONFIG_X86_DEV_DMA_OPS + struct dma_map_ops *dma_ops; #endif #if defined(CONFIG_INTEL_IOMMU) || defined(CONFIG_AMD_IOMMU) void *iommu; /* hook for IOMMU specific extension */ diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 4b4331d71935..09aa473e2917 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -30,7 +30,7 @@ extern struct dma_map_ops *dma_ops; static inline struct dma_map_ops *get_dma_ops(struct device *dev) { -#ifdef CONFIG_X86_32 +#ifndef CONFIG_X86_DEV_DMA_OPS return dma_ops; #else if (unlikely(!dev) || !dev->archdata.dma_ops) -- cgit v1.2.3 From f7219a5300ba753b0c762d631763bd878b8bb00c Mon Sep 17 00:00:00 2001 From: Alessandro Rubini Date: Wed, 4 Apr 2012 19:40:10 +0200 Subject: x86: Introduce CONFIG_X86_DMA_REMAP The default functions 
phys_to_dma, dma_to_phys implement identity mapping as fast inline functions. Some systems, however, may need a custom function to implement its own mapping between CPU addresses and device addresses. This new configuration option allows the functions to be external when needed (such as for the ConneXt device) Signed-off-by: Alessandro Rubini Link: http://lkml.kernel.org/r/6e4329b772df675f1c442f68e59e844e4dd8c965.1333560789.git.rubini@gnudd.com Acked-by: Giancarlo Asnaghi Cc: Alan Cox Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 3 +++ arch/x86/include/asm/dma-mapping.h | 7 +++++++ 2 files changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 07b412aed38b..95ca56036030 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2220,6 +2220,9 @@ config X86_DEV_DMA_OPS bool depends on X86_64 +config X86_DMA_REMAP + bool + source "net/Kconfig" source "drivers/Kconfig" diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 09aa473e2917..61c0bd25845a 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -62,6 +62,12 @@ extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag, struct dma_attrs *attrs); +#ifdef CONFIG_X86_DMA_REMAP /* Platform code defines bridge-specific code */ +extern bool dma_capable(struct device *dev, dma_addr_t addr, size_t size); +extern dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr); +extern phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr); +#else + static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { if (!dev->dma_mask) @@ -79,6 +85,7 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) { return daddr; } +#endif /* CONFIG_X86_DMA_REMAP */ static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, -- cgit v1.2.3 From 83125a3a189ec34fb22a04e8efad69ae6d52674a Mon Sep 17 
00:00:00 2001 From: Alessandro Rubini Date: Wed, 4 Apr 2012 19:40:21 +0200 Subject: x86, platform: Initial support for sta2x11 I/O hub The "ConneXt" sta2x11 I/O Hub is a bridge from PCIe to AMBA, and is used as main chipset in some Atom boards. The set of peripherals it exports lives in an AMBA bus internal to the chip, so a custom remapping of addresses is needed. This is implemented by fixup calls for the PCI devices, based on CONFIG_X86_DEV_DMA_OPS and CONFIG_X86_DMA_REMAP. Signed-off-by: Alessandro Rubini Link: http://lkml.kernel.org/r/ddca670ca8180e52d49b3fe642742ddd23ab2cb2.1333560789.git.rubini@gnudd.com Acked-by: Giancarlo Asnaghi Cc: Alan Cox Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 28 +++- arch/x86/pci/Makefile | 2 + arch/x86/pci/sta2x11-fixup.c | 366 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 391 insertions(+), 5 deletions(-) create mode 100644 arch/x86/pci/sta2x11-fixup.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 95ca56036030..f9ed801abaf9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -329,6 +329,7 @@ config X86_EXTENDED_PLATFORM NUMAQ (IBM/Sequent) RDC R-321x SoC SGI 320/540 (Visual Workstation) + STA2X11-based (e.g. Northville) Summit/EXA (IBM x440) Unisys ES7000 IA32 series Moorestown MID devices @@ -461,10 +462,10 @@ config X86_32_NON_STANDARD depends on X86_32 && SMP depends on X86_EXTENDED_PLATFORM ---help--- - This option compiles in the NUMAQ, Summit, bigsmp, ES7000, default - subarchitectures. It is intended for a generic binary kernel. - if you select them all, kernel will probe it one by one. and will - fallback to default. + This option compiles in the NUMAQ, Summit, bigsmp, ES7000, + STA2X11, default subarchitectures. It is intended for a generic + binary kernel. If you select them all, kernel will probe it + one by one and will fallback to default.
# Alphabetically sorted list of Non standard 32 bit platforms @@ -504,6 +505,22 @@ config X86_VISWS A kernel compiled for the Visual Workstation will run on general PCs as well. See for details. +config STA2X11 + bool "STA2X11 Companion Chip Support" + depends on X86_32_NON_STANDARD && PCI + select X86_DEV_DMA_OPS + select X86_DMA_REMAP + select SWIOTLB + select MFD_STA2X11 + select ARCH_REQUIRE_GPIOLIB + default n + ---help--- + This adds support for boards based on the STA2X11 IO-Hub, + a.k.a. "ConneXt". The chip is used in place of the standard + PC chipset, so all "standard" peripherals are missing. If this + option is selected the kernel will still be able to boot on + standard PC machines. + config X86_SUMMIT bool "Summit/EXA (IBM x440)" depends on X86_32_NON_STANDARD @@ -2218,10 +2235,11 @@ config HAVE_TEXT_POKE_SMP config X86_DEV_DMA_OPS bool - depends on X86_64 + depends on X86_64 || STA2X11 config X86_DMA_REMAP bool + depends on STA2X11 source "net/Kconfig" diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index e76e18c94a3c..3af5a1e79c9c 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -11,6 +11,8 @@ obj-$(CONFIG_X86_INTEL_CE) += ce4100.o obj-$(CONFIG_ACPI) += acpi.o obj-y += legacy.o irq.o +obj-$(CONFIG_STA2X11) += sta2x11-fixup.o + obj-$(CONFIG_X86_VISWS) += visws.o obj-$(CONFIG_X86_NUMAQ) += numaq_32.o diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c new file mode 100644 index 000000000000..9d8a509c9730 --- /dev/null +++ b/arch/x86/pci/sta2x11-fixup.c @@ -0,0 +1,366 @@ +/* + * arch/x86/pci/sta2x11-fixup.c + * glue code for lib/swiotlb.c and DMA translation between STA2x11 + * AMBA memory mapping and the X86 memory mapping + * + * ST Microelectronics ConneXt (STA2X11/STA2X10) + * + * Copyright (c) 2010-2011 Wind River Systems, Inc. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include +#include +#include +#include + +#define STA2X11_SWIOTLB_SIZE (4*1024*1024) +extern int swiotlb_late_init_with_default_size(size_t default_size); + +/* + * We build a list of bus numbers that are under the ConneXt. The + * main bridge hosts 4 busses, which are the 4 endpoints, in order. + */ +#define STA2X11_NR_EP 4 /* 0..3 included */ +#define STA2X11_NR_FUNCS 8 /* 0..7 included */ +#define STA2X11_AMBA_SIZE (512 << 20) + +struct sta2x11_ahb_regs { /* saved during suspend */ + u32 base, pexlbase, pexhbase, crw; +}; + +struct sta2x11_mapping { + u32 amba_base; + int is_suspended; + struct sta2x11_ahb_regs regs[STA2X11_NR_FUNCS]; +}; + +struct sta2x11_instance { + struct list_head list; + int bus0; + struct sta2x11_mapping map[STA2X11_NR_EP]; +}; + +static LIST_HEAD(sta2x11_instance_list); + +/* At probe time, record new instances of this bridge (likely one only) */ +static void sta2x11_new_instance(struct pci_dev *pdev) +{ + struct sta2x11_instance *instance; + + instance = kzalloc(sizeof(*instance), GFP_ATOMIC); + if (!instance) + return; + /* This has a subordinate bridge, with 4 more-subordinate ones */ + instance->bus0 = pdev->subordinate->number + 1; + + if (list_empty(&sta2x11_instance_list)) { + int size = STA2X11_SWIOTLB_SIZE; + /* First instance: register your own swiotlb area */ + 
dev_info(&pdev->dev, "Using SWIOTLB (size %i)\n", size); + if (swiotlb_late_init_with_default_size(size)) + dev_emerg(&pdev->dev, "init swiotlb failed\n"); + } + list_add(&instance->list, &sta2x11_instance_list); +} +DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, 0xcc17, sta2x11_new_instance); + +/* + * Utility functions used in this file from below + */ +static struct sta2x11_instance *sta2x11_pdev_to_instance(struct pci_dev *pdev) +{ + struct sta2x11_instance *instance; + int ep; + + list_for_each_entry(instance, &sta2x11_instance_list, list) { + ep = pdev->bus->number - instance->bus0; + if (ep >= 0 && ep < STA2X11_NR_EP) + return instance; + } + return NULL; +} + +static int sta2x11_pdev_to_ep(struct pci_dev *pdev) +{ + struct sta2x11_instance *instance; + + instance = sta2x11_pdev_to_instance(pdev); + if (!instance) + return -1; + + return pdev->bus->number - instance->bus0; +} + +static struct sta2x11_mapping *sta2x11_pdev_to_mapping(struct pci_dev *pdev) +{ + struct sta2x11_instance *instance; + int ep; + + instance = sta2x11_pdev_to_instance(pdev); + if (!instance) + return NULL; + ep = sta2x11_pdev_to_ep(pdev); + return instance->map + ep; +} + +/* This is exported, as some devices need to access the MFD registers */ +struct sta2x11_instance *sta2x11_get_instance(struct pci_dev *pdev) +{ + return sta2x11_pdev_to_instance(pdev); +} +EXPORT_SYMBOL(sta2x11_get_instance); + + +/** + * p2a - Translate physical address to STA2x11 AMBA address, + * used for DMA transfers to STA2x11 + * @p: Physical address + * @pdev: PCI device (must be hosted within the connext) + */ +static dma_addr_t p2a(dma_addr_t p, struct pci_dev *pdev) +{ + struct sta2x11_mapping *map; + dma_addr_t a; + + map = sta2x11_pdev_to_mapping(pdev); + a = p + map->amba_base; + return a; +} + +/** + * a2p - Translate STA2x11 AMBA address to physical address + * used for DMA transfers from STA2x11 + * @a: STA2x11 AMBA address + * @pdev: PCI device (must be hosted within the connext) + */ +static 
dma_addr_t a2p(dma_addr_t a, struct pci_dev *pdev) +{ + struct sta2x11_mapping *map; + dma_addr_t p; + + map = sta2x11_pdev_to_mapping(pdev); + p = a - map->amba_base; + return p; +} + +/** + * sta2x11_swiotlb_alloc_coherent - Allocate swiotlb bounce buffers + * returns virtual address. This is the only "special" function here. + * @dev: PCI device + * @size: Size of the buffer + * @dma_handle: DMA address + * @flags: memory flags + */ +static void *sta2x11_swiotlb_alloc_coherent(struct device *dev, + size_t size, + dma_addr_t *dma_handle, + gfp_t flags, + struct dma_attrs *attrs) +{ + void *vaddr; + + vaddr = dma_generic_alloc_coherent(dev, size, dma_handle, flags, attrs); + if (!vaddr) + vaddr = swiotlb_alloc_coherent(dev, size, dma_handle, flags); + *dma_handle = p2a(*dma_handle, to_pci_dev(dev)); + return vaddr; +} + +/* We have our own dma_ops: the same as swiotlb but from alloc (above) */ +static struct dma_map_ops sta2x11_dma_ops = { + .alloc = sta2x11_swiotlb_alloc_coherent, + .free = swiotlb_free_coherent, + .map_page = swiotlb_map_page, + .unmap_page = swiotlb_unmap_page, + .map_sg = swiotlb_map_sg_attrs, + .unmap_sg = swiotlb_unmap_sg_attrs, + .sync_single_for_cpu = swiotlb_sync_single_for_cpu, + .sync_single_for_device = swiotlb_sync_single_for_device, + .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = swiotlb_sync_sg_for_device, + .mapping_error = swiotlb_dma_mapping_error, + .dma_supported = NULL, /* FIXME: we should use this instead! 
*/ +}; + +/* At setup time, we use our own ops if the device is a ConneXt one */ +static void sta2x11_setup_pdev(struct pci_dev *pdev) +{ + struct sta2x11_instance *instance = sta2x11_pdev_to_instance(pdev); + + if (!instance) /* either a sta2x11 bridge or another ST device */ + return; + pci_set_consistent_dma_mask(pdev, STA2X11_AMBA_SIZE - 1); + pci_set_dma_mask(pdev, STA2X11_AMBA_SIZE - 1); + pdev->dev.archdata.dma_ops = &sta2x11_dma_ops; + + /* We must enable all devices as master, for audio DMA to work */ + pci_set_master(pdev); +} +DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, sta2x11_setup_pdev); + +/* + * The following three functions are exported (used in swiotlb: FIXME) + */ +/** + * dma_capable - Check if device can manage DMA transfers (FIXME: kill it) + * @dev: device for a PCI device + * @addr: DMA address + * @size: DMA size + */ +bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) +{ + struct sta2x11_mapping *map; + + if (dev->archdata.dma_ops != &sta2x11_dma_ops) { + if (!dev->dma_mask) + return false; + return addr + size - 1 <= *dev->dma_mask; + } + + map = sta2x11_pdev_to_mapping(to_pci_dev(dev)); + + if (!map || (addr < map->amba_base)) + return false; + if (addr + size >= map->amba_base + STA2X11_AMBA_SIZE) { + return false; + } + + return true; +} + +/** + * phys_to_dma - Return the DMA AMBA address used for this STA2x11 device + * @dev: device for a PCI device + * @paddr: Physical address + */ +dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +{ + if (dev->archdata.dma_ops != &sta2x11_dma_ops) + return paddr; + return p2a(paddr, to_pci_dev(dev)); +} + +/** + * dma_to_phys - Return the physical address used for this STA2x11 DMA address + * @dev: device for a PCI device + * @daddr: STA2x11 AMBA DMA address + */ +phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) +{ + if (dev->archdata.dma_ops != &sta2x11_dma_ops) + return daddr; + return a2p(daddr, to_pci_dev(dev)); +} + + +/* + * At boot we 
must set up the mappings for the pcie-to-amba bridge. + * It involves device access, and the same happens at suspend/resume time + */ + +#define AHB_MAPB 0xCA4 +#define AHB_CRW(i) (AHB_MAPB + 0 + (i) * 0x10) +#define AHB_CRW_SZMASK 0xfffffc00UL +#define AHB_CRW_ENABLE (1 << 0) +#define AHB_CRW_WTYPE_MEM (2 << 1) +#define AHB_CRW_ROE (1UL << 3) /* Relax Order Ena */ +#define AHB_CRW_NSE (1UL << 4) /* No Snoop Enable */ +#define AHB_BASE(i) (AHB_MAPB + 4 + (i) * 0x10) +#define AHB_PEXLBASE(i) (AHB_MAPB + 8 + (i) * 0x10) +#define AHB_PEXHBASE(i) (AHB_MAPB + 12 + (i) * 0x10) + +/* At probe time, enable mapping for each endpoint, using the pdev */ +static void sta2x11_map_ep(struct pci_dev *pdev) +{ + struct sta2x11_mapping *map = sta2x11_pdev_to_mapping(pdev); + int i; + + if (!map) + return; + pci_read_config_dword(pdev, AHB_BASE(0), &map->amba_base); + + /* Configure AHB mapping */ + pci_write_config_dword(pdev, AHB_PEXLBASE(0), 0); + pci_write_config_dword(pdev, AHB_PEXHBASE(0), 0); + pci_write_config_dword(pdev, AHB_CRW(0), STA2X11_AMBA_SIZE | + AHB_CRW_WTYPE_MEM | AHB_CRW_ENABLE); + + /* Disable all the other windows */ + for (i = 1; i < STA2X11_NR_FUNCS; i++) + pci_write_config_dword(pdev, AHB_CRW(i), 0); + + dev_info(&pdev->dev, + "sta2x11: Map EP %i: AMBA address %#8x-%#8x\n", + sta2x11_pdev_to_ep(pdev), map->amba_base, + map->amba_base + STA2X11_AMBA_SIZE - 1); +} +DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, sta2x11_map_ep); + +#ifdef CONFIG_PM /* Some register values must be saved and restored */ + +static void suspend_mapping(struct pci_dev *pdev) +{ + struct sta2x11_mapping *map = sta2x11_pdev_to_mapping(pdev); + int i; + + if (!map) + return; + + if (map->is_suspended) + return; + map->is_suspended = 1; + + /* Save all window configs */ + for (i = 0; i < STA2X11_NR_FUNCS; i++) { + struct sta2x11_ahb_regs *regs = map->regs + i; + + pci_read_config_dword(pdev, AHB_BASE(i), &regs->base); + pci_read_config_dword(pdev, AHB_PEXLBASE(i),
&regs->pexlbase); + pci_read_config_dword(pdev, AHB_PEXHBASE(i), &regs->pexhbase); + pci_read_config_dword(pdev, AHB_CRW(i), &regs->crw); + } +} +DECLARE_PCI_FIXUP_SUSPEND(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, suspend_mapping); + +static void resume_mapping(struct pci_dev *pdev) +{ + struct sta2x11_mapping *map = sta2x11_pdev_to_mapping(pdev); + int i; + + if (!map) + return; + + + if (!map->is_suspended) + goto out; + map->is_suspended = 0; + + /* Restore all window configs */ + for (i = 0; i < STA2X11_NR_FUNCS; i++) { + struct sta2x11_ahb_regs *regs = map->regs + i; + + pci_write_config_dword(pdev, AHB_BASE(i), regs->base); + pci_write_config_dword(pdev, AHB_PEXLBASE(i), regs->pexlbase); + pci_write_config_dword(pdev, AHB_PEXHBASE(i), regs->pexhbase); + pci_write_config_dword(pdev, AHB_CRW(i), regs->crw); + } +out: + pci_set_master(pdev); /* Like at boot, enable master on all devices */ +} +DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, resume_mapping); + +#endif /* CONFIG_PM */ -- cgit v1.2.3 From b7456536cf9466b402b540c5588d79a4177c723a Mon Sep 17 00:00:00 2001 From: Will Drewry Date: Thu, 12 Apr 2012 16:47:56 -0500 Subject: arch/x86: add syscall_get_arch to syscall.h Add syscall_get_arch() to export the current AUDIT_ARCH_* based on system call entry path. Signed-off-by: Will Drewry Acked-by: Serge Hallyn Reviewed-by: H.
Peter Anvin Acked-by: Eric Paris Reviewed-by: Kees Cook v18: - update comment about x32 tasks - rebase to v3.4-rc2 v17: rebase and reviewed-by v14: rebase/nochanges v13: rebase on to 88ebdda6159ffc15699f204c33feb3e431bf9bdc Signed-off-by: James Morris --- arch/x86/include/asm/syscall.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 386b78686c4d..1ace47b62592 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -13,9 +13,11 @@ #ifndef _ASM_X86_SYSCALL_H #define _ASM_X86_SYSCALL_H +#include #include #include #include /* For NR_syscalls */ +#include /* for TS_COMPAT */ #include extern const unsigned long sys_call_table[]; @@ -88,6 +90,12 @@ static inline void syscall_set_arguments(struct task_struct *task, memcpy(&regs->bx + i, args, n * sizeof(args[0])); } +static inline int syscall_get_arch(struct task_struct *task, + struct pt_regs *regs) +{ + return AUDIT_ARCH_I386; +} + #else /* CONFIG_X86_64 */ static inline void syscall_get_arguments(struct task_struct *task, @@ -212,6 +220,25 @@ static inline void syscall_set_arguments(struct task_struct *task, } } +static inline int syscall_get_arch(struct task_struct *task, + struct pt_regs *regs) +{ +#ifdef CONFIG_IA32_EMULATION + /* + * TS_COMPAT is set for 32-bit syscall entry and then + * remains set until we return to user mode. + * + * TIF_IA32 tasks should always have TS_COMPAT set at + * system call time. + * + * x32 tasks should be considered AUDIT_ARCH_X86_64. + */ + if (task_thread_info(task)->status & TS_COMPAT) + return AUDIT_ARCH_I386; +#endif + /* Both x32 and x86_64 are considered "64-bit".
*/ + return AUDIT_ARCH_X86_64; +} #endif /* CONFIG_X86_32 */ #endif /* _ASM_X86_SYSCALL_H */ -- cgit v1.2.3 From a0727e8ce513fe6890416da960181ceb10fbfae6 Mon Sep 17 00:00:00 2001 From: Will Drewry Date: Thu, 12 Apr 2012 16:48:00 -0500 Subject: signal, x86: add SIGSYS info and make it synchronous. This change enables SIGSYS, defines _sigfields._sigsys, and adds x86 (compat) arch support. _sigsys defines fields which allow a signal handler to receive the triggering system call number, the relevant AUDIT_ARCH_* value for that number, and the address of the callsite. SIGSYS is added to the SYNCHRONOUS_MASK because it is desirable for it to have setup_frame() called for it. The goal is to ensure that ucontext_t reflects the machine state from the time-of-syscall and not from another signal handler. The first consumer of SIGSYS would be seccomp filter. In particular, a filter program could specify a new return value, SECCOMP_RET_TRAP, which would result in the system call being denied and the calling thread signaled. This also means that implementing arch-specific support can be dependent upon HAVE_ARCH_SECCOMP_FILTER. Suggested-by: H. Peter Anvin Signed-off-by: Will Drewry Acked-by: Serge Hallyn Reviewed-by: H. Peter Anvin Acked-by: Eric Paris v18: - added acked by, rebase v17: - rebase and reviewed-by addition v14: - rebase/nochanges v13: - rebase on to 88ebdda6159ffc15699f204c33feb3e431bf9bdc v12: - reworded changelog (oleg@redhat.com) v11: - fix dropped words in the change description - added fallback copy_siginfo support. - added __ARCH_SIGSYS define to allow stepped arch support. 
v10: - first version based on suggestion Signed-off-by: James Morris --- arch/x86/ia32/ia32_signal.c | 4 ++++ arch/x86/include/asm/ia32.h | 6 ++++++ include/asm-generic/siginfo.h | 22 ++++++++++++++++++++++ kernel/signal.c | 9 ++++++++- 4 files changed, 40 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index a69245ba27e3..0b3f2354f6aa 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -67,6 +67,10 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) switch (from->si_code >> 16) { case __SI_FAULT >> 16: break; + case __SI_SYS >> 16: + put_user_ex(from->si_syscall, &to->si_syscall); + put_user_ex(from->si_arch, &to->si_arch); + break; case __SI_CHLD >> 16: if (ia32) { put_user_ex(from->si_utime, &to->si_utime); diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index ee52760549f0..b04cbdb138cd 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -144,6 +144,12 @@ typedef struct compat_siginfo { int _band; /* POLL_IN, POLL_OUT, POLL_MSG */ int _fd; } _sigpoll; + + struct { + unsigned int _call_addr; /* calling insn */ + int _syscall; /* triggering system call number */ + unsigned int _arch; /* AUDIT_ARCH_* of syscall */ + } _sigsys; } _sifields; } compat_siginfo_t; diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h index 0dd4e87f6fba..31306f55eb02 100644 --- a/include/asm-generic/siginfo.h +++ b/include/asm-generic/siginfo.h @@ -90,9 +90,18 @@ typedef struct siginfo { __ARCH_SI_BAND_T _band; /* POLL_IN, POLL_OUT, POLL_MSG */ int _fd; } _sigpoll; + + /* SIGSYS */ + struct { + void __user *_call_addr; /* calling insn */ + int _syscall; /* triggering system call number */ + unsigned int _arch; /* AUDIT_ARCH_* of syscall */ + } _sigsys; } _sifields; } siginfo_t; +/* If the arch shares siginfo, then it has SIGSYS. 
*/ +#define __ARCH_SIGSYS #endif /* @@ -116,6 +125,11 @@ typedef struct siginfo { #define si_addr_lsb _sifields._sigfault._addr_lsb #define si_band _sifields._sigpoll._band #define si_fd _sifields._sigpoll._fd +#ifdef __ARCH_SIGSYS +#define si_call_addr _sifields._sigsys._call_addr +#define si_syscall _sifields._sigsys._syscall +#define si_arch _sifields._sigsys._arch +#endif #ifdef __KERNEL__ #define __SI_MASK 0xffff0000u @@ -126,6 +140,7 @@ typedef struct siginfo { #define __SI_CHLD (4 << 16) #define __SI_RT (5 << 16) #define __SI_MESGQ (6 << 16) +#define __SI_SYS (7 << 16) #define __SI_CODE(T,N) ((T) | ((N) & 0xffff)) #else #define __SI_KILL 0 @@ -135,6 +150,7 @@ typedef struct siginfo { #define __SI_CHLD 0 #define __SI_RT 0 #define __SI_MESGQ 0 +#define __SI_SYS 0 #define __SI_CODE(T,N) (N) #endif @@ -231,6 +247,12 @@ typedef struct siginfo { #define POLL_HUP (__SI_POLL|6) /* device disconnected */ #define NSIGPOLL 6 +/* + * SIGSYS si_codes + */ +#define SYS_SECCOMP (__SI_SYS|1) /* seccomp triggered */ +#define NSIGSYS 1 + /* * sigevent definitions * diff --git a/kernel/signal.c b/kernel/signal.c index 17afcaf582d0..1a006b5d9d9d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -160,7 +160,7 @@ void recalc_sigpending(void) #define SYNCHRONOUS_MASK \ (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ - sigmask(SIGTRAP) | sigmask(SIGFPE)) + sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS)) int next_signal(struct sigpending *pending, sigset_t *mask) { @@ -2706,6 +2706,13 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_ptr, &to->si_ptr); break; +#ifdef __ARCH_SIGSYS + case __SI_SYS: + err |= __put_user(from->si_call_addr, &to->si_call_addr); + err |= __put_user(from->si_syscall, &to->si_syscall); + err |= __put_user(from->si_arch, &to->si_arch); + break; +#endif default: /* this is just in case for now ... 
*/ err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); -- cgit v1.2.3 From c6cfbeb4029610c8c330c312dcf4d514cc067554 Mon Sep 17 00:00:00 2001 From: Will Drewry Date: Thu, 12 Apr 2012 16:48:03 -0500 Subject: x86: Enable HAVE_ARCH_SECCOMP_FILTER Enable support for seccomp filter on x86: - syscall_get_arch() - syscall_get_arguments() - syscall_rollback() - syscall_set_return_value() - SIGSYS siginfo_t support - secure_computing is called from a ptrace_event()-safe context - secure_computing return value is checked (see below). SECCOMP_RET_TRACE and SECCOMP_RET_TRAP may result in seccomp needing to skip a system call without killing the process. This is done by returning a non-zero (-1) value from secure_computing. This change makes x86 respect that return value. To ensure that minimal kernel code is exposed, a non-zero return value results in an immediate return to user space (with an invalid syscall number). Signed-off-by: Will Drewry Reviewed-by: H. Peter Anvin Acked-by: Eric Paris Reviewed-by: Kees Cook v18: rebase and tweaked change description, acked-by v17: added reviewed by and rebased v..: all rebases since original introduction. 
Signed-off-by: James Morris --- arch/x86/Kconfig | 1 + arch/x86/kernel/ptrace.c | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1d14cc6b79ad..3a41c4424a0a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -82,6 +82,7 @@ config X86 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC + select HAVE_ARCH_SECCOMP_FILTER config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 685845cf16e0..13b1990c7c58 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1480,7 +1480,11 @@ long syscall_trace_enter(struct pt_regs *regs) regs->flags |= X86_EFLAGS_TF; /* do the secure computing check first */ - secure_computing(regs->orig_ax); + if (secure_computing(regs->orig_ax)) { + /* seccomp failures shouldn't expose any additional code. */ + ret = -1L; + goto out; + } if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) ret = -1L; @@ -1505,6 +1509,7 @@ long syscall_trace_enter(struct pt_regs *regs) regs->dx, regs->r10); #endif +out: return ret ?: regs->orig_ax; } -- cgit v1.2.3 From 302616911da8e868d3f1a00dce517ca30b0e065d Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Fri, 6 Apr 2012 14:47:35 +0200 Subject: x86: Drop obsolete ARCH_BOOTMEM support x86 unconditionally uses NO_BOOTMEM so there is no use of the HAVE_ARCH_BOOTMEM support as mm/bootmem.c is the only file referencing this symbol. bootmem_arch_preferred_node() is the function referred to in the mm/bootmem.c code and can thus be dropped too. x86 was the sole user of HAVE_ARCH_BOOTMEM - so there is an opportunity to clean up a little in mm/bootmem.c too if we do not expect other users to emerge.
Signed-off-by: Sam Ravnborg Cc: Tejun Heo Link: http://lkml.kernel.org/r/20120406124735.GA6920@merkur.ravnborg.org Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ---- arch/x86/include/asm/mmzone_32.h | 6 ------ 2 files changed, 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5bed94e189fa..a105ee75bd85 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1255,10 +1255,6 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. -config HAVE_ARCH_BOOTMEM - def_bool y - depends on X86_32 && NUMA - config HAVE_ARCH_ALLOC_REMAP def_bool y depends on X86_32 && NUMA diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 55728e121473..eb05fb3b02fb 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -61,10 +61,4 @@ static inline int pfn_valid(int pfn) #endif /* CONFIG_DISCONTIGMEM */ -#ifdef CONFIG_NEED_MULTIPLE_NODES -/* always use node 0 for bootmem on this numa platform */ -#define bootmem_arch_preferred_node(__bdata, size, align, goal, limit) \ - (NODE_DATA(0)->bdata) -#endif /* CONFIG_NEED_MULTIPLE_NODES */ - #endif /* _ASM_X86_MMZONE_32_H */ -- cgit v1.2.3 From a7e0e4e99fabe0094fcb0b011b741558b8b28fa7 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Tue, 20 Mar 2012 16:46:14 -0700 Subject: x86: Fix typo in MODULE_DEVICE_TABLE example: s/x86_cpu/x86cpu/ Signed-off-by: Josh Triplett Signed-off-by: Jiri Kosina --- arch/x86/kernel/cpu/match.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 5502b289341b..36565373af87 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -23,7 +23,7 @@ * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor) * * Arrays used to match for this should also be declared using - * 
MODULE_DEVICE_TABLE(x86_cpu, ...) + * MODULE_DEVICE_TABLE(x86cpu, ...) * * This always matches against the boot cpu, assuming models and features are * consistent over all CPUs. -- cgit v1.2.3 From ae75954457eee0a608072368c5b477e40f378d7b Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Wed, 28 Mar 2012 11:32:28 -0700 Subject: KVM: SVM: Auto-load on CPUs with SVM Enable x86 feature-based autoloading for the kvm-amd module on CPUs with X86_FEATURE_SVM. Signed-off-by: Josh Triplett Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f3167208562e..f75af406b268 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -22,6 +22,7 @@ #include "x86.h" #include +#include #include #include #include @@ -42,6 +43,12 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); +static const struct x86_cpu_id svm_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_SVM), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); + #define IOPM_ALLOC_ORDER 2 #define MSRPM_ALLOC_ORDER 1 -- cgit v1.2.3 From 1c11b37669a5209bd11fb857a103634afef971e8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 9 Apr 2012 18:39:59 +0300 Subject: KVM: x86 emulator: add support for vector alignment x86 defines three classes of vector instructions: explicitly aligned (#GP(0) if unaligned), explicitly unaligned, and default (which depends on the encoding: AVX is unaligned, SSE is aligned). Add support for marking an instruction as explicitly aligned or unaligned, and mark MOVDQU as unaligned.
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 83756223f8aa..6302e5c74341 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -142,6 +142,9 @@ #define Src2FS (OpFS << Src2Shift) #define Src2GS (OpGS << Src2Shift) #define Src2Mask (OpMask << Src2Shift) +#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ +#define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */ +#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ #define X2(x...) x, x #define X3(x...) X2(x), x @@ -557,6 +560,29 @@ static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector, ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg); } +/* + * x86 defines three classes of vector instructions: explicitly + * aligned, explicitly unaligned, and the rest, which change behaviour + * depending on whether they're AVX encoded or not. + * + * Also included is CMPXCHG16B which is not a vector instruction, yet it is + * subject to the same check. + */ +static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size) +{ + if (likely(size < 16)) + return false; + + if (ctxt->d & Aligned) + return true; + else if (ctxt->d & Unaligned) + return false; + else if (ctxt->d & Avx) + return false; + else + return true; +} + static int __linearize(struct x86_emulate_ctxt *ctxt, struct segmented_address addr, unsigned size, bool write, bool fetch, @@ -621,6 +647,8 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, } if (fetch ? 
ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8) la &= (u32)-1; + if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0)) + return emulate_gp(ctxt, 0); *linear = la; return X86EMUL_CONTINUE; bad: @@ -3415,7 +3443,7 @@ static struct opcode group11[] = { }; static struct gprefix pfx_0f_6f_0f_7f = { - N, N, N, I(Sse, em_movdqu), + N, N, N, I(Sse | Unaligned, em_movdqu), }; static struct opcode opcode_table[256] = { -- cgit v1.2.3 From 49597d8116ad70aabb598e606b218ddd9315b0af Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 9 Apr 2012 18:40:00 +0300 Subject: KVM: x86: emulate movdqa An Ubuntu 9.10 Karmic Koala guest is unable to boot or install due to missing movdqa emulation: kvm_exit: reason EXCEPTION_NMI rip 0x7fef3e025a7b info 7fef3e799000 80000b0e kvm_page_fault: address 7fef3e799000 error_code f kvm_emulate_insn: 0:7fef3e025a7b: 66 0f 7f 07 (prot64) movdqa %xmm0,(%rdi) [avi: mark it explicitly aligned] Signed-off-by: Stefan Hajnoczi Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6302e5c74341..b160fb1fc68b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2818,7 +2818,7 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt) static int em_mov(struct x86_emulate_ctxt *ctxt) { - ctxt->dst.val = ctxt->src.val; + memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes); return X86EMUL_CONTINUE; } @@ -2898,12 +2898,6 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt) return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); } -static int em_movdqu(struct x86_emulate_ctxt *ctxt) -{ - memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes); - return X86EMUL_CONTINUE; -} - static int em_invlpg(struct x86_emulate_ctxt *ctxt) { int rc; @@ -3443,7 +3437,7 @@ static struct opcode group11[] = { }; static struct gprefix 
pfx_0f_6f_0f_7f = { - N, N, N, I(Sse | Unaligned, em_movdqu), + N, I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), }; static struct opcode opcode_table[256] = { -- cgit v1.2.3 From 3e114eb4db3a33141b8c91bb53dae9ba6b015a32 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 9 Apr 2012 18:40:01 +0300 Subject: KVM: x86 emulator: implement movntps Used to write to framebuffers (by at least Icaros). Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b160fb1fc68b..fb39e0b32ed1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3440,6 +3440,10 @@ static struct gprefix pfx_0f_6f_0f_7f = { N, I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), }; +static struct gprefix pfx_vmovntpx = { + I(0, em_mov), N, N, N, +}; + static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ I6ALU(Lock, em_add), @@ -3571,7 +3575,8 @@ static struct opcode twobyte_table[256] = { IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write), IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write), N, N, N, N, - N, N, N, N, N, N, N, N, + N, N, N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx), + N, N, N, N, /* 0x30 - 0x3F */ II(ImplicitOps | Priv, em_wrmsr, wrmsr), IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), -- cgit v1.2.3 From cbe2c9d30aa69b0551247ddb0fb450b6e8080ec4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 9 Apr 2012 18:40:02 +0300 Subject: KVM: x86 emulator: MMX support General support for the MMX instruction set. Special care is taken to trap pending x87 exceptions so that they are properly reflected to the guest. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 4 +- arch/x86/kvm/emulate.c | 103 +++++++++++++++++++++++++++++++++++-- 2 files changed, 102 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index c222e1a1b12a..1ac46c22dd50 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -200,7 +200,7 @@ typedef u32 __attribute__((vector_size(16))) sse128_t; /* Type, address-of, and value of an instruction's operand. */ struct operand { - enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_NONE } type; + enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_MM, OP_NONE } type; unsigned int bytes; union { unsigned long orig_val; @@ -213,12 +213,14 @@ struct operand { unsigned seg; } mem; unsigned xmm; + unsigned mm; } addr; union { unsigned long val; u64 val64; char valptr[sizeof(unsigned long) + 2]; sse128_t vec_val; + u64 mm_val; }; }; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index fb39e0b32ed1..0011b4ad44b5 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -142,6 +142,7 @@ #define Src2FS (OpFS << Src2Shift) #define Src2GS (OpGS << Src2Shift) #define Src2Mask (OpMask << Src2Shift) +#define Mmx ((u64)1 << 40) /* MMX Vector instruction */ #define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ #define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. 
MOVDQU) */ #define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ @@ -887,6 +888,40 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, ctxt->ops->put_fpu(ctxt); } +static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) +{ + ctxt->ops->get_fpu(ctxt); + switch (reg) { + case 0: asm("movq %%mm0, %0" : "=m"(*data)); break; + case 1: asm("movq %%mm1, %0" : "=m"(*data)); break; + case 2: asm("movq %%mm2, %0" : "=m"(*data)); break; + case 3: asm("movq %%mm3, %0" : "=m"(*data)); break; + case 4: asm("movq %%mm4, %0" : "=m"(*data)); break; + case 5: asm("movq %%mm5, %0" : "=m"(*data)); break; + case 6: asm("movq %%mm6, %0" : "=m"(*data)); break; + case 7: asm("movq %%mm7, %0" : "=m"(*data)); break; + default: BUG(); + } + ctxt->ops->put_fpu(ctxt); +} + +static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) +{ + ctxt->ops->get_fpu(ctxt); + switch (reg) { + case 0: asm("movq %0, %%mm0" : : "m"(*data)); break; + case 1: asm("movq %0, %%mm1" : : "m"(*data)); break; + case 2: asm("movq %0, %%mm2" : : "m"(*data)); break; + case 3: asm("movq %0, %%mm3" : : "m"(*data)); break; + case 4: asm("movq %0, %%mm4" : : "m"(*data)); break; + case 5: asm("movq %0, %%mm5" : : "m"(*data)); break; + case 6: asm("movq %0, %%mm6" : : "m"(*data)); break; + case 7: asm("movq %0, %%mm7" : : "m"(*data)); break; + default: BUG(); + } + ctxt->ops->put_fpu(ctxt); +} + static void decode_register_operand(struct x86_emulate_ctxt *ctxt, struct operand *op) { @@ -903,6 +938,13 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, read_sse_reg(ctxt, &op->vec_val, reg); return; } + if (ctxt->d & Mmx) { + reg &= 7; + op->type = OP_MM; + op->bytes = 8; + op->addr.mm = reg; + return; + } op->type = OP_REG; if (ctxt->d & ByteOp) { @@ -948,6 +990,12 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm); return rc; } + if (ctxt->d & Mmx) { + op->type = OP_MM; + op->bytes = 
8; + op->addr.xmm = ctxt->modrm_rm & 7; + return rc; + } fetch_register_operand(op); return rc; } @@ -1415,6 +1463,9 @@ static int writeback(struct x86_emulate_ctxt *ctxt) case OP_XMM: write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm); break; + case OP_MM: + write_mmx_reg(ctxt, &ctxt->dst.mm_val, ctxt->dst.addr.mm); + break; case OP_NONE: /* no writeback */ break; @@ -3987,6 +4038,8 @@ done_prefixes: if (ctxt->d & Sse) ctxt->op_bytes = 16; + else if (ctxt->d & Mmx) + ctxt->op_bytes = 8; /* ModRM and SIB bytes. */ if (ctxt->d & ModRM) { @@ -4057,6 +4110,35 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) return false; } +static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) +{ + bool fault = false; + + ctxt->ops->get_fpu(ctxt); + asm volatile("1: fwait \n\t" + "2: \n\t" + ".pushsection .fixup,\"ax\" \n\t" + "3: \n\t" + "movb $1, %[fault] \n\t" + "jmp 2b \n\t" + ".popsection \n\t" + _ASM_EXTABLE(1b, 3b) + : [fault]"+rm"(fault)); + ctxt->ops->put_fpu(ctxt); + + if (unlikely(fault)) + return emulate_exception(ctxt, MF_VECTOR, 0, false); + + return X86EMUL_CONTINUE; +} + +static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, + struct operand *op) +{ + if (op->type == OP_MM) + read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { struct x86_emulate_ops *ops = ctxt->ops; @@ -4081,18 +4163,31 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if ((ctxt->d & Sse) - && ((ops->get_cr(ctxt, 0) & X86_CR0_EM) - || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { + if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) + || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { rc = emulate_ud(ctxt); goto done; } - if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { + if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { rc = emulate_nm(ctxt); goto done; } + if (ctxt->d & Mmx) { + rc = 
flush_pending_x87_faults(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + /* + * Now that we know the fpu is exception safe, we can fetch + * operands from it. + */ + fetch_possible_mmx_operand(ctxt, &ctxt->src); + fetch_possible_mmx_operand(ctxt, &ctxt->src2); + if (!(ctxt->d & Mov)) + fetch_possible_mmx_operand(ctxt, &ctxt->dst); + } + if (unlikely(ctxt->guest_mode) && ctxt->intercept) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); -- cgit v1.2.3 From e59717550e5cf0e7159c5b7af1d1ead35fef49dd Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 9 Apr 2012 18:40:03 +0300 Subject: KVM: x86 emulator: implement MMX MOVQ (opcodes 0f 6f, 0f 7f) Needed by some framebuffer drivers. See https://bugzilla.kernel.org/show_bug.cgi?id=42779 Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0011b4ad44b5..d5729a91d08d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3488,7 +3488,7 @@ static struct opcode group11[] = { }; static struct gprefix pfx_0f_6f_0f_7f = { - N, I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), + I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), }; static struct gprefix pfx_vmovntpx = { -- cgit v1.2.3 From a0c9a822bf37e6282eb6006b407ec5aec22e08fb Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 11 Apr 2012 18:49:55 +0300 Subject: KVM: dont clear TMR on EOI Intel spec says that TMR needs to be set/cleared when IRR is set, but kvm also clears it on EOI. I did some tests on a real (AMD based) system, and I see same TMR values both before and after EOI, so I think it's a minor bug in kvm. This patch fixes TMR to be set/cleared on IRR set only as per spec. 
And now that we don't clear TMR, we can save an atomic read of TMR on EOI that's not propagated to ioapic, by checking whether ioapic needs a specific vector first and calculating the mode afterwards. Signed-off-by: Michael S. Tsirkin Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 19 +++++++++++++------ virt/kvm/ioapic.c | 10 +++++++--- virt/kvm/ioapic.h | 1 + 3 files changed, 21 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 858432287ab6..992b4eaae684 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -92,6 +92,11 @@ static inline int apic_test_and_clear_vector(int vec, void *bitmap) return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } +static inline int apic_test_vector(int vec, void *bitmap) +{ + return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + static inline void apic_set_vector(int vec, void *bitmap) { set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -480,7 +485,6 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) static void apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); - int trigger_mode; /* * Not every write EOI will has corresponding ISR, * one example is when Kernel check timer on setup_IO_APIC @@ -491,12 +495,15 @@ static void apic_set_eoi(struct kvm_lapic *apic) apic_clear_vector(vector, apic->regs + APIC_ISR); apic_update_ppr(apic); - if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) - trigger_mode = IOAPIC_LEVEL_TRIG; - else - trigger_mode = IOAPIC_EDGE_TRIG; - if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) + if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && + kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { + int trigger_mode; + if (apic_test_vector(vector, apic->regs + APIC_TMR)) + trigger_mode = IOAPIC_LEVEL_TRIG; + else + trigger_mode = IOAPIC_EDGE_TRIG; kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); + } 
kvm_make_request(KVM_REQ_EVENT, apic->vcpu); } diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index dcaf272c26c0..26fd54dc459e 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -254,13 +254,17 @@ static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector, } } +bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector) +{ + struct kvm_ioapic *ioapic = kvm->arch.vioapic; + smp_rmb(); + return test_bit(vector, ioapic->handled_vectors); +} + void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; - smp_rmb(); - if (!test_bit(vector, ioapic->handled_vectors)) - return; spin_lock(&ioapic->lock); __kvm_ioapic_update_eoi(ioapic, vector, trigger_mode); spin_unlock(&ioapic->lock); diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 0b190c34ccc3..32872a09b63f 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -71,6 +71,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); +bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector); int kvm_ioapic_init(struct kvm *kvm); void kvm_ioapic_destroy(struct kvm *kvm); int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); -- cgit v1.2.3 From 9fe2a7015393dc0203ac39242ae9c89038994f3c Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Fri, 23 Mar 2012 13:36:28 +0530 Subject: debugfs: Add support to print u32 array in debugfs Move the code from Xen to debugfs to make the code common for other users as well. 
Acked-by: Greg Kroah-Hartman Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Suzuki Poulose [v1: Fixed rebase issues] [v2: Fixed PPC compile issues] Signed-off-by: Raghavendra K T Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/debugfs.c | 104 --------------------------------------- arch/x86/xen/debugfs.h | 4 -- arch/x86/xen/spinlock.c | 12 ++--- fs/debugfs/file.c | 128 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/debugfs.h | 11 +++++ 5 files changed, 145 insertions(+), 114 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index ef1db1900d86..c8377fb26cdf 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c @@ -19,107 +19,3 @@ struct dentry * __init xen_init_debugfs(void) return d_xen_debug; } -struct array_data -{ - void *array; - unsigned elements; -}; - -static int u32_array_open(struct inode *inode, struct file *file) -{ - file->private_data = NULL; - return nonseekable_open(inode, file); -} - -static size_t format_array(char *buf, size_t bufsize, const char *fmt, - u32 *array, unsigned array_size) -{ - size_t ret = 0; - unsigned i; - - for(i = 0; i < array_size; i++) { - size_t len; - - len = snprintf(buf, bufsize, fmt, array[i]); - len++; /* ' ' or '\n' */ - ret += len; - - if (buf) { - buf += len; - bufsize -= len; - buf[-1] = (i == array_size-1) ? 
'\n' : ' '; - } - } - - ret++; /* \0 */ - if (buf) - *buf = '\0'; - - return ret; -} - -static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size) -{ - size_t len = format_array(NULL, 0, fmt, array, array_size); - char *ret; - - ret = kmalloc(len, GFP_KERNEL); - if (ret == NULL) - return NULL; - - format_array(ret, len, fmt, array, array_size); - return ret; -} - -static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len, - loff_t *ppos) -{ - struct inode *inode = file->f_path.dentry->d_inode; - struct array_data *data = inode->i_private; - size_t size; - - if (*ppos == 0) { - if (file->private_data) { - kfree(file->private_data); - file->private_data = NULL; - } - - file->private_data = format_array_alloc("%u", data->array, data->elements); - } - - size = 0; - if (file->private_data) - size = strlen(file->private_data); - - return simple_read_from_buffer(buf, len, ppos, file->private_data, size); -} - -static int xen_array_release(struct inode *inode, struct file *file) -{ - kfree(file->private_data); - - return 0; -} - -static const struct file_operations u32_array_fops = { - .owner = THIS_MODULE, - .open = u32_array_open, - .release= xen_array_release, - .read = u32_array_read, - .llseek = no_llseek, -}; - -struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode, - struct dentry *parent, - u32 *array, unsigned elements) -{ - struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL); - - if (data == NULL) - return NULL; - - data->array = array; - data->elements = elements; - - return debugfs_create_file(name, mode, parent, data, &u32_array_fops); -} diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h index 78d25499be5b..12ebf3325c7b 100644 --- a/arch/x86/xen/debugfs.h +++ b/arch/x86/xen/debugfs.h @@ -3,8 +3,4 @@ struct dentry * __init xen_init_debugfs(void); -struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode, - struct dentry *parent, - u32 *array, unsigned 
elements); - #endif /* _XEN_DEBUGFS_H */ diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index d69cc6c3f808..83e866d714ce 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -440,12 +440,12 @@ static int __init xen_spinlock_debugfs(void) debugfs_create_u64("time_total", 0444, d_spin_debug, &spinlock_stats.time_total); - xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug, - spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1); - xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, - spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1); - xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, - spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); + debugfs_create_u32_array("histo_total", 0444, d_spin_debug, + spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1); + debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, + spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1); + debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, + spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); return 0; } diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 5dfafdd1dbd3..2340f6978d6e 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -20,6 +20,7 @@ #include #include #include +#include static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -520,6 +521,133 @@ struct dentry *debugfs_create_blob(const char *name, umode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_blob); +struct array_data { + void *array; + u32 elements; +}; + +static int u32_array_open(struct inode *inode, struct file *file) +{ + file->private_data = NULL; + return nonseekable_open(inode, file); +} + +static size_t format_array(char *buf, size_t bufsize, const char *fmt, + u32 *array, u32 array_size) +{ + size_t ret = 0; + u32 i; + + for (i = 0; i < array_size; i++) { + size_t len; + + len = snprintf(buf, bufsize, fmt, array[i]); + len++; /* ' ' or '\n' */ + ret += len; 
+ + if (buf) { + buf += len; + bufsize -= len; + buf[-1] = (i == array_size-1) ? '\n' : ' '; + } + } + + ret++; /* \0 */ + if (buf) + *buf = '\0'; + + return ret; +} + +static char *format_array_alloc(const char *fmt, u32 *array, + u32 array_size) +{ + size_t len = format_array(NULL, 0, fmt, array, array_size); + char *ret; + + ret = kmalloc(len, GFP_KERNEL); + if (ret == NULL) + return NULL; + + format_array(ret, len, fmt, array, array_size); + return ret; +} + +static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len, + loff_t *ppos) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct array_data *data = inode->i_private; + size_t size; + + if (*ppos == 0) { + if (file->private_data) { + kfree(file->private_data); + file->private_data = NULL; + } + + file->private_data = format_array_alloc("%u", data->array, + data->elements); + } + + size = 0; + if (file->private_data) + size = strlen(file->private_data); + + return simple_read_from_buffer(buf, len, ppos, + file->private_data, size); +} + +static int u32_array_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + + return 0; +} + +static const struct file_operations u32_array_fops = { + .owner = THIS_MODULE, + .open = u32_array_open, + .release = u32_array_release, + .read = u32_array_read, + .llseek = no_llseek, +}; + +/** + * debugfs_create_u32_array - create a debugfs file that is used to read u32 + * array. + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @array: u32 array that provides data. + * @elements: total number of elements in the array. + * + * This function creates a file in debugfs with the given name that exports + * @array as data. 
If the @mode variable is so set it can be read from. + * Writing is not supported. Seek within the file is also not supported. + * Once array is created its size can not be changed. + * + * The function returns a pointer to dentry on success. If debugfs is not + * enabled in the kernel, the value -%ENODEV will be returned. + */ +struct dentry *debugfs_create_u32_array(const char *name, umode_t mode, + struct dentry *parent, + u32 *array, u32 elements) +{ + struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL); + + if (data == NULL) + return NULL; + + data->array = array; + data->elements = elements; + + return debugfs_create_file(name, mode, parent, data, &u32_array_fops); +} +EXPORT_SYMBOL_GPL(debugfs_create_u32_array); + #ifdef CONFIG_HAS_IOMEM /* diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index ae36b72c22f3..66c434f5dd1e 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -93,6 +93,10 @@ struct dentry *debugfs_create_regset32(const char *name, umode_t mode, int debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs, int nregs, void __iomem *base, char *prefix); +struct dentry *debugfs_create_u32_array(const char *name, umode_t mode, + struct dentry *parent, + u32 *array, u32 elements); + bool debugfs_initialized(void); #else @@ -219,6 +223,13 @@ static inline bool debugfs_initialized(void) return false; } +static inline struct dentry *debugfs_create_u32_array(const char *name, umode_t mode, + struct dentry *parent, + u32 *array, u32 elements) +{ + return ERR_PTR(-ENODEV); +} + #endif #endif -- cgit v1.2.3 From f71fa31f9f7ac33cba12b8897983f950ad2c7a5b Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 18 Apr 2012 12:24:29 +0200 Subject: KVM: MMU: use page table level macro It's much cleaner to use PT_PAGE_TABLE_LEVEL than its numeric value. 
Signed-off-by: Davidlohr Bueso Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/paging_tmpl.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 29ad6f9c58a5..07424cf60434 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3618,7 +3618,7 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp) * Skip write-flooding detected for the sp whose level is 1, because * it can become unsync, then the guest page is not write-protected. */ - if (sp->role.level == 1) + if (sp->role.level == PT_PAGE_TABLE_LEVEL) return false; return ++sp->write_flooding_count >= 3; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index df5a70311be8..34f970937ef1 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -658,7 +658,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) { int offset = 0; - WARN_ON(sp->role.level != 1); + WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL); if (PTTYPE == 32) offset = sp->role.quadrant << PT64_LEVEL_BITS; -- cgit v1.2.3 From 95022b8cf6ed7f3292b60c8e85fe59a12bfb1c9e Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 18 Apr 2012 15:19:40 -0700 Subject: x86/mce: Avoid reading every machine check bank register twice. Reading machine check bank registers is slow. There is a trend of increasing the number of banks, and the number of cores. The main section of do_machine_check() is a serialized section where each cpu in turn checks every bank. Even on a little two socket SandyBridge-EP system that multiplies out as: 2 sockets * 8 cores * 2 hyperthreads * 20 banks = 640 MSRs We already scan the banks in parallel in mce_no_way_out() to see if there is a fatal error anywhere in the system. If we build a cache of VALID bits during this scan, we can avoid uselessly re-reading banks that have no data. Note that this cache is only a hint. 
If the valid bit is set in a shared bank, all cpus that share that bank will see it during the parallel scan, but the first to find it in the sequential scan will (usually) clear the bank. Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d086a09c087d..66e1c51be084 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -641,16 +641,18 @@ EXPORT_SYMBOL_GPL(machine_check_poll); * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. */ -static int mce_no_way_out(struct mce *m, char **msg) +static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp) { - int i; + int i, ret = 0; for (i = 0; i < banks; i++) { m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); + if (m->status & MCI_STATUS_VAL) + __set_bit(i, validp); if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) - return 1; + ret = 1; } - return 0; + return ret; } /* @@ -1011,6 +1013,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) */ int kill_it = 0; DECLARE_BITMAP(toclear, MAX_NR_BANKS); + DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); char *msg = "Unknown"; atomic_inc(&mce_entry); @@ -1025,7 +1028,8 @@ void do_machine_check(struct pt_regs *regs, long error_code) final = &__get_cpu_var(mces_seen); *final = m; - no_way_out = mce_no_way_out(&m, &msg); + memset(valid_banks, 0, sizeof(valid_banks)); + no_way_out = mce_no_way_out(&m, &msg, valid_banks); barrier(); @@ -1045,6 +1049,8 @@ void do_machine_check(struct pt_regs *regs, long error_code) order = mce_start(&no_way_out); for (i = 0; i < banks; i++) { __clear_bit(i, toclear); + if (!test_bit(i, valid_banks)) + continue; if (!mce_banks[i].ctl) continue; -- cgit v1.2.3 From d405c60128a1973648058fa950a8960ec1f27e38 Mon Sep 17 
00:00:00 2001 From: David Daney Date: Thu, 19 Apr 2012 14:59:59 -0700 Subject: x86: Select BUILDTIME_EXTABLE_SORT We can sort the exception table at build time for x86, so let's do it. Signed-off-by: David Daney Link: http://lkml.kernel.org/r/1334872799-14589-6-git-send-email-ddaney.cavm@gmail.com Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1d14cc6b79ad..2f925ccb3e5b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -82,6 +82,7 @@ config X86 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC + select BUILDTIME_EXTABLE_SORT config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) -- cgit v1.2.3 From 46326013e34eb5c178a91f06c1f2e99e79eed924 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 18 Apr 2012 17:16:46 -0700 Subject: x86, nop: Make the ASM_NOP* macros work from assembly Make the ASM_NOP* macros work in actual assembly files. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1334794610-5546-2-git-send-email-hpa@zytor.com --- arch/x86/include/asm/nops.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h index 405b4032a60b..aff2b3356101 100644 --- a/arch/x86/include/asm/nops.h +++ b/arch/x86/include/asm/nops.h @@ -87,7 +87,11 @@ #define P6_NOP8 0x0f,0x1f,0x84,0x00,0,0,0,0 #define P6_NOP5_ATOMIC P6_NOP5 +#ifdef __ASSEMBLY__ +#define _ASM_MK_NOP(x) .byte x +#else #define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n" +#endif #if defined(CONFIG_MK7) #define ASM_NOP1 _ASM_MK_NOP(K7_NOP1) -- cgit v1.2.3 From 84f4fc524eed040660bd4ebc8cba259d8afe8461 Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Wed, 18 Apr 2012 17:16:47 -0700 Subject: x86: Add symbolic constant for exceptions with error code Add a symbolic constant for the bitmask which states which exceptions carry an error code. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1334794610-5546-3-git-send-email-hpa@zytor.com --- arch/x86/include/asm/segment.h | 2 ++ arch/x86/kernel/head_64.S | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 165466233ab0..58c1e6cd91b6 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -205,6 +205,8 @@ #define IDT_ENTRIES 256 #define NUM_EXCEPTION_VECTORS 32 +/* Bitmask of exception vectors which push an error code on the stack */ +#define EXCEPTION_ERRCODE_MASK 0x00027d00 #define GDT_SIZE (GDT_ENTRIES * 8) #define GDT_ENTRY_TLS_ENTRIES 3 #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 40f4eb3766d1..adf52e85d551 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -295,7 +295,7 @@ ENTRY(early_idt_handler) ja 0f movl $1,%eax salq %cl,%rax - testl $0x27d00,%eax + testl $EXCEPTION_ERRCODE_MASK,%eax je 0f popq %r8 # get error code 0: movq 0(%rsp),%rcx # get ip -- cgit v1.2.3 From ffc4bc9c6fa4eaf935d96d139bfa7443cac0b88e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 18 Apr 2012 17:16:48 -0700 Subject: x86, paravirt: Replace GET_CR2_INTO_RCX with GET_CR2_INTO_RAX GET_CR2_INTO_RCX is asinine: it is only used in one place, the actual paravirt call returns the value in %rax, not %rcx; and the one place that wants it wants the result in %r9. We actually generate as a result of this call: call ... movq %rax, %rcx xorq %rax, %rax /* this value isn't even used... */ movq %rcx, %r9 At least make the macro do what the paravirt call does, which is put the value into %rax. 
Nevermind the fact that the macro clobbers all the volatile registers. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1334794610-5546-4-git-send-email-hpa@zytor.com Cc: Glauber de Oliveira Costa --- arch/x86/include/asm/paravirt.h | 6 ++---- arch/x86/kernel/head_64.S | 6 +++--- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index aa0f91308367..6cbbabf52707 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -1023,10 +1023,8 @@ extern void default_banner(void); call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs) \ ) -#define GET_CR2_INTO_RCX \ - call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2); \ - movq %rax, %rcx; \ - xorq %rax, %rax; +#define GET_CR2_INTO_RAX \ + call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2) #define PARAVIRT_ADJUST_EXCEPTION_FRAME \ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index adf52e85d551..d1e112c8b577 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -23,8 +23,9 @@ #ifdef CONFIG_PARAVIRT #include #include +#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg #else -#define GET_CR2_INTO_RCX movq %cr2, %rcx +#define GET_CR2_INTO(reg) movq %cr2, reg #endif /* we are not able to switch in one step to the final KERNEL ADDRESS SPACE @@ -286,8 +287,7 @@ ENTRY(early_idt_handler) cmpl $2,early_recursion_flag(%rip) jz 1f incl early_recursion_flag(%rip) - GET_CR2_INTO_RCX - movq %rcx,%r9 + GET_CR2_INTO(%r9) xorl %r8d,%r8d # zero for error code movl %esi,%ecx # get vector number # Test %ecx against mask of vectors that push error code. -- cgit v1.2.3 From 6a1ea279c210e7dc05de86dc29c0d4f577f484fb Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Thu, 19 Apr 2012 15:24:20 -0700 Subject: x86, extable: Add early_fixup_exception() Add a restricted version of fixup_exception() to be used during early boot only. In particular, this doesn't support the try..catch variant since we may not have a thread_info set up yet. This relies on the exception table being sorted already at build time. Link: http://lkml.kernel.org/r/1334794610-5546-1-git-send-email-hpa@zytor.com Signed-off-by: H. Peter Anvin --- arch/x86/mm/extable.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 1fb85dbe390a..5555675dadb6 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -35,3 +35,20 @@ int fixup_exception(struct pt_regs *regs) return 0; } + +/* Restricted version used during very early boot */ +int __init early_fixup_exception(unsigned long *ip) +{ + const struct exception_table_entry *fixup; + + fixup = search_exception_tables(*ip); + if (fixup) { + if (fixup->fixup < 16) + return 0; /* Not supported during early boot */ + + *ip = fixup->fixup; + return 1; + } + + return 0; +} -- cgit v1.2.3 From 9900aa2f95844eb81428c1d3d202c01b7f3ac77a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 18 Apr 2012 17:16:49 -0700 Subject: x86-64: Handle exception table entries during early boot If we get an exception during early boot, walk the exception table to see if we should intercept it. The main use case for this is to allow rdmsr_safe()/wrmsr_safe() during CPU initialization. Since the exception table is currently sorted at runtime, and fairly late in startup, this code walks the exception table linearly. We obviously don't need to worry about modules, however: none have been loaded at this point. [ v2: Use early_fixup_exception() instead of linear search ] Link: http://lkml.kernel.org/r/1334794610-5546-5-git-send-email-hpa@zytor.com Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/segment.h | 2 +- arch/x86/kernel/head_64.S | 76 +++++++++++++++++++++++++++++++----------- 2 files changed, 58 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 58c1e6cd91b6..c48a95035a77 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -213,7 +213,7 @@ #ifdef __KERNEL__ #ifndef __ASSEMBLY__ -extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][10]; +extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5]; /* * Load a segment. Fall back on loading the zero diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d1e112c8b577..7a40f2447321 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -19,6 +19,7 @@ #include #include #include +#include #ifdef CONFIG_PARAVIRT #include @@ -26,6 +27,7 @@ #define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg #else #define GET_CR2_INTO(reg) movq %cr2, reg +#define INTERRUPT_RETURN iretq #endif /* we are not able to switch in one step to the final KERNEL ADDRESS SPACE @@ -271,35 +273,56 @@ bad_address: jmp bad_address .section ".init.text","ax" -#ifdef CONFIG_EARLY_PRINTK .globl early_idt_handlers early_idt_handlers: + # 104(%rsp) %rflags + # 96(%rsp) %cs + # 88(%rsp) %rip + # 80(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - movl $i, %esi + .if (EXCEPTION_ERRCODE_MASK >> i) & 1 + ASM_NOP2 + .else + pushq $0 # Dummy error code, to make stack frame uniform + .endif + pushq $i # 72(%rsp) Vector number jmp early_idt_handler i = i + 1 .endr -#endif ENTRY(early_idt_handler) -#ifdef CONFIG_EARLY_PRINTK + cld + cmpl $2,early_recursion_flag(%rip) jz 1f incl early_recursion_flag(%rip) - GET_CR2_INTO(%r9) - xorl %r8d,%r8d # zero for error code - movl %esi,%ecx # get vector number - # Test %ecx against mask of vectors that push error code. 
- cmpl $31,%ecx - ja 0f - movl $1,%eax - salq %cl,%rax - testl $EXCEPTION_ERRCODE_MASK,%eax - je 0f - popq %r8 # get error code -0: movq 0(%rsp),%rcx # get ip - movq 8(%rsp),%rdx # get cs + + pushq %rax # 64(%rsp) + pushq %rcx # 56(%rsp) + pushq %rdx # 48(%rsp) + pushq %rsi # 40(%rsp) + pushq %rdi # 32(%rsp) + pushq %r8 # 24(%rsp) + pushq %r9 # 16(%rsp) + pushq %r10 # 8(%rsp) + pushq %r11 # 0(%rsp) + + cmpl $__KERNEL_CS,96(%rsp) + jne 10f + + leaq 88(%rsp),%rdi # Pointer to %rip + call early_fixup_exception + andl %eax,%eax + jnz 20f # Found an exception entry + +10: +#ifdef CONFIG_EARLY_PRINTK + GET_CR2_INTO(%r9) # can clobber any volatile register if pv + movl 80(%rsp),%r8d # error code + movl 72(%rsp),%esi # vector number + movl 96(%rsp),%edx # %cs + movq 88(%rsp),%rcx # %rip xorl %eax,%eax leaq early_idt_msg(%rip),%rdi call early_printk @@ -308,17 +331,32 @@ ENTRY(early_idt_handler) call dump_stack #ifdef CONFIG_KALLSYMS leaq early_idt_ripmsg(%rip),%rdi - movq 0(%rsp),%rsi # get rip again + movq 40(%rsp),%rsi # %rip again call __print_symbol #endif #endif /* EARLY_PRINTK */ 1: hlt jmp 1b -#ifdef CONFIG_EARLY_PRINTK +20: # Exception table entry found + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rdi + popq %rsi + popq %rdx + popq %rcx + popq %rax + addq $16,%rsp # drop vector number and error code + decl early_recursion_flag(%rip) + INTERRUPT_RETURN + + .balign 4 early_recursion_flag: .long 0 +#ifdef CONFIG_EARLY_PRINTK early_idt_msg: .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n" early_idt_ripmsg: -- cgit v1.2.3 From f78146b0f9230765c6315b2e14f56112513389ad Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Apr 2012 19:22:47 +0300 Subject: KVM: Fix page-crossing MMIO MMIO that are split across a page boundary are currently broken - the code does not expect to be aborted by the exit to userspace for the first MMIO fragment. 
This patch fixes the problem by generalizing the current code for handling 16-byte MMIOs to handle a number of "fragments", and changes the MMIO code to create those fragments. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/ia64/include/asm/kvm_host.h | 2 + arch/ia64/kvm/kvm-ia64.c | 10 ++-- arch/x86/kvm/x86.c | 114 +++++++++++++++++++++++++++------------ include/linux/kvm_host.h | 31 +++++++++-- 4 files changed, 115 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index c4b4bac3d09e..6d6a5ac48d85 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -449,6 +449,8 @@ struct kvm_vcpu_arch { char log_buf[VMM_LOG_LEN]; union context host; union context guest; + + char mmio_data[8]; }; struct kvm_vm_stat { diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 9d80ff8d9eff..882ab21a8dcd 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -232,12 +232,12 @@ static int handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if ((p->addr & PAGE_MASK) == IOAPIC_DEFAULT_BASE_ADDRESS) goto mmio; vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = kvm_run->mmio.phys_addr = p->addr; - vcpu->mmio_size = kvm_run->mmio.len = p->size; + vcpu->mmio_fragments[0].gpa = kvm_run->mmio.phys_addr = p->addr; + vcpu->mmio_fragments[0].len = kvm_run->mmio.len = p->size; vcpu->mmio_is_write = kvm_run->mmio.is_write = !p->dir; if (vcpu->mmio_is_write) - memcpy(vcpu->mmio_data, &p->data, p->size); + memcpy(vcpu->arch.mmio_data, &p->data, p->size); memcpy(kvm_run->mmio.data, &p->data, p->size); kvm_run->exit_reason = KVM_EXIT_MMIO; return 0; @@ -719,7 +719,7 @@ static void kvm_set_mmio_data(struct kvm_vcpu *vcpu) struct kvm_mmio_req *p = kvm_get_vcpu_ioreq(vcpu); if (!vcpu->mmio_is_write) - memcpy(&p->data, vcpu->mmio_data, 8); + memcpy(&p->data, vcpu->arch.mmio_data, 8); p->state = STATE_IORESP_READY; } @@ -739,7 +739,7 
@@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } if (vcpu->mmio_needed) { - memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); + memcpy(vcpu->arch.mmio_data, kvm_run->mmio.data, 8); kvm_set_mmio_data(vcpu); vcpu->mmio_read_completed = 1; vcpu->mmio_needed = 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0d9a57875f0b..4de705cdcafd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3718,9 +3718,8 @@ struct read_write_emulator_ops { static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) { if (vcpu->mmio_read_completed) { - memcpy(val, vcpu->mmio_data, bytes); trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, - vcpu->mmio_phys_addr, *(u64 *)val); + vcpu->mmio_fragments[0].gpa, *(u64 *)val); vcpu->mmio_read_completed = 0; return 1; } @@ -3756,8 +3755,9 @@ static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, void *val, int bytes) { - memcpy(vcpu->mmio_data, val, bytes); - memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8); + struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0]; + + memcpy(vcpu->run->mmio.data, frag->data, frag->len); return X86EMUL_CONTINUE; } @@ -3784,10 +3784,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, gpa_t gpa; int handled, ret; bool write = ops->write; - - if (ops->read_write_prepare && - ops->read_write_prepare(vcpu, val, bytes)) - return X86EMUL_CONTINUE; + struct kvm_mmio_fragment *frag; ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); @@ -3813,15 +3810,19 @@ mmio: bytes -= handled; val += handled; - vcpu->mmio_needed = 1; - vcpu->run->exit_reason = KVM_EXIT_MMIO; - vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->run->mmio.len = min(vcpu->mmio_size, 8); - vcpu->run->mmio.is_write = vcpu->mmio_is_write = write; - vcpu->mmio_index = 0; + while (bytes) { + unsigned now = min(bytes, 8U); - return ops->read_write_exit_mmio(vcpu, gpa, 
val, bytes); + frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; + frag->gpa = gpa; + frag->data = val; + frag->len = now; + + gpa += now; + val += now; + bytes -= now; + } + return X86EMUL_CONTINUE; } int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, @@ -3830,10 +3831,18 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, struct read_write_emulator_ops *ops) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + gpa_t gpa; + int rc; + + if (ops->read_write_prepare && + ops->read_write_prepare(vcpu, val, bytes)) + return X86EMUL_CONTINUE; + + vcpu->mmio_nr_fragments = 0; /* Crossing a page boundary? */ if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { - int rc, now; + int now; now = -addr & ~PAGE_MASK; rc = emulator_read_write_onepage(addr, val, now, exception, @@ -3846,8 +3855,25 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, bytes -= now; } - return emulator_read_write_onepage(addr, val, bytes, exception, - vcpu, ops); + rc = emulator_read_write_onepage(addr, val, bytes, exception, + vcpu, ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + if (!vcpu->mmio_nr_fragments) + return rc; + + gpa = vcpu->mmio_fragments[0].gpa; + + vcpu->mmio_needed = 1; + vcpu->mmio_cur_fragment = 0; + + vcpu->run->mmio.len = vcpu->mmio_fragments[0].len; + vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write; + vcpu->run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->mmio.phys_addr = gpa; + + return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); } static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, @@ -5446,33 +5472,55 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) return r; } +/* + * Implements the following, as a state machine: + * + * read: + * for each fragment + * write gpa, len + * exit + * copy data + * execute insn + * + * write: + * for each fragment + * write gpa, len + * copy data + * exit + */ static int complete_mmio(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; 
+ struct kvm_mmio_fragment *frag; int r; if (!(vcpu->arch.pio.count || vcpu->mmio_needed)) return 1; if (vcpu->mmio_needed) { - vcpu->mmio_needed = 0; + /* Complete previous fragment */ + frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++]; if (!vcpu->mmio_is_write) - memcpy(vcpu->mmio_data + vcpu->mmio_index, - run->mmio.data, 8); - vcpu->mmio_index += 8; - if (vcpu->mmio_index < vcpu->mmio_size) { - run->exit_reason = KVM_EXIT_MMIO; - run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index; - memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8); - run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8); - run->mmio.is_write = vcpu->mmio_is_write; - vcpu->mmio_needed = 1; - return 0; + memcpy(frag->data, run->mmio.data, frag->len); + if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { + vcpu->mmio_needed = 0; + if (vcpu->mmio_is_write) + return 1; + vcpu->mmio_read_completed = 1; + goto done; } + /* Initiate next fragment */ + ++frag; + run->exit_reason = KVM_EXIT_MMIO; + run->mmio.phys_addr = frag->gpa; if (vcpu->mmio_is_write) - return 1; - vcpu->mmio_read_completed = 1; + memcpy(run->mmio.data, frag->data, frag->len); + run->mmio.len = frag->len; + run->mmio.is_write = vcpu->mmio_is_write; + return 0; + } +done: vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a2d00b1bbf54..186ffab0b9f0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -34,6 +34,20 @@ #define KVM_MMIO_SIZE 8 #endif +/* + * If we support unaligned MMIO, at most one fragment will be split into two: + */ +#ifdef KVM_UNALIGNED_MMIO +# define KVM_EXTRA_MMIO_FRAGMENTS 1 +#else +# define KVM_EXTRA_MMIO_FRAGMENTS 0 +#endif + +#define KVM_USER_MMIO_SIZE 8 + +#define KVM_MAX_MMIO_FRAGMENTS \ + (KVM_MMIO_SIZE / KVM_USER_MMIO_SIZE + KVM_EXTRA_MMIO_FRAGMENTS) + /* * vcpu->requests bit 
members */ @@ -117,6 +131,16 @@ enum { EXITING_GUEST_MODE }; +/* + * Sometimes a large or cross-page mmio needs to be broken up into separate + * exits for userspace servicing. + */ +struct kvm_mmio_fragment { + gpa_t gpa; + void *data; + unsigned len; +}; + struct kvm_vcpu { struct kvm *kvm; #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -144,10 +168,9 @@ struct kvm_vcpu { int mmio_needed; int mmio_read_completed; int mmio_is_write; - int mmio_size; - int mmio_index; - unsigned char mmio_data[KVM_MMIO_SIZE]; - gpa_t mmio_phys_addr; + int mmio_cur_fragment; + int mmio_nr_fragments; + struct kvm_mmio_fragment mmio_fragments[KVM_MAX_MMIO_FRAGMENTS]; #endif #ifdef CONFIG_KVM_ASYNC_PF -- cgit v1.2.3 From 4c5023a3fa2ec12b7ed313b276b157917575745b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 18 Apr 2012 17:16:50 -0700 Subject: x86-32: Handle exception table entries during early boot If we get an exception during early boot, walk the exception table to see if we should intercept it. The main use case for this is to allow rdmsr_safe()/wrmsr_safe() during CPU initialization. Since the exception table is currently sorted at runtime, and fairly late in startup, this code walks the exception table linearly. We obviously don't need to worry about modules, however: none have been loaded at this point. This patch changes the early IDT setup to look a lot more like x86-64: we now install handlers for all 32 exception vectors. The output of the early exception handler has changed somewhat as it directly reflects the stack frame of the exception handler, and the stack frame has been somewhat restructured. Finally, centralize the code that can and should be run only once. [ v2: Use early_fixup_exception() instead of linear search ] Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/r/1334794610-5546-6-git-send-email-hpa@zytor.com --- arch/x86/kernel/head_32.S | 223 +++++++++++++++++++++++++++------------------- 1 file changed, 129 insertions(+), 94 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index ce0be7cd085e..463c9797ca6a 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -21,6 +21,7 @@ #include #include #include +#include /* Physical address */ #define pa(X) ((X) - __PAGE_OFFSET) @@ -363,28 +364,23 @@ default_entry: pushl $0 popfl -#ifdef CONFIG_SMP - cmpb $0, ready - jnz checkCPUtype -#endif /* CONFIG_SMP */ - /* * start system 32-bit setup. We need to re-do some of the things done * in 16-bit mode for the "real" operations. */ - call setup_idt - -checkCPUtype: - - movl $-1,X86_CPUID # -1 for no CPUID initially - + movl setup_once_ref,%eax + andl %eax,%eax + jz 1f # Did we do this already? + call *%eax +1: + /* check if it is 486 or 386. */ /* * XXX - this does a lot of unnecessary setup. Alignment checks don't * apply at our cpl of 0 and the stack ought to be aligned already, and * we don't need to preserve eflags. */ - + movl $-1,X86_CPUID # -1 for no CPUID initially movb $3,X86 # at least 386 pushfl # push EFLAGS popl %eax # get EFLAGS @@ -450,21 +446,6 @@ is386: movl $2,%ecx # set MP movl $(__KERNEL_PERCPU), %eax movl %eax,%fs # set this cpu's percpu -#ifdef CONFIG_CC_STACKPROTECTOR - /* - * The linker can't handle this by relocation. Manually set - * base address in stack canary segment descriptor. 
- */ - cmpb $0,ready - jne 1f - movl $gdt_page,%eax - movl $stack_canary,%ecx - movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) - shrl $16, %ecx - movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) - movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) -1: -#endif movl $(__KERNEL_STACK_CANARY),%eax movl %eax,%gs @@ -473,7 +454,6 @@ is386: movl $2,%ecx # set MP cld # gcc2 wants the direction flag cleared at all times pushl $0 # fake return address for unwinder - movb $1, ready jmp *(initial_code) /* @@ -495,81 +475,122 @@ check_x87: .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */ ret + +#include "verify_cpu.S" + /* - * setup_idt + * setup_once * - * sets up a idt with 256 entries pointing to - * ignore_int, interrupt gates. It doesn't actually load - * idt - that can be done only after paging has been enabled - * and the kernel moved to PAGE_OFFSET. Interrupts - * are enabled elsewhere, when we can be relatively - * sure everything is ok. + * The setup work we only want to run on the BSP. * * Warning: %esi is live across this function. */ -setup_idt: - lea ignore_int,%edx - movl $(__KERNEL_CS << 16),%eax - movw %dx,%ax /* selector = 0x0010 = cs */ - movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ +__INIT +setup_once: + /* + * Set up a idt with 256 entries pointing to ignore_int, + * interrupt gates. It doesn't actually load idt - that needs + * to be done on each CPU. Interrupts are enabled elsewhere, + * when we can be relatively sure everything is ok. 
+ */ - lea idt_table,%edi - mov $256,%ecx -rp_sidt: + movl $idt_table,%edi + movl $early_idt_handlers,%eax + movl $NUM_EXCEPTION_VECTORS,%ecx +1: movl %eax,(%edi) - movl %edx,4(%edi) + movl %eax,4(%edi) + /* interrupt gate, dpl=0, present */ + movl $(0x8E000000 + __KERNEL_CS),2(%edi) + addl $9,%eax addl $8,%edi - dec %ecx - jne rp_sidt + loop 1b -.macro set_early_handler handler,trapno - lea \handler,%edx + movl $256 - NUM_EXCEPTION_VECTORS,%ecx + movl $ignore_int,%edx movl $(__KERNEL_CS << 16),%eax - movw %dx,%ax + movw %dx,%ax /* selector = 0x0010 = cs */ movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ - lea idt_table,%edi - movl %eax,8*\trapno(%edi) - movl %edx,8*\trapno+4(%edi) -.endm +2: + movl %eax,(%edi) + movl %edx,4(%edi) + addl $8,%edi + loop 2b - set_early_handler handler=early_divide_err,trapno=0 - set_early_handler handler=early_illegal_opcode,trapno=6 - set_early_handler handler=early_protection_fault,trapno=13 - set_early_handler handler=early_page_fault,trapno=14 +#ifdef CONFIG_CC_STACKPROTECTOR + /* + * Configure the stack canary. The linker can't handle this by + * relocation. Manually set base address in stack canary + * segment descriptor. 
+ */ + movl $gdt_page,%eax + movl $stack_canary,%ecx + movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) + shrl $16, %ecx + movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) + movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) +#endif + andl $0,setup_once_ref /* Once is enough, thanks */ ret -early_divide_err: - xor %edx,%edx - pushl $0 /* fake errcode */ - jmp early_fault +ENTRY(early_idt_handlers) + # 36(%esp) %eflags + # 32(%esp) %cs + # 28(%esp) %eip + # 24(%esp) error code + i = 0 + .rept NUM_EXCEPTION_VECTORS + .if (EXCEPTION_ERRCODE_MASK >> i) & 1 + ASM_NOP2 + .else + pushl $0 # Dummy error code, to make stack frame uniform + .endif + pushl $i # 20(%esp) Vector number + jmp early_idt_handler + i = i + 1 + .endr +ENDPROC(early_idt_handlers) + + /* This is global to keep gas from relaxing the jumps */ +ENTRY(early_idt_handler) + cld + cmpl $2,%ss:early_recursion_flag + je hlt_loop + incl %ss:early_recursion_flag -early_illegal_opcode: - movl $6,%edx - pushl $0 /* fake errcode */ - jmp early_fault + push %eax # 16(%esp) + push %ecx # 12(%esp) + push %edx # 8(%esp) + push %ds # 4(%esp) + push %es # 0(%esp) + movl $(__KERNEL_DS),%eax + movl %eax,%ds + movl %eax,%es -early_protection_fault: - movl $13,%edx - jmp early_fault + cmpl $(__KERNEL_CS),32(%esp) + jne 10f -early_page_fault: - movl $14,%edx - jmp early_fault + leal 28(%esp),%eax # Pointer to %eip + call early_fixup_exception + andl %eax,%eax + jnz ex_entry /* found an exception entry */ -early_fault: - cld +10: #ifdef CONFIG_PRINTK - pusha - movl $(__KERNEL_DS),%eax - movl %eax,%ds - movl %eax,%es - cmpl $2,early_recursion_flag - je hlt_loop - incl early_recursion_flag + xorl %eax,%eax + movw %ax,2(%esp) /* clean up the segment values on some cpus */ + movw %ax,6(%esp) + movw %ax,34(%esp) + leal 40(%esp),%eax + pushl %eax /* %esp before the exception */ + pushl %ebx + pushl %ebp + pushl %esi + pushl %edi movl %cr2,%eax pushl %eax - pushl %edx /* trapno */ + pushl (20+6*4)(%esp) /* trapno */ pushl $fault_msg call
printk #endif @@ -578,6 +599,17 @@ hlt_loop: hlt jmp hlt_loop +ex_entry: + pop %es + pop %ds + pop %edx + pop %ecx + pop %eax + addl $8,%esp /* drop vector number and error code */ + decl %ss:early_recursion_flag + iret +ENDPROC(early_idt_handler) + /* This is the default interrupt "handler" :-) */ ALIGN ignore_int: @@ -611,13 +643,18 @@ ignore_int: popl %eax #endif iret +ENDPROC(ignore_int) +__INITDATA + .align 4 +early_recursion_flag: + .long 0 -#include "verify_cpu.S" - - __REFDATA -.align 4 +__REFDATA + .align 4 ENTRY(initial_code) .long i386_start_kernel +ENTRY(setup_once_ref) + .long setup_once /* * BSS section @@ -670,22 +707,19 @@ ENTRY(initial_page_table) ENTRY(stack_start) .long init_thread_union+THREAD_SIZE -early_recursion_flag: - .long 0 - -ready: .byte 0 - +__INITRODATA int_msg: .asciz "Unknown interrupt or fault at: %p %p %p\n" fault_msg: /* fault info: */ .ascii "BUG: Int %d: CR2 %p\n" -/* pusha regs: */ - .ascii " EDI %p ESI %p EBP %p ESP %p\n" - .ascii " EBX %p EDX %p ECX %p EAX %p\n" +/* regs pushed in early_idt_handler: */ + .ascii " EDI %p ESI %p EBP %p EBX %p\n" + .ascii " ESP %p ES %p DS %p\n" + .ascii " EDX %p ECX %p EAX %p\n" /* fault frame: */ - .ascii " err %p EIP %p CS %p flg %p\n" + .ascii " vec %p err %p EIP %p CS %p flg %p\n" .ascii "Stack: %p %p %p %p %p %p %p %p\n" .ascii " %p %p %p %p %p %p %p %p\n" .asciz " %p %p %p %p %p %p %p %p\n" @@ -699,6 +733,7 @@ fault_msg: * segment size, and 32-bit linear address value: */ + .data .globl boot_gdt_descr .globl idt_descr -- cgit v1.2.3 From 060feb650010c261fcfbae9de9348b46cedcd3cd Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 19 Apr 2012 17:07:34 -0700 Subject: x86, doc: Revert "x86: Document rdmsr_safe restrictions" This reverts commit ce37defc0f6673f5ca2c92ed5cfcaf290ae7dd16 "x86: Document rdmsr_safe restrictions", as these restrictions no longer apply. Reported-by: Borislav Petkov Link: http://lkml.kernel.org/r/20120419171609.GH3221@aftab.osrc.amd.com Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/msr.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 95203d40ffdd..084ef95274cd 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -169,14 +169,7 @@ static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high) return native_write_msr_safe(msr, low, high); } -/* - * rdmsr with exception handling. - * - * Please note that the exception handling works only after we've - * switched to the "smart" #GP handler in trap_init() which knows about - * exception tables - using this macro earlier than that causes machine - * hangs on boxes which do not implement the @msr in the first argument. - */ +/* rdmsr with exception handling */ #define rdmsr_safe(msr, p1, p2) \ ({ \ int __err; \ -- cgit v1.2.3 From d4541805e812abb5110d5de83246488fa0aa9a8e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 20 Apr 2012 12:12:27 -0700 Subject: x86, extable: Use .pushsection ... .popsection for _ASM_EXTABLE() Instead of using .section ... .previous, use .pushsection ... .popsection; this is (hopefully) a bit more robust, especially in complex assembly code. Signed-off-by: H. 
Peter Anvin Cc: David Daney Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com --- arch/x86/include/asm/asm.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 9412d6558c88..ff3f6bffcbf9 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -42,17 +42,17 @@ /* Exception table entry */ #ifdef __ASSEMBLY__ -# define _ASM_EXTABLE(from,to) \ - __ASM_EX_SEC ; \ - _ASM_ALIGN ; \ - _ASM_PTR from , to ; \ - .previous +# define _ASM_EXTABLE(from,to) \ + .pushsection "__ex_table","a" ; \ + _ASM_ALIGN ; \ + _ASM_PTR from , to ; \ + .popsection #else -# define _ASM_EXTABLE(from,to) \ - __ASM_EX_SEC \ - _ASM_ALIGN "\n" \ - _ASM_PTR #from "," #to "\n" \ - " .previous\n" +# define _ASM_EXTABLE(from,to) \ + " .pushsection \"__ex_table\",\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR #from "," #to "\n" \ + " .popsection\n" #endif #endif /* _ASM_X86_ASM_H */ -- cgit v1.2.3 From 1ce6f86815a392acce2b45512106b525dc994cc0 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 20 Apr 2012 12:19:50 -0700 Subject: x86, extable: Remove open-coded exception table entries in arch/x86/ia32/ia32entry.S Remove open-coded exception table entries in arch/x86/ia32/ia32entry.S, and replace them with _ASM_EXTABLE() macros; this will allow us to change the format and type of the exception table entries. Signed-off-by: H. 
Peter Anvin Cc: David Daney Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com --- arch/x86/ia32/ia32entry.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index e3e734005e19..eb48edd0cad2 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -146,9 +147,7 @@ ENTRY(ia32_sysenter_target) /* no need to do an access_ok check here because rbp has been 32bit zero extended */ 1: movl (%rbp),%ebp - .section __ex_table,"a" - .quad 1b,ia32_badarg - .previous + _ASM_EXTABLE(1b,ia32_badarg) orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) CFI_REMEMBER_STATE -- cgit v1.2.3 From 6837a54dd6127f055dcb26d00fee0df05c07a674 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 20 Apr 2012 12:19:50 -0700 Subject: x86, extable: Remove open-coded exception table entries in arch/x86/kernel/entry_32.S Remove open-coded exception table entries in arch/x86/kernel/entry_32.S, and replace them with _ASM_EXTABLE() macros; this will allow us to change the format and type of the exception table entries. Signed-off-by: H. Peter Anvin Cc: David Daney Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com --- arch/x86/kernel/entry_32.S | 47 ++++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 7b784f4ef1e4..01ccf9b71473 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -56,6 +56,7 @@ #include #include #include +#include /* Avoid __ASSEMBLER__'ifying just for this. 
*/ #include @@ -151,10 +152,8 @@ .pushsection .fixup, "ax" 99: movl $0, (%esp) jmp 98b -.section __ex_table, "a" - .align 4 - .long 98b, 99b .popsection + _ASM_EXTABLE(98b,99b) .endm .macro PTGS_TO_GS @@ -164,10 +163,8 @@ .pushsection .fixup, "ax" 99: movl $0, PT_GS(%esp) jmp 98b -.section __ex_table, "a" - .align 4 - .long 98b, 99b .popsection + _ASM_EXTABLE(98b,99b) .endm .macro GS_TO_REG reg @@ -249,12 +246,10 @@ jmp 2b 6: movl $0, (%esp) jmp 3b -.section __ex_table, "a" - .align 4 - .long 1b, 4b - .long 2b, 5b - .long 3b, 6b .popsection + _ASM_EXTABLE(1b,4b) + _ASM_EXTABLE(2b,5b) + _ASM_EXTABLE(3b,6b) POP_GS_EX .endm @@ -415,10 +410,7 @@ sysenter_past_esp: jae syscall_fault 1: movl (%ebp),%ebp movl %ebp,PT_EBP(%esp) -.section __ex_table,"a" - .align 4 - .long 1b,syscall_fault -.previous + _ASM_EXTABLE(1b,syscall_fault) GET_THREAD_INFO(%ebp) @@ -485,10 +477,8 @@ sysexit_audit: .pushsection .fixup,"ax" 2: movl $0,PT_FS(%esp) jmp 1b -.section __ex_table,"a" - .align 4 - .long 1b,2b .popsection + _ASM_EXTABLE(1b,2b) PTGS_TO_GS_EX ENDPROC(ia32_sysenter_target) @@ -543,10 +533,7 @@ ENTRY(iret_exc) pushl $do_iret_error jmp error_code .previous -.section __ex_table,"a" - .align 4 - .long irq_return,iret_exc -.previous + _ASM_EXTABLE(irq_return,iret_exc) CFI_RESTORE_STATE ldt_ss: @@ -901,10 +888,7 @@ END(device_not_available) #ifdef CONFIG_PARAVIRT ENTRY(native_iret) iret -.section __ex_table,"a" - .align 4 - .long native_iret, iret_exc -.previous + _ASM_EXTABLE(native_iret, iret_exc) END(native_iret) ENTRY(native_irq_enable_sysexit) @@ -1093,13 +1077,10 @@ ENTRY(xen_failsafe_callback) movl %eax,16(%esp) jmp 4b .previous -.section __ex_table,"a" - .align 4 - .long 1b,6b - .long 2b,7b - .long 3b,8b - .long 4b,9b -.previous + _ASM_EXTABLE(1b,6b) + _ASM_EXTABLE(2b,7b) + _ASM_EXTABLE(3b,8b) + _ASM_EXTABLE(4b,9b) ENDPROC(xen_failsafe_callback) BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, -- cgit v1.2.3 From d7abc0fa997972ddb6d3c403e03a6eefda0c0881 Mon 
Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 20 Apr 2012 12:19:50 -0700 Subject: x86, extable: Remove open-coded exception table entries in arch/x86/kernel/entry_64.S Remove open-coded exception table entries in arch/x86/kernel/entry_64.S, and replace them with _ASM_EXTABLE() macros; this will allow us to change the format and type of the exception table entries. Signed-off-by: H. Peter Anvin Cc: David Daney Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com --- arch/x86/kernel/entry_64.S | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index cdc79b5cfcd9..320852d02026 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -55,6 +55,7 @@ #include #include #include +#include #include /* Avoid __ASSEMBLER__'ifying just for this. */ @@ -900,18 +901,12 @@ restore_args: irq_return: INTERRUPT_RETURN - - .section __ex_table, "a" - .quad irq_return, bad_iret - .previous + _ASM_EXTABLE(irq_return, bad_iret) #ifdef CONFIG_PARAVIRT ENTRY(native_iret) iretq - - .section __ex_table,"a" - .quad native_iret, bad_iret - .previous + _ASM_EXTABLE(native_iret, bad_iret) #endif .section .fixup,"ax" @@ -1181,10 +1176,7 @@ gs_change: CFI_ENDPROC END(native_load_gs_index) - .section __ex_table,"a" - .align 8 - .quad gs_change,bad_gs - .previous + _ASM_EXTABLE(gs_change,bad_gs) .section .fixup,"ax" /* running with kernelgs */ bad_gs: -- cgit v1.2.3 From 5d6f8d77ede50417dcca4c31a74f0d40a1ee537a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 20 Apr 2012 12:19:50 -0700 Subject: x86, extable: Remove open-coded exception table entries in arch/x86/kernel/test_rodata.c Remove open-coded exception table entries in arch/x86/kernel/test_rodata.c, and replace them with _ASM_EXTABLE() macros; this will allow us to change the format and type of the exception table entries. Signed-off-by: H. 
Peter Anvin Cc: David Daney Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com --- arch/x86/kernel/test_rodata.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c index c29e235792af..b79133abda48 100644 --- a/arch/x86/kernel/test_rodata.c +++ b/arch/x86/kernel/test_rodata.c @@ -12,6 +12,7 @@ #include #include #include +#include int rodata_test(void) { @@ -42,14 +43,7 @@ int rodata_test(void) ".section .fixup,\"ax\"\n" "2: jmp 1b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 16\n" -#ifdef CONFIG_X86_32 - " .long 0b,2b\n" -#else - " .quad 0b,2b\n" -#endif - ".previous" + _ASM_EXTABLE(0b,2b) : [rslt] "=r" (result) : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL) ); -- cgit v1.2.3 From 5f2e8a84f07bb43f9c0ce317d7e0c5e541db00e3 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 20 Apr 2012 12:19:50 -0700 Subject: x86, extable: Remove open-coded exception table entries in arch/x86/lib/checksum_32.S Remove open-coded exception table entries in arch/x86/lib/checksum_32.S, and replace them with _ASM_EXTABLE() macros; this will allow us to change the format and type of the exception table entries. Signed-off-by: H. Peter Anvin Cc: David Daney Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com --- arch/x86/lib/checksum_32.S | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index 78d16a554db0..2af5df3ade7c 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -28,6 +28,7 @@ #include #include #include +#include /* * computes a partial checksum, e.g. for TCP/UDP fragments @@ -282,15 +283,11 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst, #define SRC(y...) 
\ 9999: y; \ - .section __ex_table, "a"; \ - .long 9999b, 6001f ; \ - .previous + _ASM_EXTABLE(9999b, 6001f) #define DST(y...) \ 9999: y; \ - .section __ex_table, "a"; \ - .long 9999b, 6002f ; \ - .previous + _ASM_EXTABLE(9999b, 6002f) #ifndef CONFIG_X86_USE_PPRO_CHECKSUM -- cgit v1.2.3 From 9732da8ca860053515431298ec969e1f3e6bc64a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 20 Apr 2012 12:19:51 -0700 Subject: x86, extable: Remove open-coded exception table entries in arch/x86/lib/copy_user_64.S Remove open-coded exception table entries in arch/x86/lib/copy_user_64.S, and replace them with _ASM_EXTABLE() macros; this will allow us to change the format and type of the exception table entries. Signed-off-by: H. Peter Anvin Cc: David Daney Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com --- arch/x86/lib/copy_user_64.S | 63 +++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 024840266ba0..5b2995f4557a 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -16,6 +16,7 @@ #include #include #include +#include /* * By placing feature2 after feature1 in altinstructions section, we logically @@ -63,11 +64,8 @@ jmp copy_user_handle_tail .previous - .section __ex_table,"a" - .align 8 - .quad 100b,103b - .quad 101b,103b - .previous + _ASM_EXTABLE(100b,103b) + _ASM_EXTABLE(101b,103b) #endif .endm @@ -191,29 +189,26 @@ ENTRY(copy_user_generic_unrolled) 60: jmp copy_user_handle_tail /* ecx is zerorest also */ .previous - .section __ex_table,"a" - .align 8 - .quad 1b,30b - .quad 2b,30b - .quad 3b,30b - .quad 4b,30b - .quad 5b,30b - .quad 6b,30b - .quad 7b,30b - .quad 8b,30b - .quad 9b,30b - .quad 10b,30b - .quad 11b,30b - .quad 12b,30b - .quad 13b,30b - .quad 14b,30b - .quad 15b,30b - .quad 16b,30b - .quad 18b,40b - .quad 19b,40b - .quad 21b,50b - 
.quad 22b,50b - .previous + _ASM_EXTABLE(1b,30b) + _ASM_EXTABLE(2b,30b) + _ASM_EXTABLE(3b,30b) + _ASM_EXTABLE(4b,30b) + _ASM_EXTABLE(5b,30b) + _ASM_EXTABLE(6b,30b) + _ASM_EXTABLE(7b,30b) + _ASM_EXTABLE(8b,30b) + _ASM_EXTABLE(9b,30b) + _ASM_EXTABLE(10b,30b) + _ASM_EXTABLE(11b,30b) + _ASM_EXTABLE(12b,30b) + _ASM_EXTABLE(13b,30b) + _ASM_EXTABLE(14b,30b) + _ASM_EXTABLE(15b,30b) + _ASM_EXTABLE(16b,30b) + _ASM_EXTABLE(18b,40b) + _ASM_EXTABLE(19b,40b) + _ASM_EXTABLE(21b,50b) + _ASM_EXTABLE(22b,50b) CFI_ENDPROC ENDPROC(copy_user_generic_unrolled) @@ -259,11 +254,8 @@ ENTRY(copy_user_generic_string) jmp copy_user_handle_tail .previous - .section __ex_table,"a" - .align 8 - .quad 1b,11b - .quad 3b,12b - .previous + _ASM_EXTABLE(1b,11b) + _ASM_EXTABLE(3b,12b) CFI_ENDPROC ENDPROC(copy_user_generic_string) @@ -294,9 +286,6 @@ ENTRY(copy_user_enhanced_fast_string) jmp copy_user_handle_tail .previous - .section __ex_table,"a" - .align 8 - .quad 1b,12b - .previous + _ASM_EXTABLE(1b,12b) CFI_ENDPROC ENDPROC(copy_user_enhanced_fast_string) -- cgit v1.2.3 From 0d8559feafbc9dc5a2c17ba42aea7de824b18308 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 20 Apr 2012 12:19:51 -0700 Subject: x86, extable: Remove open-coded exception table entries in arch/x86/lib/copy_user_nocache_64.S Remove open-coded exception table entries in arch/x86/lib/copy_user_nocache_64.S, and replace them with _ASM_EXTABLE() macros; this will allow us to change the format and type of the exception table entries. Signed-off-by: H. 
Peter Anvin Cc: David Daney Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com --- arch/x86/lib/copy_user_nocache_64.S | 50 +++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S index cb0c112386fb..cacddc7163eb 100644 --- a/arch/x86/lib/copy_user_nocache_64.S +++ b/arch/x86/lib/copy_user_nocache_64.S @@ -14,6 +14,7 @@ #include #include #include +#include .macro ALIGN_DESTINATION #ifdef FIX_ALIGNMENT @@ -36,11 +37,8 @@ jmp copy_user_handle_tail .previous - .section __ex_table,"a" - .align 8 - .quad 100b,103b - .quad 101b,103b - .previous + _ASM_EXTABLE(100b,103b) + _ASM_EXTABLE(101b,103b) #endif .endm @@ -111,27 +109,25 @@ ENTRY(__copy_user_nocache) jmp copy_user_handle_tail .previous - .section __ex_table,"a" - .quad 1b,30b - .quad 2b,30b - .quad 3b,30b - .quad 4b,30b - .quad 5b,30b - .quad 6b,30b - .quad 7b,30b - .quad 8b,30b - .quad 9b,30b - .quad 10b,30b - .quad 11b,30b - .quad 12b,30b - .quad 13b,30b - .quad 14b,30b - .quad 15b,30b - .quad 16b,30b - .quad 18b,40b - .quad 19b,40b - .quad 21b,50b - .quad 22b,50b - .previous + _ASM_EXTABLE(1b,30b) + _ASM_EXTABLE(2b,30b) + _ASM_EXTABLE(3b,30b) + _ASM_EXTABLE(4b,30b) + _ASM_EXTABLE(5b,30b) + _ASM_EXTABLE(6b,30b) + _ASM_EXTABLE(7b,30b) + _ASM_EXTABLE(8b,30b) + _ASM_EXTABLE(9b,30b) + _ASM_EXTABLE(10b,30b) + _ASM_EXTABLE(11b,30b) + _ASM_EXTABLE(12b,30b) + _ASM_EXTABLE(13b,30b) + _ASM_EXTABLE(14b,30b) + _ASM_EXTABLE(15b,30b) + _ASM_EXTABLE(16b,30b) + _ASM_EXTABLE(18b,40b) + _ASM_EXTABLE(19b,40b) + _ASM_EXTABLE(21b,50b) + _ASM_EXTABLE(22b,50b) CFI_ENDPROC ENDPROC(__copy_user_nocache) -- cgit v1.2.3 From 015e6f11a9737684469feef9d523373b1746159d Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Fri, 20 Apr 2012 12:19:51 -0700 Subject: x86, extable: Remove open-coded exception table entries in arch/x86/lib/csum-copy_64.S Remove open-coded exception table entries in arch/x86/lib/csum-copy_64.S, and replace them with _ASM_EXTABLE() macros; this will allow us to change the format and type of the exception table entries. Signed-off-by: H. Peter Anvin Cc: David Daney Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com --- arch/x86/lib/csum-copy_64.S | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S index fb903b758da8..2419d5fefae3 100644 --- a/arch/x86/lib/csum-copy_64.S +++ b/arch/x86/lib/csum-copy_64.S @@ -8,6 +8,7 @@ #include #include #include +#include /* * Checksum copy with exception handling. @@ -31,26 +32,17 @@ .macro source 10: - .section __ex_table, "a" - .align 8 - .quad 10b, .Lbad_source - .previous + _ASM_EXTABLE(10b, .Lbad_source) .endm .macro dest 20: - .section __ex_table, "a" - .align 8 - .quad 20b, .Lbad_dest - .previous + _ASM_EXTABLE(20b, .Lbad_dest) .endm .macro ignore L=.Lignore 30: - .section __ex_table, "a" - .align 8 - .quad 30b, \L - .previous + _ASM_EXTABLE(30b, \L) .endm -- cgit v1.2.3 From 1a27bc0d99aabea6b628cb994a21a1c79b569fc9 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 20 Apr 2012 12:19:51 -0700 Subject: x86, extable: Remove open-coded exception table entries in arch/x86/lib/getuser.S Remove open-coded exception table entries in arch/x86/lib/getuser.S, and replace them with _ASM_EXTABLE() macros; this will allow us to change the format and type of the exception table entries. Signed-off-by: H. 
Peter Anvin Cc: David Daney Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com --- arch/x86/lib/getuser.S | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 51f1504cddd9..b33b1fb1e6d4 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -95,10 +95,9 @@ bad_get_user: CFI_ENDPROC END(bad_get_user) -.section __ex_table,"a" - _ASM_PTR 1b,bad_get_user - _ASM_PTR 2b,bad_get_user - _ASM_PTR 3b,bad_get_user + _ASM_EXTABLE(1b,bad_get_user) + _ASM_EXTABLE(2b,bad_get_user) + _ASM_EXTABLE(3b,bad_get_user) #ifdef CONFIG_X86_64 - _ASM_PTR 4b,bad_get_user + _ASM_EXTABLE(4b,bad_get_user) #endif -- cgit v1.2.3 From a53a96e5413d3639ed75d202bbfe68aa0a56c091 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 20 Apr 2012 12:19:52 -0700 Subject: x86, extable: Remove open-coded exception table entries in arch/x86/lib/putuser.S Remove open-coded exception table entries in arch/x86/lib/putuser.S, and replace them with _ASM_EXTABLE() macros; this will allow us to change the format and type of the exception table entries. Signed-off-by: H. 
Peter Anvin Cc: David Daney Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com --- arch/x86/lib/putuser.S | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index 36b0d15ae6e9..7f951c8f76c4 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -86,12 +86,10 @@ bad_put_user: EXIT END(bad_put_user) -.section __ex_table,"a" - _ASM_PTR 1b,bad_put_user - _ASM_PTR 2b,bad_put_user - _ASM_PTR 3b,bad_put_user - _ASM_PTR 4b,bad_put_user + _ASM_EXTABLE(1b,bad_put_user) + _ASM_EXTABLE(2b,bad_put_user) + _ASM_EXTABLE(3b,bad_put_user) + _ASM_EXTABLE(4b,bad_put_user) #ifdef CONFIG_X86_32 - _ASM_PTR 5b,bad_put_user + _ASM_EXTABLE(5b,bad_put_user) #endif -.previous -- cgit v1.2.3 From 9c6751280b6206e2a96f9600938003a29968e4fa Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 20 Apr 2012 12:19:52 -0700 Subject: x86, extable: Remove open-coded exception table entries in arch/x86/lib/usercopy_32.c Remove open-coded exception table entries in arch/x86/lib/usercopy_32.c, and replace them with _ASM_EXTABLE() macros; this will allow us to change the format and type of the exception table entries. Signed-off-by: H. 
Peter Anvin Cc: David Daney Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com --- arch/x86/lib/usercopy_32.c | 232 +++++++++++++++++++++------------------------ 1 file changed, 106 insertions(+), 126 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index ef2a6a5d78e3..883b216c60b2 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef CONFIG_X86_INTEL_USERCOPY /* @@ -127,10 +128,7 @@ long strnlen_user(const char __user *s, long n) "3: movb $1,%%al\n" " jmp 1b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 0b,2b\n" - ".previous" + _ASM_EXTABLE(0b,2b) :"=&r" (n), "=&D" (s), "=&a" (res), "=&c" (tmp) :"0" (n), "1" (s), "2" (0), "3" (mask) :"cc"); @@ -199,47 +197,44 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size) "101: lea 0(%%eax,%0,4),%0\n" " jmp 100b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b,100b\n" - " .long 2b,100b\n" - " .long 3b,100b\n" - " .long 4b,100b\n" - " .long 5b,100b\n" - " .long 6b,100b\n" - " .long 7b,100b\n" - " .long 8b,100b\n" - " .long 9b,100b\n" - " .long 10b,100b\n" - " .long 11b,100b\n" - " .long 12b,100b\n" - " .long 13b,100b\n" - " .long 14b,100b\n" - " .long 15b,100b\n" - " .long 16b,100b\n" - " .long 17b,100b\n" - " .long 18b,100b\n" - " .long 19b,100b\n" - " .long 20b,100b\n" - " .long 21b,100b\n" - " .long 22b,100b\n" - " .long 23b,100b\n" - " .long 24b,100b\n" - " .long 25b,100b\n" - " .long 26b,100b\n" - " .long 27b,100b\n" - " .long 28b,100b\n" - " .long 29b,100b\n" - " .long 30b,100b\n" - " .long 31b,100b\n" - " .long 32b,100b\n" - " .long 33b,100b\n" - " .long 34b,100b\n" - " .long 35b,100b\n" - " .long 36b,100b\n" - " .long 37b,100b\n" - " .long 99b,101b\n" - ".previous" + _ASM_EXTABLE(1b,100b) + _ASM_EXTABLE(2b,100b) + _ASM_EXTABLE(3b,100b) + 
_ASM_EXTABLE(4b,100b) + _ASM_EXTABLE(5b,100b) + _ASM_EXTABLE(6b,100b) + _ASM_EXTABLE(7b,100b) + _ASM_EXTABLE(8b,100b) + _ASM_EXTABLE(9b,100b) + _ASM_EXTABLE(10b,100b) + _ASM_EXTABLE(11b,100b) + _ASM_EXTABLE(12b,100b) + _ASM_EXTABLE(13b,100b) + _ASM_EXTABLE(14b,100b) + _ASM_EXTABLE(15b,100b) + _ASM_EXTABLE(16b,100b) + _ASM_EXTABLE(17b,100b) + _ASM_EXTABLE(18b,100b) + _ASM_EXTABLE(19b,100b) + _ASM_EXTABLE(20b,100b) + _ASM_EXTABLE(21b,100b) + _ASM_EXTABLE(22b,100b) + _ASM_EXTABLE(23b,100b) + _ASM_EXTABLE(24b,100b) + _ASM_EXTABLE(25b,100b) + _ASM_EXTABLE(26b,100b) + _ASM_EXTABLE(27b,100b) + _ASM_EXTABLE(28b,100b) + _ASM_EXTABLE(29b,100b) + _ASM_EXTABLE(30b,100b) + _ASM_EXTABLE(31b,100b) + _ASM_EXTABLE(32b,100b) + _ASM_EXTABLE(33b,100b) + _ASM_EXTABLE(34b,100b) + _ASM_EXTABLE(35b,100b) + _ASM_EXTABLE(36b,100b) + _ASM_EXTABLE(37b,100b) + _ASM_EXTABLE(99b,101b) : "=&c"(size), "=&D" (d0), "=&S" (d1) : "1"(to), "2"(from), "0"(size) : "eax", "edx", "memory"); @@ -312,29 +307,26 @@ __copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size) " popl %0\n" " jmp 8b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 0b,16b\n" - " .long 1b,16b\n" - " .long 2b,16b\n" - " .long 21b,16b\n" - " .long 3b,16b\n" - " .long 31b,16b\n" - " .long 4b,16b\n" - " .long 41b,16b\n" - " .long 10b,16b\n" - " .long 51b,16b\n" - " .long 11b,16b\n" - " .long 61b,16b\n" - " .long 12b,16b\n" - " .long 71b,16b\n" - " .long 13b,16b\n" - " .long 81b,16b\n" - " .long 14b,16b\n" - " .long 91b,16b\n" - " .long 6b,9b\n" - " .long 7b,16b\n" - ".previous" + _ASM_EXTABLE(0b,16b) + _ASM_EXTABLE(1b,16b) + _ASM_EXTABLE(2b,16b) + _ASM_EXTABLE(21b,16b) + _ASM_EXTABLE(3b,16b) + _ASM_EXTABLE(31b,16b) + _ASM_EXTABLE(4b,16b) + _ASM_EXTABLE(41b,16b) + _ASM_EXTABLE(10b,16b) + _ASM_EXTABLE(51b,16b) + _ASM_EXTABLE(11b,16b) + _ASM_EXTABLE(61b,16b) + _ASM_EXTABLE(12b,16b) + _ASM_EXTABLE(71b,16b) + _ASM_EXTABLE(13b,16b) + _ASM_EXTABLE(81b,16b) + _ASM_EXTABLE(14b,16b) + 
_ASM_EXTABLE(91b,16b) + _ASM_EXTABLE(6b,9b) + _ASM_EXTABLE(7b,16b) : "=&c"(size), "=&D" (d0), "=&S" (d1) : "1"(to), "2"(from), "0"(size) : "eax", "edx", "memory"); @@ -414,29 +406,26 @@ static unsigned long __copy_user_zeroing_intel_nocache(void *to, " popl %0\n" " jmp 8b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 0b,16b\n" - " .long 1b,16b\n" - " .long 2b,16b\n" - " .long 21b,16b\n" - " .long 3b,16b\n" - " .long 31b,16b\n" - " .long 4b,16b\n" - " .long 41b,16b\n" - " .long 10b,16b\n" - " .long 51b,16b\n" - " .long 11b,16b\n" - " .long 61b,16b\n" - " .long 12b,16b\n" - " .long 71b,16b\n" - " .long 13b,16b\n" - " .long 81b,16b\n" - " .long 14b,16b\n" - " .long 91b,16b\n" - " .long 6b,9b\n" - " .long 7b,16b\n" - ".previous" + _ASM_EXTABLE(0b,16b) + _ASM_EXTABLE(1b,16b) + _ASM_EXTABLE(2b,16b) + _ASM_EXTABLE(21b,16b) + _ASM_EXTABLE(3b,16b) + _ASM_EXTABLE(31b,16b) + _ASM_EXTABLE(4b,16b) + _ASM_EXTABLE(41b,16b) + _ASM_EXTABLE(10b,16b) + _ASM_EXTABLE(51b,16b) + _ASM_EXTABLE(11b,16b) + _ASM_EXTABLE(61b,16b) + _ASM_EXTABLE(12b,16b) + _ASM_EXTABLE(71b,16b) + _ASM_EXTABLE(13b,16b) + _ASM_EXTABLE(81b,16b) + _ASM_EXTABLE(14b,16b) + _ASM_EXTABLE(91b,16b) + _ASM_EXTABLE(6b,9b) + _ASM_EXTABLE(7b,16b) : "=&c"(size), "=&D" (d0), "=&S" (d1) : "1"(to), "2"(from), "0"(size) : "eax", "edx", "memory"); @@ -505,29 +494,26 @@ static unsigned long __copy_user_intel_nocache(void *to, "9: lea 0(%%eax,%0,4),%0\n" "16: jmp 8b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 0b,16b\n" - " .long 1b,16b\n" - " .long 2b,16b\n" - " .long 21b,16b\n" - " .long 3b,16b\n" - " .long 31b,16b\n" - " .long 4b,16b\n" - " .long 41b,16b\n" - " .long 10b,16b\n" - " .long 51b,16b\n" - " .long 11b,16b\n" - " .long 61b,16b\n" - " .long 12b,16b\n" - " .long 71b,16b\n" - " .long 13b,16b\n" - " .long 81b,16b\n" - " .long 14b,16b\n" - " .long 91b,16b\n" - " .long 6b,9b\n" - " .long 7b,16b\n" - ".previous" + _ASM_EXTABLE(0b,16b) + _ASM_EXTABLE(1b,16b) + 
_ASM_EXTABLE(2b,16b) + _ASM_EXTABLE(21b,16b) + _ASM_EXTABLE(3b,16b) + _ASM_EXTABLE(31b,16b) + _ASM_EXTABLE(4b,16b) + _ASM_EXTABLE(41b,16b) + _ASM_EXTABLE(10b,16b) + _ASM_EXTABLE(51b,16b) + _ASM_EXTABLE(11b,16b) + _ASM_EXTABLE(61b,16b) + _ASM_EXTABLE(12b,16b) + _ASM_EXTABLE(71b,16b) + _ASM_EXTABLE(13b,16b) + _ASM_EXTABLE(81b,16b) + _ASM_EXTABLE(14b,16b) + _ASM_EXTABLE(91b,16b) + _ASM_EXTABLE(6b,9b) + _ASM_EXTABLE(7b,16b) : "=&c"(size), "=&D" (d0), "=&S" (d1) : "1"(to), "2"(from), "0"(size) : "eax", "edx", "memory"); @@ -574,12 +560,9 @@ do { \ "3: lea 0(%3,%0,4),%0\n" \ " jmp 2b\n" \ ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 4b,5b\n" \ - " .long 0b,3b\n" \ - " .long 1b,2b\n" \ - ".previous" \ + _ASM_EXTABLE(4b,5b) \ + _ASM_EXTABLE(0b,3b) \ + _ASM_EXTABLE(1b,2b) \ : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \ : "3"(size), "0"(size), "1"(to), "2"(from) \ : "memory"); \ @@ -616,12 +599,9 @@ do { \ " popl %0\n" \ " jmp 2b\n" \ ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 4b,5b\n" \ - " .long 0b,3b\n" \ - " .long 1b,6b\n" \ - ".previous" \ + _ASM_EXTABLE(4b,5b) \ + _ASM_EXTABLE(0b,3b) \ + _ASM_EXTABLE(1b,6b) \ : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \ : "3"(size), "0"(size), "1"(to), "2"(from) \ : "memory"); \ -- cgit v1.2.3 From f542c5d6e57ea32daae3708a71911d9f5c883c5a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 20 Apr 2012 12:19:52 -0700 Subject: x86, extable: Remove open-coded exception table entries in arch/x86/um/checksum_32.S Remove open-coded exception table entries in arch/x86/um/checksum_32.S, and replace them with _ASM_EXTABLE() macros; this will allow us to change the format and type of the exception table entries. Signed-off-by: H. 
Peter Anvin Cc: David Daney Cc: Richard Weinberger Cc: Al Viro Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com --- arch/x86/um/checksum_32.S | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/checksum_32.S b/arch/x86/um/checksum_32.S index f058d2f82e18..8d0c420465cc 100644 --- a/arch/x86/um/checksum_32.S +++ b/arch/x86/um/checksum_32.S @@ -26,6 +26,7 @@ */ #include +#include /* * computes a partial checksum, e.g. for TCP/UDP fragments @@ -232,15 +233,11 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst, #define SRC(y...) \ 9999: y; \ - .section __ex_table, "a"; \ - .long 9999b, 6001f ; \ - .previous + _ASM_EXTABLE(9999b, 6001f) #define DST(y...) \ 9999: y; \ - .section __ex_table, "a"; \ - .long 9999b, 6002f ; \ - .previous + _ASM_EXTABLE(9999b, 6002f) .align 4 -- cgit v1.2.3 From 8f6380b9ec1cc4bed9b38144f739b87dd2cddb1d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 20 Apr 2012 12:19:52 -0700 Subject: x86, extable: Remove open-coded exception table entries in arch/x86/xen/xen-asm_32.S Remove open-coded exception table entries in arch/x86/xen/xen-asm_32.S, and replace them with _ASM_EXTABLE() macros; this will allow us to change the format and type of the exception table entries. Signed-off-by: H. 
Peter Anvin Cc: David Daney Cc: Konrad Rzeszutek Wilk Cc: Jeremy Fitzhardinge Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com --- arch/x86/xen/xen-asm_32.S | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index b040b0e518ca..f9643fc50de5 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -137,10 +138,7 @@ iret_restore_end: 1: iret xen_iret_end_crit: -.section __ex_table, "a" - .align 4 - .long 1b, iret_exc -.previous + _ASM_EXTABLE(1b, iret_exc) hyper_iret: /* put this out of line since its very rarely used */ -- cgit v1.2.3 From 447657e31235c692f579c639250317c7f565cd0d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 20 Apr 2012 12:20:30 -0700 Subject: x86, extable: Remove the now-unused __ASM_EX_SEC macros Nothing should use them anymore; only _ASM_EXTABLE() should ever be used. Signed-off-by: H. Peter Anvin Cc: David Daney Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com --- arch/x86/include/asm/asm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index ff3f6bffcbf9..53dce41f2517 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -4,11 +4,9 @@ #ifdef __ASSEMBLY__ # define __ASM_FORM(x) x # define __ASM_FORM_COMMA(x) x, -# define __ASM_EX_SEC .section __ex_table, "a" #else # define __ASM_FORM(x) " " #x " " # define __ASM_FORM_COMMA(x) " " #x "," -# define __ASM_EX_SEC " .section __ex_table,\"a\"\n" #endif #ifdef CONFIG_X86_32 -- cgit v1.2.3 From 3ee89722cfb165295cc8eb498018c0bdafc57062 Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Fri, 20 Apr 2012 13:41:59 -0700 Subject: x86, extable: Remove open-coded exception table entries in arch/x86/include/asm/kvm_host.h Remove open-coded exception table entries in arch/x86/include/asm/kvm_host.h, and replace them with _ASM_EXTABLE() macros; this will allow us to change the format and type of the exception table entries. Signed-off-by: H. Peter Anvin Cc: David Daney Cc: Avi Kivity Cc: Marcelo Tosatti Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com --- arch/x86/include/asm/kvm_host.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e216ba066e79..e5b97be12d2a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -27,6 +27,7 @@ #include #include #include +#include #define KVM_MAX_VCPUS 254 #define KVM_SOFT_MAX_VCPUS 160 @@ -921,9 +922,7 @@ extern bool kvm_rebooting; __ASM_SIZE(push) " $666b \n\t" \ "call kvm_spurious_fault \n\t" \ ".popsection \n\t" \ - ".pushsection __ex_table, \"a\" \n\t" \ - _ASM_PTR " 666b, 667b \n\t" \ - ".popsection" + _ASM_EXTABLE(666b, 667b) #define __kvm_handle_fault_on_reboot(insn) \ ____kvm_handle_fault_on_reboot(insn, "") -- cgit v1.2.3 From 7a040a4384c7c4973deb4d58a76e1b0ee3c8aa39 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 20 Apr 2012 13:42:25 -0700 Subject: x86, extable: Remove open-coded exception table entries in arch/x86/include/asm/xsave.h Remove open-coded exception table entries in arch/x86/include/asm/xsave.h, and replace them with _ASM_EXTABLE() macros; this will allow us to change the format and type of the exception table entries. Signed-off-by: H. 
Peter Anvin Cc: David Daney Cc: Suresh Siddha Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com --- arch/x86/include/asm/xsave.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index c6ce2452f10c..8a1b6f9b594a 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -80,10 +80,7 @@ static inline int xsave_user(struct xsave_struct __user *buf) "3: movl $-1,%[err]\n" " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - _ASM_ALIGN "\n" - _ASM_PTR "1b,3b\n" - ".previous" + _ASM_EXTABLE(1b,3b) : [err] "=r" (err) : "D" (buf), "a" (-1), "d" (-1), "0" (0) : "memory"); @@ -106,10 +103,7 @@ static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask) "3: movl $-1,%[err]\n" " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - _ASM_ALIGN "\n" - _ASM_PTR "1b,3b\n" - ".previous" + _ASM_EXTABLE(1b,3b) : [err] "=r" (err) : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0) : "memory"); /* memory required? */ -- cgit v1.2.3 From 8571723a698dcc0ee16c1c63908aa99dd940ce5c Mon Sep 17 00:00:00 2001 From: Chen Gong Date: Fri, 20 Apr 2012 16:02:05 -0700 Subject: x86/mce Add validation check before GHES error is recorded When GHES error record is logged into mcelog kernel buffer, a validation check for physical address is necessary, which prevents reporting an invalid physical address. 
[Since physical address is the only useful element in this error record, we drop generating the record completely if we don't have a valid address] Signed-off-by: Chen Gong Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce-apei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 507ea58688e2..cd8b166a1735 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -42,7 +42,8 @@ void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err) struct mce m; /* Only corrected MC is reported */ - if (!corrected) + if (!corrected || !(mem_err->validation_bits & + CPER_MEM_VALID_PHYSICAL_ADDRESS)) return; mce_setup(&m); -- cgit v1.2.3 From a3e859fed1244b72253718e076a724ffe13a9584 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 20 Apr 2012 16:51:50 -0700 Subject: x86, extable: Remove open-coded exception table entries in arch/x86/ia32/ia32entry.S Remove open-coded exception table entries in arch/x86/ia32/ia32entry.S, and replace them with _ASM_EXTABLE() macros; this will allow us to change the format and type of the exception table entries. This one was missed from the previous patch to this file. Signed-off-by: H. 
Peter Anvin Cc: David Daney Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com --- arch/x86/ia32/ia32entry.S | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index eb48edd0cad2..20e5f7ba0e6b 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -302,9 +302,7 @@ ENTRY(ia32_cstar_target) 32bit zero extended */ /* hardware stack frame is complete now */ 1: movl (%r8),%r9d - .section __ex_table,"a" - .quad 1b,ia32_badarg - .previous + _ASM_EXTABLE(1b,ia32_badarg) orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) CFI_REMEMBER_STATE -- cgit v1.2.3 From 535c0c34698061544f81a51c65fc51f4eeeebff6 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 20 Apr 2012 16:57:35 -0700 Subject: x86, extable: Add _ASM_EXTABLE_EX() macro Add _ASM_EXTABLE_EX() to generate the special extable entries that are associated with uaccess_err. This allows us to change the protocol associated with these special entries. Signed-off-by: H. 
Peter Anvin Cc: David Daney Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com --- arch/x86/include/asm/asm.h | 28 ++++++++++++++++++++-------- arch/x86/include/asm/uaccess.h | 8 ++++---- 2 files changed, 24 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 53dce41f2517..0f15e8a4f565 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -40,16 +40,28 @@ /* Exception table entry */ #ifdef __ASSEMBLY__ -# define _ASM_EXTABLE(from,to) \ - .pushsection "__ex_table","a" ; \ - _ASM_ALIGN ; \ - _ASM_PTR from , to ; \ +# define _ASM_EXTABLE(from,to) \ + .pushsection "__ex_table","a" ; \ + _ASM_ALIGN ; \ + _ASM_PTR from , to ; \ + .popsection + +# define _ASM_EXTABLE_EX(from,to) \ + .pushsection "__ex_table","a" ; \ + _ASM_ALIGN ; \ + _ASM_PTR from , (to) - (from) ; \ .popsection #else -# define _ASM_EXTABLE(from,to) \ - " .pushsection \"__ex_table\",\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR #from "," #to "\n" \ +# define _ASM_EXTABLE(from,to) \ + " .pushsection \"__ex_table\",\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR #from "," #to "\n" \ + " .popsection\n" + +# define _ASM_EXTABLE_EX(from,to) \ + " .pushsection \"__ex_table\",\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR #from ",(" #to ")-(" #from ")\n" \ " .popsection\n" #endif diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index e0544597cfe7..4ee59dd66f5d 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -202,8 +202,8 @@ extern int __get_user_bad(void); asm volatile("1: movl %%eax,0(%1)\n" \ "2: movl %%edx,4(%1)\n" \ "3:\n" \ - _ASM_EXTABLE(1b, 2b - 1b) \ - _ASM_EXTABLE(2b, 3b - 2b) \ + _ASM_EXTABLE_EX(1b, 2b) \ + _ASM_EXTABLE_EX(2b, 3b) \ : : "A" (x), "r" (addr)) #define __put_user_x8(x, ptr, __ret_pu) \ @@ -408,7 +408,7 @@ do { \ #define __get_user_asm_ex(x, addr, itype, rtype, ltype) \ asm volatile("1: 
mov"itype" %1,%"rtype"0\n" \ "2:\n" \ - _ASM_EXTABLE(1b, 2b - 1b) \ + _ASM_EXTABLE_EX(1b, 2b) \ : ltype(x) : "m" (__m(addr))) #define __put_user_nocheck(x, ptr, size) \ @@ -450,7 +450,7 @@ struct __large_struct { unsigned long buf[100]; }; #define __put_user_asm_ex(x, addr, itype, rtype, ltype) \ asm volatile("1: mov"itype" %"rtype"0,%1\n" \ "2:\n" \ - _ASM_EXTABLE(1b, 2b - 1b) \ + _ASM_EXTABLE_EX(1b, 2b) \ : : ltype(x), "m" (__m(addr))) /* -- cgit v1.2.3 From fa574a48a1e9706bba38188d3bf61ecb66546a77 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 20 Apr 2012 17:11:17 -0700 Subject: x86, extable: Disable presorted exception table for now Disable presorting the exception table in preparation for changing the format. Signed-off-by: H. Peter Anvin Cc: David Daney Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com --- arch/x86/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2f925ccb3e5b..1d14cc6b79ad 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -82,7 +82,6 @@ config X86 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC - select BUILDTIME_EXTABLE_SORT config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) -- cgit v1.2.3 From 706276543b699d80f546e45f8b12574e7b18d952 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 20 Apr 2012 17:12:48 -0700 Subject: x86, extable: Switch to relative exception table entries Switch to using relative exception table entries on x86. On i386, this has the advantage that the exception table entries don't need to be relocated; on x86-64 this means the exception table entries take up only half the space. In either case, a 32-bit delta is sufficient, as the range of kernel code addresses is limited. 
Since part of the goal is to avoid needing to adjust the entries when the kernel is relocated, the old trick of using addresses in the NULL pointer range to indicate uaccess_err no longer works (and unlike RISC architectures we can't use a flag bit); instead use a delta just below +2G to indicate these special entries. The reach is still limited to a single instruction. Signed-off-by: H. Peter Anvin Cc: David Daney Link: http://lkml.kernel.org/r/CA%2B55aFyijf43qSu3N9nWHEBwaGbb7T2Oq9A=9EyR=Jtyqfq_cQ@mail.gmail.com --- arch/x86/include/asm/asm.h | 20 ++++--- arch/x86/include/asm/uaccess.h | 17 ++++-- arch/x86/mm/extable.c | 131 ++++++++++++++++++++++++++++++++++++++--- 3 files changed, 146 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 0f15e8a4f565..1c2d247f65ce 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -42,26 +42,30 @@ #ifdef __ASSEMBLY__ # define _ASM_EXTABLE(from,to) \ .pushsection "__ex_table","a" ; \ - _ASM_ALIGN ; \ - _ASM_PTR from , to ; \ + .balign 8 ; \ + .long (from) - . ; \ + .long (to) - . ; \ .popsection # define _ASM_EXTABLE_EX(from,to) \ .pushsection "__ex_table","a" ; \ - _ASM_ALIGN ; \ - _ASM_PTR from , (to) - (from) ; \ + .balign 8 ; \ + .long (from) - . ; \ + .long (to) - . + 0x7ffffff0 ; \ .popsection #else # define _ASM_EXTABLE(from,to) \ " .pushsection \"__ex_table\",\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR #from "," #to "\n" \ + " .balign 8\n" \ + " .long (" #from ") - .\n" \ + " .long (" #to ") - .\n" \ " .popsection\n" # define _ASM_EXTABLE_EX(from,to) \ " .pushsection \"__ex_table\",\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR #from ",(" #to ")-(" #from ")\n" \ + " .balign 8\n" \ + " .long (" #from ") - .\n" \ + " .long (" #to ") - . 
+ 0x7ffffff0\n" \ " .popsection\n" #endif diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 4ee59dd66f5d..851fe0dc13bc 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -79,11 +79,12 @@ #define access_ok(type, addr, size) (likely(__range_not_ok(addr, size) == 0)) /* - * The exception table consists of pairs of addresses: the first is the - * address of an instruction that is allowed to fault, and the second is - * the address at which the program should continue. No registers are - * modified, so it is entirely up to the continuation code to figure out - * what to do. + * The exception table consists of pairs of addresses relative to the + * exception table entry itself: the first is the address of an + * instruction that is allowed to fault, and the second is the address + * at which the program should continue. No registers are modified, + * so it is entirely up to the continuation code to figure out what to + * do. * * All the routines below use bits of fixup code that are out of line * with the main instruction path. This means when everything is well, @@ -92,10 +93,14 @@ */ struct exception_table_entry { - unsigned long insn, fixup; + int insn, fixup; }; +/* This is not the generic standard exception_table_entry format */ +#define ARCH_HAS_SORT_EXTABLE +#define ARCH_HAS_SEARCH_EXTABLE extern int fixup_exception(struct pt_regs *regs); +extern int early_fixup_exception(unsigned long *ip); /* * These are the main single-value transfer routines. 
They automatically diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 5555675dadb6..903ec1e9c326 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,11 +1,23 @@ #include #include +#include #include +static inline unsigned long +ex_insn_addr(const struct exception_table_entry *x) +{ + return (unsigned long)&x->insn + x->insn; +} +static inline unsigned long +ex_fixup_addr(const struct exception_table_entry *x) +{ + return (unsigned long)&x->fixup + x->fixup; +} int fixup_exception(struct pt_regs *regs) { const struct exception_table_entry *fixup; + unsigned long new_ip; #ifdef CONFIG_PNPBIOS if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { @@ -23,13 +35,14 @@ int fixup_exception(struct pt_regs *regs) fixup = search_exception_tables(regs->ip); if (fixup) { - /* If fixup is less than 16, it means uaccess error */ - if (fixup->fixup < 16) { + new_ip = ex_fixup_addr(fixup); + + if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) { + /* Special hack for uaccess_err */ current_thread_info()->uaccess_err = 1; - regs->ip += fixup->fixup; - return 1; + new_ip -= 0x7ffffff0; } - regs->ip = fixup->fixup; + regs->ip = new_ip; return 1; } @@ -40,15 +53,117 @@ int fixup_exception(struct pt_regs *regs) int __init early_fixup_exception(unsigned long *ip) { const struct exception_table_entry *fixup; + unsigned long new_ip; fixup = search_exception_tables(*ip); if (fixup) { - if (fixup->fixup < 16) - return 0; /* Not supported during early boot */ + new_ip = ex_fixup_addr(fixup); + + if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) { + /* uaccess handling not supported during early boot */ + return 0; + } - *ip = fixup->fixup; + *ip = new_ip; return 1; } return 0; } + +/* + * Search one exception table for an entry corresponding to the + * given instruction address, and return the address of the entry, + * or NULL if none is found. + * We use a binary search, and thus we assume that the table is + * already sorted. 
+ */ +const struct exception_table_entry * +search_extable(const struct exception_table_entry *first, + const struct exception_table_entry *last, + unsigned long value) +{ + while (first <= last) { + const struct exception_table_entry *mid; + unsigned long addr; + + mid = ((last - first) >> 1) + first; + addr = ex_insn_addr(mid); + if (addr < value) + first = mid + 1; + else if (addr > value) + last = mid - 1; + else + return mid; + } + return NULL; +} + +/* + * The exception table needs to be sorted so that the binary + * search that we use to find entries in it works properly. + * This is used both for the kernel exception table and for + * the exception tables of modules that get loaded. + * + */ +static int cmp_ex(const void *a, const void *b) +{ + const struct exception_table_entry *x = a, *y = b; + + /* + * This value will always end up fitting in an int, because on + * both i386 and x86-64 the kernel symbol-reachable address + * space is < 2 GiB. + * + * This compare is only valid after normalization. + */ + return x->insn - y->insn; +} + +void sort_extable(struct exception_table_entry *start, + struct exception_table_entry *finish) +{ + struct exception_table_entry *p; + int i; + + /* Convert all entries to being relative to the start of the section */ + i = 0; + for (p = start; p < finish; p++) { + p->insn += i; + i += 4; + p->fixup += i; + i += 4; + } + + sort(start, finish - start, sizeof(struct exception_table_entry), + cmp_ex, NULL); + + /* Denormalize all entries */ + i = 0; + for (p = start; p < finish; p++) { + p->insn -= i; + i += 4; + p->fixup -= i; + i += 4; + } +} + +#ifdef CONFIG_MODULES +/* + * If the exception table is sorted, any entries referring to the module init + * will be at the beginning or the end. 
+ */ +void trim_init_extable(struct module *m) +{ + /*trim the beginning*/ + while (m->num_exentries && + within_module_init(ex_insn_addr(&m->extable[0]), m)) { + m->extable++; + m->num_exentries--; + } + /*trim the end*/ + while (m->num_exentries && + within_module_init(ex_insn_addr(&m->extable[m->num_exentries-1]), m)) + m->num_exentries--; +} +#endif /* CONFIG_MODULES */ -- cgit v1.2.3 From 88674088d10ca2538b2efd2559f6620ade8ec373 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 16 Apr 2012 16:26:04 -0400 Subject: x86: Use vga_default_device() when determining whether an fb is primary IORESOURCE_ROM_SHADOW is not necessarily an indication that the hardware is the primary device. Add support for using the vgaarb functions and fall back if nothing's set them. Signed-off-by: Matthew Garrett Cc: mingo@redhat.com Acked-by: hpa@zytor.com Signed-off-by: Dave Airlie --- arch/x86/video/fbdev.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c index c5ffb6ac8707..d5644bbe8cba 100644 --- a/arch/x86/video/fbdev.c +++ b/arch/x86/video/fbdev.c @@ -9,24 +9,34 @@ #include #include #include +#include int fb_is_primary_device(struct fb_info *info) { struct device *device = info->device; struct pci_dev *pci_dev = NULL; + struct pci_dev *default_device = vga_default_device(); struct resource *res = NULL; - int retval = 0; if (device) pci_dev = to_pci_dev(device); - if (pci_dev) - res = &pci_dev->resource[PCI_ROM_RESOURCE]; + if (!pci_dev) + return 0; + + if (default_device) { + if (pci_dev == default_device) + return 1; + else + return 0; + } + + res = &pci_dev->resource[PCI_ROM_RESOURCE]; if (res && res->flags & IORESOURCE_ROM_SHADOW) - retval = 1; + return 1; - return retval; + return 0; } EXPORT_SYMBOL(fb_is_primary_device); MODULE_LICENSE("GPL"); -- cgit v1.2.3 From b4aa0163056b6c70029b6e8619ce07c274351f42 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: 
Mon, 16 Apr 2012 16:26:05 -0400 Subject: efifb: Implement vga_default_device() (v2) EFI doesn't typically make use of the legacy VGA ROM, but it may still be configured to pass that through to a given video device. This may lead to an inaccurate choice of default video device. Add support to efifb to pick out the correct active video device. v2: fix if->ifdef Signed-off-by: Matthew Garrett Acked-by: hpa@zytor.com Cc: matt.fleming@intel.com Signed-off-by: Dave Airlie --- arch/x86/include/asm/vga.h | 6 ++++ drivers/video/efifb.c | 77 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 63 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h index c4b9dc2f67c5..44282fbf7bf9 100644 --- a/arch/x86/include/asm/vga.h +++ b/arch/x86/include/asm/vga.h @@ -17,4 +17,10 @@ #define vga_readb(x) (*(x)) #define vga_writeb(x, y) (*(y) = (x)) +#ifdef CONFIG_FB_EFI +#define __ARCH_HAS_VGA_DEFAULT_DEVICE +extern struct pci_dev *vga_default_device(void); +extern void vga_set_default_device(struct pci_dev *pdev); +#endif + #endif /* _ASM_X86_VGA_H */ diff --git a/drivers/video/efifb.c b/drivers/video/efifb.c index 784139aed079..66ed991ed8ba 100644 --- a/drivers/video/efifb.c +++ b/drivers/video/efifb.c @@ -18,6 +18,8 @@ static bool request_mem_succeeded = false; +static struct pci_dev *default_vga; + static struct fb_var_screeninfo efifb_defined __devinitdata = { .activate = FB_ACTIVATE_NOW, .height = -1, @@ -298,35 +300,70 @@ static struct fb_ops efifb_ops = { .fb_imageblit = cfb_imageblit, }; +struct pci_dev *vga_default_device(void) +{ + return default_vga; +} + +void vga_set_default_device(struct pci_dev *pdev) +{ + default_vga = pdev; +} + static int __init efifb_setup(char *options) { char *this_opt; int i; + struct pci_dev *dev = NULL; + + if (options && *options) { + while ((this_opt = strsep(&options, ",")) != NULL) { + if (!*this_opt) continue; + + for (i = 0; i < M_UNKNOWN; i++) { + if 
(!strcmp(this_opt, dmi_list[i].optname) && + dmi_list[i].base != 0) { + screen_info.lfb_base = dmi_list[i].base; + screen_info.lfb_linelength = dmi_list[i].stride; + screen_info.lfb_width = dmi_list[i].width; + screen_info.lfb_height = dmi_list[i].height; + } + } + if (!strncmp(this_opt, "base:", 5)) + screen_info.lfb_base = simple_strtoul(this_opt+5, NULL, 0); + else if (!strncmp(this_opt, "stride:", 7)) + screen_info.lfb_linelength = simple_strtoul(this_opt+7, NULL, 0) * 4; + else if (!strncmp(this_opt, "height:", 7)) + screen_info.lfb_height = simple_strtoul(this_opt+7, NULL, 0); + else if (!strncmp(this_opt, "width:", 6)) + screen_info.lfb_width = simple_strtoul(this_opt+6, NULL, 0); + } + } - if (!options || !*options) - return 0; + for_each_pci_dev(dev) { + int i; - while ((this_opt = strsep(&options, ",")) != NULL) { - if (!*this_opt) continue; + if ((dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) + continue; - for (i = 0; i < M_UNKNOWN; i++) { - if (!strcmp(this_opt, dmi_list[i].optname) && - dmi_list[i].base != 0) { - screen_info.lfb_base = dmi_list[i].base; - screen_info.lfb_linelength = dmi_list[i].stride; - screen_info.lfb_width = dmi_list[i].width; - screen_info.lfb_height = dmi_list[i].height; - } + for (i=0; i < DEVICE_COUNT_RESOURCE; i++) { + resource_size_t start, end; + + if (!(pci_resource_flags(dev, i) & IORESOURCE_MEM)) + continue; + + start = pci_resource_start(dev, i); + end = pci_resource_end(dev, i); + + if (!start || !end) + continue; + + if (screen_info.lfb_base >= start && + (screen_info.lfb_base + screen_info.lfb_size) < end) + default_vga = dev; } - if (!strncmp(this_opt, "base:", 5)) - screen_info.lfb_base = simple_strtoul(this_opt+5, NULL, 0); - else if (!strncmp(this_opt, "stride:", 7)) - screen_info.lfb_linelength = simple_strtoul(this_opt+7, NULL, 0) * 4; - else if (!strncmp(this_opt, "height:", 7)) - screen_info.lfb_height = simple_strtoul(this_opt+7, NULL, 0); - else if (!strncmp(this_opt, "width:", 6)) - screen_info.lfb_width = 
simple_strtoul(this_opt+6, NULL, 0); } + return 0; } -- cgit v1.2.3 From 07975ad3b30579ca27d880491ad992326b930c63 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 29 Mar 2012 21:14:12 +0200 Subject: KVM: Introduce direct MSI message injection for in-kernel irqchips Currently, MSI messages can only be injected to in-kernel irqchips by defining a corresponding IRQ route for each message. This is not only unhandy if the MSI messages are generated "on the fly" by user space, IRQ routes are a limited resource that user space has to manage carefully. By providing a direct injection path, we can both avoid using up limited resources and simplify the necessary steps for user land. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 21 +++++++++++++++++++++ arch/x86/kvm/Kconfig | 1 + include/linux/kvm.h | 11 +++++++++++ include/linux/kvm_host.h | 2 ++ virt/kvm/Kconfig | 3 +++ virt/kvm/irq_comm.c | 14 ++++++++++++++ virt/kvm/kvm_main.c | 14 ++++++++++++++ 7 files changed, 66 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 81ff39f6248d..a1552210b16d 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1689,6 +1689,27 @@ where the guest will clear the flag: when the soft lockup watchdog timer resets itself or when a soft lockup is detected. This ioctl can be called any time after pausing the vcpu, but before it is resumed. +4.71 KVM_SIGNAL_MSI + +Capability: KVM_CAP_SIGNAL_MSI +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_msi (in) +Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error + +Directly inject a MSI message. Only valid with in-kernel irqchip that handles +MSI messages. + +struct kvm_msi { + __u32 address_lo; + __u32 address_hi; + __u32 data; + __u32 flags; + __u8 pad[16]; +}; + +No flags are defined so far. The corresponding field must be 0. + 5. 
The kvm_run structure Application code obtains a pointer to the kvm_run structure by diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 1a7fe868f375..a28f338843ea 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -36,6 +36,7 @@ config KVM select TASKSTATS select TASK_DELAY_ACCT select PERF_EVENTS + select HAVE_KVM_MSI ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 7a9dd4b3dede..225b452e1d1d 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -590,6 +590,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_SYNC_REGS 74 #define KVM_CAP_PCI_2_3 75 #define KVM_CAP_KVMCLOCK_CTRL 76 +#define KVM_CAP_SIGNAL_MSI 77 #ifdef KVM_CAP_IRQ_ROUTING @@ -715,6 +716,14 @@ struct kvm_one_reg { __u64 addr; }; +struct kvm_msi { + __u32 address_lo; + __u32 address_hi; + __u32 data; + __u32 flags; + __u8 pad[16]; +}; + /* * ioctls for VM fds */ @@ -789,6 +798,8 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_PCI_2_3 */ #define KVM_ASSIGN_SET_INTX_MASK _IOW(KVMIO, 0xa4, \ struct kvm_assigned_pci_dev) +/* Available with KVM_CAP_SIGNAL_MSI */ +#define KVM_SIGNAL_MSI _IOW(KVMIO, 0xa5, struct kvm_msi) /* * ioctls for vcpu fds diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 186ffab0b9f0..6f343307d72b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -802,6 +802,8 @@ int kvm_set_irq_routing(struct kvm *kvm, unsigned flags); void kvm_free_irq_routing(struct kvm *kvm); +int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi); + #else static inline void kvm_free_irq_routing(struct kvm *kvm) {} diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index f63ccb0a5982..28694f4a9139 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -18,3 +18,6 @@ config KVM_MMIO config KVM_ASYNC_PF bool + +config HAVE_KVM_MSI + bool diff --git a/virt/kvm/irq_comm.c 
b/virt/kvm/irq_comm.c index 9f614b4e365f..a6a0365475ed 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -138,6 +138,20 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, return kvm_irq_delivery_to_apic(kvm, NULL, &irq); } +int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) +{ + struct kvm_kernel_irq_routing_entry route; + + if (!irqchip_in_kernel(kvm) || msi->flags != 0) + return -EINVAL; + + route.msi.address_lo = msi->address_lo; + route.msi.address_hi = msi->address_hi; + route.msi.data = msi->data; + + return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1); +} + /* * Return value: * < 0 Interrupt was ignored (masked or not delivered for other reasons) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9eb7936e491d..1847c762d8d9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2059,6 +2059,17 @@ static long kvm_vm_ioctl(struct file *filp, kvm->bsp_vcpu_id = arg; mutex_unlock(&kvm->lock); break; +#endif +#ifdef CONFIG_HAVE_KVM_MSI + case KVM_SIGNAL_MSI: { + struct kvm_msi msi; + + r = -EFAULT; + if (copy_from_user(&msi, argp, sizeof msi)) + goto out; + r = kvm_send_userspace_msi(kvm, &msi); + break; + } #endif default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); @@ -2188,6 +2199,9 @@ static long kvm_dev_ioctl_check_extension_generic(long arg) case KVM_CAP_SET_BOOT_CPU_ID: #endif case KVM_CAP_INTERNAL_ERROR_DATA: +#ifdef CONFIG_HAVE_KVM_MSI + case KVM_CAP_SIGNAL_MSI: +#endif return 1; #ifdef CONFIG_HAVE_KVM_IRQCHIP case KVM_CAP_IRQ_ROUTING: -- cgit v1.2.3 From 413837714232b3a4c0705e915d8af75ad521d083 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 19 Apr 2012 14:06:29 +0300 Subject: KVM: Introduce bitmask for apic attention reasons The patch introduces a bitmap that will hold reasons apic should be checked during vmexit. This is in a preparation for vp eoi patch that will add one more check on vmexit. 
With the bitmap we can do if(apic_attention) to check everything simultaneously which will add zero overhead on the fast path. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 4 ++++ arch/x86/kvm/lapic.c | 12 +++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f624ca72ea24..69e39bc7e36f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -172,6 +172,9 @@ enum { #define DR7_FIXED_1 0x00000400 #define DR7_VOLATILE 0xffff23ff +/* apic attention bits */ +#define KVM_APIC_CHECK_VAPIC 0 + /* * We don't want allocation failures within the mmu code, so we preallocate * enough memory for a single page fault in a cache. @@ -337,6 +340,7 @@ struct kvm_vcpu_arch { u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ + unsigned long apic_attention; int32_t apic_arb_prio; int mp_state; int sipi_vector; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 992b4eaae684..93c15743f1ee 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1088,6 +1088,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_update_ppr(apic); vcpu->arch.apic_arb_prio = 0; + vcpu->arch.apic_attention = 0; apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, @@ -1287,7 +1288,7 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) u32 data; void *vapic; - if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) + if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; vapic = kmap_atomic(vcpu->arch.apic->vapic_page); @@ -1304,7 +1305,7 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) struct kvm_lapic *apic; void *vapic; - if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) + if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; apic = 
vcpu->arch.apic; @@ -1324,10 +1325,11 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) { - if (!irqchip_in_kernel(vcpu->kvm)) - return; - vcpu->arch.apic->vapic_addr = vapic_addr; + if (vapic_addr) + __set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); + else + __clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); } int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) -- cgit v1.2.3 From 38e8a2ddc9ada5dd1f2def95bebb733bf619bbef Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 22 Apr 2012 15:12:50 +0300 Subject: KVM: x86 emulator: fix asm constraint in flush_pending_x87_faults 'bool' wants 8-bit registers. Reported-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d5729a91d08d..0d151e232480 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4123,7 +4123,7 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) "jmp 2b \n\t" ".popsection \n\t" _ASM_EXTABLE(1b, 3b) - : [fault]"+rm"(fault)); + : [fault]"+qm"(fault)); ctxt->ops->put_fpu(ctxt); if (unlikely(fault)) -- cgit v1.2.3 From 8b5ad472991796b2347464922c72de2ca5a028f3 Mon Sep 17 00:00:00 2001 From: David Daney Date: Tue, 24 Apr 2012 11:23:15 -0700 Subject: Revert "x86, extable: Disable presorted exception table for now" sortextable now works with relative entries, re-enable it. Signed-off-by: David Daney Link: http://lkml.kernel.org/r/1335291795-26693-3-git-send-email-ddaney.cavm@gmail.com Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1d14cc6b79ad..2f925ccb3e5b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -82,6 +82,7 @@ config X86 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC + select BUILDTIME_EXTABLE_SORT config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) -- cgit v1.2.3 From 553222f3e81f18da31b2552e18dc519715198590 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Thu, 29 Mar 2012 16:11:16 -0400 Subject: x86/nmi: Add new NMI queues to deal with IO_CHK and SERR In discussions with Thomas Mingarelli about hpwdt, he explained to me some issues they were seeing when using their virtual NMI button to test the hpwdt driver. It turns out the virtual NMI button used on HP's machines does not send unknown NMIs but instead sends IO_CHK NMIs. The way the kernel code is written, the hpwdt driver can not register itself against that type of NMI and therefore can not successfully capture system information before panic'ing. To solve this I created two new NMI queues to allow drivers to register against the IO_CHK and SERR NMIs. Or, in the hpwdt case, all three (if you include unknown NMIs too). The change is straightforward and just mimics what the unknown NMI does.
Reported-and-tested-by: Thomas Mingarelli Signed-off-by: Don Zickus Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1333051877-15755-3-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/nmi.h | 2 ++ arch/x86/kernel/nmi.c | 18 ++++++++++++++++++ drivers/watchdog/hpwdt.c | 27 ++++++++++++++++++++------- 3 files changed, 40 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index fd3f9f18cf3f..07162dfbff84 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -27,6 +27,8 @@ void arch_trigger_all_cpu_backtrace(void); enum { NMI_LOCAL=0, NMI_UNKNOWN, + NMI_SERR, + NMI_IO_CHECK, NMI_MAX }; diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 47acaf319165..ac9c1b76df96 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -54,6 +54,14 @@ static struct nmi_desc nmi_desc[NMI_MAX] = .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), .head = LIST_HEAD_INIT(nmi_desc[1].head), }, + { + .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock), + .head = LIST_HEAD_INIT(nmi_desc[2].head), + }, + { + .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock), + .head = LIST_HEAD_INIT(nmi_desc[3].head), + }, }; @@ -120,6 +128,8 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action) * to manage expectations */ WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head)); + WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head)); + WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head)); /* * some handlers need to be executed first otherwise a fake @@ -212,6 +222,10 @@ EXPORT_SYMBOL_GPL(unregister_nmi_handler); static notrace __kprobes void pci_serr_error(unsigned char reason, struct pt_regs *regs) { + /* check to see if anyone registered against these types of errors */ + if (nmi_handle(NMI_SERR, regs, false)) + return; + pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", 
reason, smp_processor_id()); @@ -241,6 +255,10 @@ io_check_error(unsigned char reason, struct pt_regs *regs) { unsigned long i; + /* check to see if anyone registered against these types of errors */ + if (nmi_handle(NMI_IO_CHECK, regs, false)) + return; + pr_emerg( "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n", reason, smp_processor_id()); diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index 4000b8038cac..6e414b501d58 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c @@ -725,19 +725,32 @@ static int __devinit hpwdt_init_nmi_decoding(struct pci_dev *dev) * Only one function can register for NMI_UNKNOWN */ retval = register_nmi_handler(NMI_UNKNOWN, hpwdt_pretimeout, 0, "hpwdt"); - if (retval != 0) { - dev_warn(&dev->dev, - "Unable to register a die notifier (err=%d).\n", - retval); - if (cru_rom_addr) - iounmap(cru_rom_addr); - } + if (retval) + goto error; + retval = register_nmi_handler(NMI_SERR, hpwdt_pretimeout, 0, "hpwdt"); + if (retval) + goto error1; + retval = register_nmi_handler(NMI_IO_CHECK, hpwdt_pretimeout, 0, "hpwdt"); + if (retval) + goto error2; dev_info(&dev->dev, "HP Watchdog Timer Driver: NMI decoding initialized" ", allow kernel dump: %s (default = 0/OFF)\n", (allow_kdump == 0) ? "OFF" : "ON"); return 0; + +error2: + unregister_nmi_handler(NMI_SERR, "hpwdt"); +error1: + unregister_nmi_handler(NMI_UNKNOWN, "hpwdt"); +error: + dev_warn(&dev->dev, + "Unable to register a die notifier (err=%d).\n", + retval); + if (cru_rom_addr) + iounmap(cru_rom_addr); + return retval; } static void hpwdt_exit_nmi_decoding(void) -- cgit v1.2.3 From 72b3fb24713755cf9740b403e95aa67ceedf3509 Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Thu, 29 Mar 2012 16:11:17 -0400 Subject: x86/nmi: Fix page faults by nmiaction if kmemcheck is enabled This patch tries to fix the problem of page fault exception caused by accessing nmiaction structure in nmi if kmemcheck is enabled. 
If kmemcheck is enabled, the memory allocated through slab is in pages that are marked non-present, so that some checks could be done in the page fault handling code (e.g. whether the memory is read before being written to). As nmiaction is allocated in this way, it resides in a non-present page. Then there is a page fault while the NMI code is accessing the nmiaction structure, which would then cause a warning by WARN_ON_ONCE(in_nmi()) in kmemcheck_fault(), called by do_page_fault(). This significantly simplifies the code as well, as the whole dynamic allocation dance goes away. v2: as Peter suggested, changed the nmiaction to use static storage. v3: as Peter suggested, use a macro to shorten the code. Also keep the original usage of register_nmi_handler, so users of this call don't need to change. Tested-by: Seiji Aguchi Fixes: https://lkml.org/lkml/2012/3/2/356 Signed-off-by: Li Zhong [ simplified the wrappers ] Signed-off-by: Don Zickus Cc: Peter Zijlstra Cc: thomas.mingarelli@hp.com Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1333051877-15755-4-git-send-email-dzickus@redhat.com [ tidied the patch a bit ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/nmi.h | 20 ++++++++++++-- arch/x86/kernel/nmi.c | 65 +++++----------------------------------------- 2 files changed, 24 insertions(+), 61 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 07162dfbff84..a1a836c8131c 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -37,8 +37,24 @@ enum { typedef int (*nmi_handler_t)(unsigned int, struct pt_regs *); -int register_nmi_handler(unsigned int, nmi_handler_t, unsigned long, - const char *); +struct nmiaction { + struct list_head list; + nmi_handler_t handler; + unsigned int flags; + const char *name; +}; + +#define register_nmi_handler(t, fn, fg, n) \ +({ \ + static struct nmiaction fn##_na = { \ + .handler = (fn), \ + .name = (n), \ + .flags = (fg), \ + }; \ +
__register_nmi_handler((t), &fn##_na); \ +}) + +int __register_nmi_handler(unsigned int, struct nmiaction *); void unregister_nmi_handler(unsigned int, const char *); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index ac9c1b76df96..585be4bd71a5 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -31,14 +31,6 @@ #include #include -#define NMI_MAX_NAMELEN 16 -struct nmiaction { - struct list_head list; - nmi_handler_t handler; - unsigned int flags; - char *name; -}; - struct nmi_desc { spinlock_t lock; struct list_head head; @@ -115,11 +107,14 @@ static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, return handled; } -static int __setup_nmi(unsigned int type, struct nmiaction *action) +int __register_nmi_handler(unsigned int type, struct nmiaction *action) { struct nmi_desc *desc = nmi_to_desc(type); unsigned long flags; + if (!action->handler) + return -EINVAL; + spin_lock_irqsave(&desc->lock, flags); /* @@ -143,8 +138,9 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action) spin_unlock_irqrestore(&desc->lock, flags); return 0; } +EXPORT_SYMBOL(__register_nmi_handler); -static struct nmiaction *__free_nmi(unsigned int type, const char *name) +void unregister_nmi_handler(unsigned int type, const char *name) { struct nmi_desc *desc = nmi_to_desc(type); struct nmiaction *n; @@ -167,56 +163,7 @@ static struct nmiaction *__free_nmi(unsigned int type, const char *name) spin_unlock_irqrestore(&desc->lock, flags); synchronize_rcu(); - return (n); } - -int register_nmi_handler(unsigned int type, nmi_handler_t handler, - unsigned long nmiflags, const char *devname) -{ - struct nmiaction *action; - int retval = -ENOMEM; - - if (!handler) - return -EINVAL; - - action = kzalloc(sizeof(struct nmiaction), GFP_KERNEL); - if (!action) - goto fail_action; - - action->handler = handler; - action->flags = nmiflags; - action->name = kstrndup(devname, NMI_MAX_NAMELEN, GFP_KERNEL); - if (!action->name) - goto 
fail_action_name; - - retval = __setup_nmi(type, action); - - if (retval) - goto fail_setup_nmi; - - return retval; - -fail_setup_nmi: - kfree(action->name); -fail_action_name: - kfree(action); -fail_action: - - return retval; -} -EXPORT_SYMBOL_GPL(register_nmi_handler); - -void unregister_nmi_handler(unsigned int type, const char *name) -{ - struct nmiaction *a; - - a = __free_nmi(type, name); - if (a) { - kfree(a->name); - kfree(a); - } -} - EXPORT_SYMBOL_GPL(unregister_nmi_handler); static notrace __kprobes void -- cgit v1.2.3 From fab06992de6433af097c4a1d2d1b119812753ca7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 25 Apr 2012 12:55:22 +0200 Subject: perf/x86: Clean up register_nmi_handler() usage A function name represents the pointer to it - no need to take the address of it. (Fixing this helps us introduce some macro magic around register_nmi_handler() in the future.) Cc: Robert Richter Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 573d24873459..8ff74d439041 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -469,7 +469,7 @@ static __init int perf_event_ibs_init(void) perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); - register_nmi_handler(NMI_LOCAL, &perf_ibs_nmi_handler, 0, "perf_ibs"); + register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); return 0; -- cgit v1.2.3 From 8239c25f47d2b318156993b15f33900a86ea5e17 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 20 Apr 2012 13:05:42 +0000 Subject: smp: Add task_struct argument to __cpu_up() Preparatory patch to make the idle thread allocation for secondary cpus generic. 
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Rusty Russell Cc: Paul E. McKenney Cc: Srivatsa S. Bhat Cc: Matt Turner Cc: Russell King Cc: Mike Frysinger Cc: Jesper Nilsson Cc: Richard Kuo Cc: Tony Luck Cc: Hirokazu Takata Cc: Ralf Baechle Cc: David Howells Cc: James E.J. Bottomley Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Paul Mundt Cc: David S. Miller Cc: Chris Metcalf Cc: Richard Weinberger Cc: x86@kernel.org Link: http://lkml.kernel.org/r/20120420124556.964170564@linutronix.de --- arch/alpha/kernel/smp.c | 2 +- arch/arm/kernel/smp.c | 2 +- arch/blackfin/mach-common/smp.c | 2 +- arch/cris/arch-v32/kernel/smp.c | 2 +- arch/hexagon/kernel/smp.c | 2 +- arch/ia64/kernel/smpboot.c | 2 +- arch/m32r/kernel/smpboot.c | 2 +- arch/mips/kernel/smp.c | 2 +- arch/mn10300/kernel/smp.c | 2 +- arch/parisc/kernel/smp.c | 2 +- arch/powerpc/kernel/smp.c | 2 +- arch/s390/include/asm/smp.h | 2 +- arch/s390/kernel/smp.c | 2 +- arch/sh/kernel/smp.c | 2 +- arch/sparc/kernel/smp_32.c | 2 +- arch/sparc/kernel/smp_64.c | 2 +- arch/tile/kernel/smpboot.c | 2 +- arch/um/kernel/smp.c | 2 +- arch/x86/include/asm/smp.h | 4 +++- include/linux/smp.h | 2 +- kernel/cpu.c | 2 +- 21 files changed, 23 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 50d438db1f6b..68d39470fb52 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -487,7 +487,7 @@ smp_prepare_boot_cpu(void) } int __cpuinit -__cpu_up(unsigned int cpu) +__cpu_up(unsigned int cpu, struct task_struct *tidle) { smp_boot_one_cpu(cpu); diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index addbbe8028c2..f0e2cbbd837d 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -60,7 +60,7 @@ enum ipi_msg_type { static DECLARE_COMPLETION(cpu_running); -int __cpuinit __cpu_up(unsigned int cpu) +int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle) { struct cpuinfo_arm *ci = &per_cpu(cpu_data, cpu); struct 
task_struct *idle = ci->idle; diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c index ac8f8a43158c..d0cddd95b0dd 100644 --- a/arch/blackfin/mach-common/smp.c +++ b/arch/blackfin/mach-common/smp.c @@ -340,7 +340,7 @@ void smp_send_stop(void) return; } -int __cpuinit __cpu_up(unsigned int cpu) +int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle) { int ret; struct blackfin_cpudata *ci = &per_cpu(cpu_data, cpu); diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c index 0b99df72d2a4..125ee2d7bc87 100644 --- a/arch/cris/arch-v32/kernel/smp.c +++ b/arch/cris/arch-v32/kernel/smp.c @@ -207,7 +207,7 @@ int setup_profiling_timer(unsigned int multiplier) */ unsigned long cache_decay_ticks = 1; -int __cpuinit __cpu_up(unsigned int cpu) +int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle) { smp_boot_one_cpu(cpu); return cpu_online(cpu) ? 0 : -ENOSYS; diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c index 1298141874a3..93e77e2b17a8 100644 --- a/arch/hexagon/kernel/smp.c +++ b/arch/hexagon/kernel/smp.c @@ -196,7 +196,7 @@ void __cpuinit start_secondary(void) * maintains control until "cpu_online(cpu)" is set. 
*/ -int __cpuinit __cpu_up(unsigned int cpu) +int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle) { struct task_struct *idle; struct thread_info *thread; diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 796f6a5b966a..03e4ef3893c9 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -793,7 +793,7 @@ set_cpu_sibling_map(int cpu) } int __cpuinit -__cpu_up (unsigned int cpu) +__cpu_up(unsigned int cpu, struct task_struct *tidle) { int ret; int sapicid; diff --git a/arch/m32r/kernel/smpboot.c b/arch/m32r/kernel/smpboot.c index 31541c9b7eb6..a2cfc0abb05c 100644 --- a/arch/m32r/kernel/smpboot.c +++ b/arch/m32r/kernel/smpboot.c @@ -343,7 +343,7 @@ static void __init do_boot_cpu(int phys_id) } } -int __cpuinit __cpu_up(unsigned int cpu_id) +int __cpuinit __cpu_up(unsigned int cpu_id, struct task_struct *tidle) { int timeout; diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index ba9376bf52a1..41079b256092 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -209,7 +209,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work) complete(&c_idle->done); } -int __cpuinit __cpu_up(unsigned int cpu) +int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle) { struct task_struct *idle; diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c index 910dddf65e44..c6b40dad0d0b 100644 --- a/arch/mn10300/kernel/smp.c +++ b/arch/mn10300/kernel/smp.c @@ -921,7 +921,7 @@ void initialize_secondary(void) * __cpu_up - Set smp_commenced_mask for the nominated CPU * @cpu: The target CPU. 
*/ -int __devinit __cpu_up(unsigned int cpu) +int __devinit __cpu_up(unsigned int cpu, struct task_struct *tidle) { int timeout; diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 0bb1d63907f8..eae8cd808f07 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -449,7 +449,7 @@ void smp_cpus_done(unsigned int cpu_max) } -int __cpuinit __cpu_up(unsigned int cpu) +int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle) { if (cpu != 0 && cpu < parisc_max_cpus) smp_boot_one_cpu(cpu); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index d9f94410fd7f..d38030fb3471 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -482,7 +482,7 @@ static int __cpuinit create_idle(unsigned int cpu) return 0; } -int __cpuinit __cpu_up(unsigned int cpu) +int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle) { int rc, c; diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index c77c6de6f6c0..0b6f586c1383 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -16,7 +16,7 @@ extern struct mutex smp_cpu_state_mutex; extern struct save_area *zfcpdump_save_areas[NR_CPUS + 1]; -extern int __cpu_up(unsigned int cpu); +extern int __cpu_up(unsigned int cpu, struct task_struct *tidle); extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 1f77227669e8..fc827aa8f9ca 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -738,7 +738,7 @@ static void __cpuinit smp_fork_idle(struct work_struct *work) } /* Upping and downing of CPUs */ -int __cpuinit __cpu_up(unsigned int cpu) +int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle) { struct create_idle c_idle; struct pcpu *pcpu; diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index eaebdf6a5c77..ebb76e2a748b 100644 --- 
a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -220,7 +220,7 @@ extern struct { void *thread_info; } stack_start; -int __cpuinit __cpu_up(unsigned int cpu) +int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle) { struct task_struct *tsk; unsigned long timeout; diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c index f671e7fd6ddc..1f397ae11028 100644 --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@ -411,7 +411,7 @@ void __init smp_prepare_boot_cpu(void) set_cpu_possible(cpuid, true); } -int __cpuinit __cpu_up(unsigned int cpu) +int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle) { extern int __cpuinit smp4m_boot_one_cpu(int); extern int __cpuinit smp4d_boot_one_cpu(int); diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 3b1bd7c50164..2f9948c4107c 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1227,7 +1227,7 @@ void __devinit smp_fill_in_sib_core_maps(void) } } -int __cpuinit __cpu_up(unsigned int cpu) +int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle) { int ret = smp_boot_one_cpu(cpu); diff --git a/arch/tile/kernel/smpboot.c b/arch/tile/kernel/smpboot.c index 172aef7d3159..84873fbe8f27 100644 --- a/arch/tile/kernel/smpboot.c +++ b/arch/tile/kernel/smpboot.c @@ -222,7 +222,7 @@ void __cpuinit online_secondary(void) cpu_idle(); } -int __cpuinit __cpu_up(unsigned int cpu) +int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle) { /* Wait 5s total for all CPUs for them to come online */ static int timeout; diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c index 6f588e160fb0..a02b7e9e6b94 100644 --- a/arch/um/kernel/smp.c +++ b/arch/um/kernel/smp.c @@ -140,7 +140,7 @@ void smp_prepare_boot_cpu(void) set_cpu_online(smp_processor_id(), true); } -int __cpu_up(unsigned int cpu) +int __cpu_up(unsigned int cpu, struct task_struct *tidle) { cpu_set(cpu, smp_commenced_mask); while (!cpu_online(cpu)) diff --git 
a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 0434c400287c..4eb3a74bc4b0 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -62,6 +62,8 @@ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid); /* Static state in head.S used to set up a CPU */ extern unsigned long stack_start; /* Initial stack pointer address */ +struct task_struct; + struct smp_ops { void (*smp_prepare_boot_cpu)(void); void (*smp_prepare_cpus)(unsigned max_cpus); @@ -113,7 +115,7 @@ static inline void smp_cpus_done(unsigned int max_cpus) smp_ops.smp_cpus_done(max_cpus); } -static inline int __cpu_up(unsigned int cpu) +static inline int __cpu_up(unsigned int cpu, struct task_struct *tidle) { return smp_ops.cpu_up(cpu); } diff --git a/include/linux/smp.h b/include/linux/smp.h index 10530d92c04b..24360de6c968 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -61,7 +61,7 @@ extern void smp_prepare_cpus(unsigned int max_cpus); /* * Bring a CPU up */ -extern int __cpu_up(unsigned int cpunum); +extern int __cpu_up(unsigned int cpunum, struct task_struct *tidle); /* * Final polishing of CPUs diff --git a/kernel/cpu.c b/kernel/cpu.c index 2060c6e57027..e711aef0fb3c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -309,7 +309,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) } /* Arch-specific enabling code. */ - ret = __cpu_up(cpu); + ret = __cpu_up(cpu, NULL); if (ret != 0) goto out_notify; BUG_ON(!cpu_online(cpu)); -- cgit v1.2.3 From 5cdaf1834f43b0edc4a3aa683aa4ec98f6bfe8a7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 20 Apr 2012 13:05:47 +0000 Subject: x86: Add task_struct argument to smp_ops.cpu_up Preparatory patch to use the generic idle thread allocation. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Rusty Russell Cc: Paul E. McKenney Cc: Srivatsa S. 
Bhat Cc: Jeremy Fitzhardinge Cc: x86@kernel.org Link: http://lkml.kernel.org/r/20120420124557.176604405@linutronix.de --- arch/x86/include/asm/smp.h | 6 +++--- arch/x86/kernel/smpboot.c | 2 +- arch/x86/xen/smp.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 4eb3a74bc4b0..f3ed33811c23 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -72,7 +72,7 @@ struct smp_ops { void (*stop_other_cpus)(int wait); void (*smp_send_reschedule)(int cpu); - int (*cpu_up)(unsigned cpu); + int (*cpu_up)(unsigned cpu, struct task_struct *tidle); int (*cpu_disable)(void); void (*cpu_die)(unsigned int cpu); void (*play_dead)(void); @@ -117,7 +117,7 @@ static inline void smp_cpus_done(unsigned int max_cpus) static inline int __cpu_up(unsigned int cpu, struct task_struct *tidle) { - return smp_ops.cpu_up(cpu); + return smp_ops.cpu_up(cpu, tidle); } static inline int __cpu_disable(void) @@ -154,7 +154,7 @@ void cpu_disable_common(void); void native_smp_prepare_boot_cpu(void); void native_smp_prepare_cpus(unsigned int max_cpus); void native_smp_cpus_done(unsigned int max_cpus); -int native_cpu_up(unsigned int cpunum); +int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_disable(void); void native_cpu_die(unsigned int cpu); void native_play_dead(void); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6e1e406038c2..def235bf7594 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -818,7 +818,7 @@ do_rest: return boot_error; } -int __cpuinit native_cpu_up(unsigned int cpu) +int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) { int apicid = apic->cpu_present_to_apicid(cpu); unsigned long flags; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 5fac6919b957..64d3bbce0b36 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -331,7 +331,7 @@ 
cpu_initialize_context(unsigned int cpu, struct task_struct *idle) return 0; } -static int __cpuinit xen_cpu_up(unsigned int cpu) +static int __cpuinit xen_cpu_up(unsigned int cpu, struct task_struct *tidle) { struct task_struct *idle = idle_task(cpu); int rc; @@ -547,10 +547,10 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) xen_init_lock_cpu(0); } -static int __cpuinit xen_hvm_cpu_up(unsigned int cpu) +static int __cpuinit xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) { int rc; - rc = native_cpu_up(cpu); + rc = native_cpu_up(cpu, tidle); WARN_ON (xen_smp_intr_init(cpu)); return rc; } -- cgit v1.2.3 From 7eb43a6d232bfa46464b501cd1987ec2d705d8cf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 20 Apr 2012 13:05:48 +0000 Subject: x86: Use generic idle thread allocation Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Rusty Russell Cc: Paul E. McKenney Cc: Srivatsa S. Bhat Cc: Jeremy Fitzhardinge Cc: x86@kernel.org Link: http://lkml.kernel.org/r/20120420124557.246929343@linutronix.de --- arch/x86/Kconfig | 1 + arch/x86/include/asm/smp.h | 1 + arch/x86/kernel/smpboot.c | 81 ++++++---------------------------------------- arch/x86/xen/smp.c | 15 ++------- 4 files changed, 14 insertions(+), 84 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1d14cc6b79ad..046bf4bd2510 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -82,6 +82,7 @@ config X86 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC + select GENERIC_SMP_IDLE_THREAD config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index f3ed33811c23..f8cbc6f20e31 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -164,6 +164,7 @@ int wbinvd_on_all_cpus(void); void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); +void 
x86_idle_thread_init(unsigned int cpu, struct task_struct *idle); void smp_store_cpu_info(int id); #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index def235bf7594..3acaf51dfddb 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -76,19 +76,7 @@ /* State of each CPU */ DEFINE_PER_CPU(int, cpu_state) = { 0 }; -/* Store all idle threads, this can be reused instead of creating -* a new thread. Also avoids complicated thread destroy functionality -* for idle threads. -*/ #ifdef CONFIG_HOTPLUG_CPU -/* - * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is - * removed after init for !CONFIG_HOTPLUG_CPU. - */ -static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); -#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) -#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) - /* * We need this for trampoline_base protection from concurrent accesses when * off- and onlining cores wildly. 
@@ -97,20 +85,16 @@ static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex); void cpu_hotplug_driver_lock(void) { - mutex_lock(&x86_cpu_hotplug_driver_mutex); + mutex_lock(&x86_cpu_hotplug_driver_mutex); } void cpu_hotplug_driver_unlock(void) { - mutex_unlock(&x86_cpu_hotplug_driver_mutex); + mutex_unlock(&x86_cpu_hotplug_driver_mutex); } ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; } ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; } -#else -static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; -#define get_idle_for_cpu(x) (idle_thread_array[(x)]) -#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p)) #endif /* Number of siblings per CPU package */ @@ -618,22 +602,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) return (send_status | accept_status); } -struct create_idle { - struct work_struct work; - struct task_struct *idle; - struct completion done; - int cpu; -}; - -static void __cpuinit do_fork_idle(struct work_struct *work) -{ - struct create_idle *c_idle = - container_of(work, struct create_idle, work); - - c_idle->idle = fork_idle(c_idle->cpu); - complete(&c_idle->done); -} - /* reduce the number of lines printed when booting a large cpu count system */ static void __cpuinit announce_cpu(int cpu, int apicid) { @@ -660,58 +628,31 @@ static void __cpuinit announce_cpu(int cpu, int apicid) * Returns zero if CPU booted OK, else error code from * ->wakeup_secondary_cpu. 
*/ -static int __cpuinit do_boot_cpu(int apicid, int cpu) +static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) { unsigned long boot_error = 0; unsigned long start_ip; int timeout; - struct create_idle c_idle = { - .cpu = cpu, - .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), - }; - - INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle); alternatives_smp_switch(1); - c_idle.idle = get_idle_for_cpu(cpu); - - /* - * We can't use kernel_thread since we must avoid to - * reschedule the child. - */ - if (c_idle.idle) { - c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *) - (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1); - init_idle(c_idle.idle, cpu); - goto do_rest; - } + idle->thread.sp = (unsigned long) (((struct pt_regs *) + (THREAD_SIZE + task_stack_page(idle))) - 1); + per_cpu(current_task, cpu) = idle; - schedule_work(&c_idle.work); - wait_for_completion(&c_idle.done); - - if (IS_ERR(c_idle.idle)) { - printk("failed fork for CPU %d\n", cpu); - destroy_work_on_stack(&c_idle.work); - return PTR_ERR(c_idle.idle); - } - - set_idle_for_cpu(cpu, c_idle.idle); -do_rest: - per_cpu(current_task, cpu) = c_idle.idle; #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); #else - clear_tsk_thread_flag(c_idle.idle, TIF_FORK); + clear_tsk_thread_flag(idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); per_cpu(kernel_stack, cpu) = - (unsigned long)task_stack_page(c_idle.idle) - + (unsigned long)task_stack_page(idle) - KERNEL_STACK_OFFSET + THREAD_SIZE; #endif early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); initial_code = (unsigned long)start_secondary; - stack_start = c_idle.idle->thread.sp; + stack_start = idle->thread.sp; /* start_ip had better be page-aligned! 
*/ start_ip = trampoline_address(); @@ -813,8 +754,6 @@ do_rest: */ smpboot_restore_warm_reset_vector(); } - - destroy_work_on_stack(&c_idle.work); return boot_error; } @@ -851,7 +790,7 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; - err = do_boot_cpu(apicid, cpu); + err = do_boot_cpu(apicid, cpu, tidle); if (err) { pr_debug("do_boot_cpu failed %d\n", err); return -EIO; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 64d3bbce0b36..8f44cc1a9291 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -250,18 +250,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) set_cpu_possible(cpu, false); } - for_each_possible_cpu (cpu) { - struct task_struct *idle; - - if (cpu == 0) - continue; - - idle = fork_idle(cpu); - if (IS_ERR(idle)) - panic("failed fork for CPU %d", cpu); - + for_each_possible_cpu(cpu) set_cpu_present(cpu, true); - } } static int __cpuinit @@ -331,9 +321,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) return 0; } -static int __cpuinit xen_cpu_up(unsigned int cpu, struct task_struct *tidle) +static int __cpuinit xen_cpu_up(unsigned int cpu, struct task_struct *idle) { - struct task_struct *idle = idle_task(cpu); int rc; per_cpu(current_task, cpu) = idle; -- cgit v1.2.3 From 10c250234c98928d1e15c4cea1c44b9a25354ccf Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 5 Apr 2012 18:24:41 +0200 Subject: perf: Trivial cleanup of duplicate code Removing duplicate code. 
Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333643084-26776-2-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index bb8e03407e18..e33e9cf160eb 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -484,9 +484,6 @@ static int __x86_pmu_event_init(struct perf_event *event) /* mark unused */ event->hw.extra_reg.idx = EXTRA_REG_NONE; - - /* mark not used */ - event->hw.extra_reg.idx = EXTRA_REG_NONE; event->hw.branch_reg.idx = EXTRA_REG_NONE; return x86_pmu.hw_config(event); -- cgit v1.2.3 From 5f09fc688936705b2020ca247df39ee27283668a Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 5 Apr 2012 18:24:42 +0200 Subject: perf/x86: Fix cmpxchg() usage in amd_put_event_constraints() Now the return value of cmpxchg() is used to match an event. The change removes the duplicate event comparison and traverses the list until an event was removed. 
This also fixes the following warning: arch/x86/kernel/cpu/perf_event_amd.c:170: warning: value computed is not used Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333643084-26776-3-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 95e7fe1c5f0b..589286f28877 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -205,10 +205,8 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc, * when we come here */ for (i = 0; i < x86_pmu.num_counters; i++) { - if (nb->owners[i] == event) { - cmpxchg(nb->owners+i, event, NULL); + if (cmpxchg(nb->owners + i, event, NULL) == event) break; - } } } -- cgit v1.2.3 From b6ddf05ff68d81a7c1736717faf492b70e9bf4f9 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 24 Apr 2012 16:40:17 +0200 Subject: KVM: x86: Run PIT work in own kthread We can't run PIT IRQ injection work in the interrupt context of the host timer. This would allow the user to influence the handler complexity by asking for a broadcast to a large number of VCPUs. Therefore, this work was pushed into workqueue context in 9d244caf2e. However, this prevents prioritizing the PIT injection over other task as workqueues share kernel threads. This replaces the workqueue with a kthread worker and gives that thread a name in the format "kvm-pit/". That allows to identify and adjust the kthread priority according to the VM process parameters. 
Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- Documentation/virtual/kvm/api.txt | 8 ++++++++ arch/x86/kvm/i8254.c | 31 +++++++++++++++++++------------ arch/x86/kvm/i8254.h | 7 +++++-- 3 files changed, 32 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index a7cb93cb2154..eb62761b7683 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1810,6 +1810,14 @@ Valid flags are: #define KVM_PIT_SPEAKER_DUMMY 1 /* emulate speaker port stub */ +PIT timer interrupts may use a per-VM kernel thread for injection. If it +exists, this thread will have a name of the following pattern: + +kvm-pit/ + +When running a guest with elevated priorities, the scheduling parameters of +this thread may have to be adjusted accordingly. + This IOCTL replaces the obsolete KVM_CREATE_PIT. diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index d68f99df690c..adba28f88d1a 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -34,7 +34,6 @@ #include #include -#include #include "irq.h" #include "i8254.h" @@ -249,7 +248,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) /* in this case, we had multiple outstanding pit interrupts * that we needed to inject. 
Reinject */ - queue_work(ps->pit->wq, &ps->pit->expired); + queue_kthread_work(&ps->pit->worker, &ps->pit->expired); ps->irq_ack = 1; spin_unlock(&ps->inject_lock); } @@ -270,7 +269,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) static void destroy_pit_timer(struct kvm_pit *pit) { hrtimer_cancel(&pit->pit_state.pit_timer.timer); - cancel_work_sync(&pit->expired); + flush_kthread_work(&pit->expired); } static bool kpit_is_periodic(struct kvm_timer *ktimer) @@ -284,7 +283,7 @@ static struct kvm_timer_ops kpit_ops = { .is_periodic = kpit_is_periodic, }; -static void pit_do_work(struct work_struct *work) +static void pit_do_work(struct kthread_work *work) { struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); struct kvm *kvm = pit->kvm; @@ -328,7 +327,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) if (ktimer->reinject || !atomic_read(&ktimer->pending)) { atomic_inc(&ktimer->pending); - queue_work(pt->wq, &pt->expired); + queue_kthread_work(&pt->worker, &pt->expired); } if (ktimer->t_ops->is_periodic(ktimer)) { @@ -353,7 +352,7 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) /* TODO The new value only affected after the retriggered */ hrtimer_cancel(&pt->timer); - cancel_work_sync(&ps->pit->expired); + flush_kthread_work(&ps->pit->expired); pt->period = interval; ps->is_periodic = is_period; @@ -669,6 +668,8 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) { struct kvm_pit *pit; struct kvm_kpit_state *pit_state; + struct pid *pid; + pid_t pid_nr; int ret; pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); @@ -685,14 +686,20 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) mutex_lock(&pit->pit_state.lock); spin_lock_init(&pit->pit_state.inject_lock); - pit->wq = create_singlethread_workqueue("kvm-pit-wq"); - if (!pit->wq) { + pid = get_pid(task_tgid(current)); + pid_nr = pid_vnr(pid); + put_pid(pid); + + init_kthread_worker(&pit->worker); + pit->worker_task = 
kthread_run(kthread_worker_fn, &pit->worker, + "kvm-pit/%d", pid_nr); + if (IS_ERR(pit->worker_task)) { mutex_unlock(&pit->pit_state.lock); kvm_free_irq_source_id(kvm, pit->irq_source_id); kfree(pit); return NULL; } - INIT_WORK(&pit->expired, pit_do_work); + init_kthread_work(&pit->expired, pit_do_work); kvm->arch.vpit = pit; pit->kvm = kvm; @@ -736,7 +743,7 @@ fail: kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); kvm_free_irq_source_id(kvm, pit->irq_source_id); - destroy_workqueue(pit->wq); + kthread_stop(pit->worker_task); kfree(pit); return NULL; } @@ -756,10 +763,10 @@ void kvm_free_pit(struct kvm *kvm) mutex_lock(&kvm->arch.vpit->pit_state.lock); timer = &kvm->arch.vpit->pit_state.pit_timer.timer; hrtimer_cancel(timer); - cancel_work_sync(&kvm->arch.vpit->expired); + flush_kthread_work(&kvm->arch.vpit->expired); + kthread_stop(kvm->arch.vpit->worker_task); kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - destroy_workqueue(kvm->arch.vpit->wq); kfree(kvm->arch.vpit); } } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 51a97426e791..fdf40425ea1d 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -1,6 +1,8 @@ #ifndef __I8254_H #define __I8254_H +#include + #include "iodev.h" struct kvm_kpit_channel_state { @@ -39,8 +41,9 @@ struct kvm_pit { struct kvm_kpit_state pit_state; int irq_source_id; struct kvm_irq_mask_notifier mask_notifier; - struct workqueue_struct *wq; - struct work_struct expired; + struct kthread_worker worker; + struct task_struct *worker_task; + struct kthread_work expired; }; #define KVM_PIT_BASE_ADDRESS 0x40 -- cgit v1.2.3 From 08d636b6d4fb80647fe8869ea1cd97b2c26a4751 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 16 Aug 2011 09:57:10 -0400 Subject: ftrace/x86: Have arch x86_64 use breakpoints instead of stop machine This method changes x86 to add a breakpoint to the 
mcount locations instead of calling stop machine. Now that iret can be handled by NMIs, we perform the following to update code: 1) Add a breakpoint to all locations that will be modified 2) Sync all cores 3) Update all locations to be either a nop or call (except breakpoint op) 4) Sync all cores 5) Remove the breakpoint with the new code. 6) Sync all cores [ Added updates that Masami suggested: Use unlikely(modifying_ftrace_code) in int3 trap to keep kprobes efficient. Don't use NOTIFY_* in ftrace handler in int3 as it is not a notifier. ] Cc: H. Peter Anvin Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- arch/x86/include/asm/ftrace.h | 3 + arch/x86/kernel/ftrace.c | 342 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/traps.c | 8 +- include/linux/ftrace.h | 6 + 4 files changed, 358 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 268c783ab1c0..18d9005d9e4f 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -34,6 +34,7 @@ #ifndef __ASSEMBLY__ extern void mcount(void); +extern int modifying_ftrace_code; static inline unsigned long ftrace_call_adjust(unsigned long addr) { @@ -50,6 +51,8 @@ struct dyn_arch_ftrace { /* No extra data needed for x86 */ }; +int ftrace_int3_handler(struct pt_regs *regs); + #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index c9a281f272fd..80af34739a9a 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -334,6 +335,347 @@ int ftrace_update_ftrace_func(ftrace_func_t func) return ret; } +int modifying_ftrace_code __read_mostly; + +/* + * A breakpoint was added to the code address we are about to + * modify, and this is the handle that will just skip over it. 
+ * We are either changing a nop into a trace call, or a trace + * call to a nop. While the change is taking place, we treat + * it just like it was a nop. + */ +int ftrace_int3_handler(struct pt_regs *regs) +{ + if (WARN_ON_ONCE(!regs)) + return 0; + + if (!ftrace_location(regs->ip - 1)) + return 0; + + regs->ip += MCOUNT_INSN_SIZE - 1; + + return 1; +} + +static int ftrace_write(unsigned long ip, const char *val, int size) +{ + /* + * On x86_64, kernel text mappings are mapped read-only with + * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead + * of the kernel text mapping to modify the kernel text. + * + * For 32bit kernels, these mappings are same and we can use + * kernel identity mapping to modify code. + */ + if (within(ip, (unsigned long)_text, (unsigned long)_etext)) + ip = (unsigned long)__va(__pa(ip)); + + return probe_kernel_write((void *)ip, val, size); +} + +static int add_break(unsigned long ip, const char *old) +{ + unsigned char replaced[MCOUNT_INSN_SIZE]; + unsigned char brk = BREAKPOINT_INSTRUCTION; + + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* Make sure it is what we expect it to be */ + if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + + if (ftrace_write(ip, &brk, 1)) + return -EPERM; + + return 0; +} + +static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned const char *old; + unsigned long ip = rec->ip; + + old = ftrace_call_replace(ip, addr); + + return add_break(rec->ip, old); +} + + +static int add_brk_on_nop(struct dyn_ftrace *rec) +{ + unsigned const char *old; + + old = ftrace_nop_replace(); + + return add_break(rec->ip, old); +} + +static int add_breakpoints(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ret = ftrace_test_record(rec, enable); + + ftrace_addr = (unsigned long)FTRACE_ADDR; + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: + /* 
converting nop to call */ + return add_brk_on_nop(rec); + + case FTRACE_UPDATE_MAKE_NOP: + /* converting a call to a nop */ + return add_brk_on_call(rec, ftrace_addr); + } + return 0; +} + +/* + * On error, we need to remove breakpoints. This needs to + * be done caefully. If the address does not currently have a + * breakpoint, we know we are done. Otherwise, we look at the + * remaining 4 bytes of the instruction. If it matches a nop + * we replace the breakpoint with the nop. Otherwise we replace + * it with the call instruction. + */ +static int remove_breakpoint(struct dyn_ftrace *rec) +{ + unsigned char ins[MCOUNT_INSN_SIZE]; + unsigned char brk = BREAKPOINT_INSTRUCTION; + const unsigned char *nop; + unsigned long ftrace_addr; + unsigned long ip = rec->ip; + + /* If we fail the read, just give up */ + if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* If this does not have a breakpoint, we are done */ + if (ins[0] != brk) + return -1; + + nop = ftrace_nop_replace(); + + /* + * If the last 4 bytes of the instruction do not match + * a nop, then we assume that this is a call to ftrace_addr. + */ + if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) { + /* + * For extra paranoidism, we check if the breakpoint is on + * a call that would actually jump to the ftrace_addr. + * If not, don't touch the breakpoint, we make just create + * a disaster. 
+ */ + ftrace_addr = (unsigned long)FTRACE_ADDR; + nop = ftrace_call_replace(ip, ftrace_addr); + + if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) + return -EINVAL; + } + + return probe_kernel_write((void *)ip, &nop[0], 1); +} + +static int add_update_code(unsigned long ip, unsigned const char *new) +{ + /* skip breakpoint */ + ip++; + new++; + if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1)) + return -EPERM; + return 0; +} + +static int add_update_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_call_replace(ip, addr); + return add_update_code(ip, new); +} + +static int add_update_nop(struct dyn_ftrace *rec) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_nop_replace(); + return add_update_code(ip, new); +} + +static int add_update(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ret = ftrace_test_record(rec, enable); + + ftrace_addr = (unsigned long)FTRACE_ADDR; + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: + /* converting nop to call */ + return add_update_call(rec, ftrace_addr); + + case FTRACE_UPDATE_MAKE_NOP: + /* converting a call to a nop */ + return add_update_nop(rec); + } + + return 0; +} + +static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_call_replace(ip, addr); + + if (ftrace_write(ip, new, 1)) + return -EPERM; + + return 0; +} + +static int finish_update_nop(struct dyn_ftrace *rec) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_nop_replace(); + + if (ftrace_write(ip, new, 1)) + return -EPERM; + return 0; +} + +static int finish_update(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ret = ftrace_update_record(rec, enable); + + ftrace_addr = (unsigned long)FTRACE_ADDR; + + switch (ret) { + 
case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: + /* converting nop to call */ + return finish_update_call(rec, ftrace_addr); + + case FTRACE_UPDATE_MAKE_NOP: + /* converting a call to a nop */ + return finish_update_nop(rec); + } + + return 0; +} + +static void do_sync_core(void *data) +{ + sync_core(); +} + +static void run_sync(void) +{ + int enable_irqs = irqs_disabled(); + + /* We may be called with interrupts disbled (on bootup). */ + if (enable_irqs) + local_irq_enable(); + on_each_cpu(do_sync_core, NULL, 1); + if (enable_irqs) + local_irq_disable(); +} + +static void ftrace_replace_code(int enable) +{ + struct ftrace_rec_iter *iter; + struct dyn_ftrace *rec; + const char *report = "adding breakpoints"; + int count = 0; + int ret; + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + + ret = add_breakpoints(rec, enable); + if (ret) + goto remove_breakpoints; + count++; + } + + run_sync(); + + report = "updating code"; + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + + ret = add_update(rec, enable); + if (ret) + goto remove_breakpoints; + } + + run_sync(); + + report = "removing breakpoints"; + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + + ret = finish_update(rec, enable); + if (ret) + goto remove_breakpoints; + } + + run_sync(); + + return; + + remove_breakpoints: + ftrace_bug(ret, rec ? 
rec->ip : 0); + printk(KERN_WARNING "Failed on %s (%d):\n", report, count); + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + remove_breakpoint(rec); + } +} + +void arch_ftrace_update_code(int command) +{ + modifying_ftrace_code++; + + if (command & FTRACE_UPDATE_CALLS) + ftrace_replace_code(1); + else if (command & FTRACE_DISABLE_CALLS) + ftrace_replace_code(0); + + if (command & FTRACE_UPDATE_TRACE_FUNC) + ftrace_update_ftrace_func(ftrace_trace_function); + + if (command & FTRACE_START_FUNC_RET) + ftrace_enable_ftrace_graph_caller(); + else if (command & FTRACE_STOP_FUNC_RET) + ftrace_disable_ftrace_graph_caller(); + + modifying_ftrace_code--; +} + int __init ftrace_dyn_arch_init(void *data) { /* The return code is retured via data */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ff9281f16029..92d5756d85fc 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -303,8 +304,13 @@ gp_in_kernel: } /* May run on IST stack. 
*/ -dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) +dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code) { +#ifdef CONFIG_DYNAMIC_FTRACE + /* ftrace must be first, everything else may cause a recursive crash */ + if (unlikely(modifying_ftrace_code) && ftrace_int3_handler(regs)) + return; +#endif #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 72a6cabb4d5b..0b5590330bca 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -286,6 +286,12 @@ struct ftrace_rec_iter *ftrace_rec_iter_start(void); struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter); struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter); +#define for_ftrace_rec_iter(iter) \ + for (iter = ftrace_rec_iter_start(); \ + iter; \ + iter = ftrace_rec_iter_next(iter)) + + int ftrace_update_record(struct dyn_ftrace *rec, int enable); int ftrace_test_record(struct dyn_ftrace *rec, int enable); void ftrace_run_stop_machine(int command); -- cgit v1.2.3 From 4a6d70c9505fef1d8906b1d61db3de5d8ecf9454 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Apr 2012 16:31:07 -0400 Subject: ftrace/x86: Remove the complex ftrace NMI handling code As ftrace function tracing would require modifying code that could be executed in NMI context, which is not stopped with stop_machine(), ftrace had to do a complex algorithm with various stages of setup and memory barriers to make it work. With the new breakpoint method, this is no longer required. The changes to the code can be done without any problem in NMI context, as well as without stop machine altogether. Remove the complex code as it is no longer needed. Also, a lot of the notrace annotations could be removed from the NMI code as it is now safe to trace them. 
With the exception of do_nmi itself, which does some special work to handle running in the debug stack. The breakpoint method can cause NMIs to double nest the debug stack if it's not setup properly, and that is done in do_nmi(), thus that function must not be traced. (Note the arch sh may want to do the same) Cc: Paul Mundt Cc: H. Peter Anvin Signed-off-by: Steven Rostedt --- arch/x86/Kconfig | 1 - arch/x86/kernel/ftrace.c | 169 +---------------------------------------------- arch/x86/kernel/nmi.c | 10 +-- 3 files changed, 6 insertions(+), 174 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1d14cc6b79ad..1324139612e1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -40,7 +40,6 @@ config X86 select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_GRAPH_FP_TEST select HAVE_FUNCTION_TRACE_MCOUNT_TEST - select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE select HAVE_SYSCALL_TRACEPOINTS select HAVE_KVM select HAVE_ARCH_KGDB diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 80af34739a9a..cf2d03ec1793 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -27,38 +27,18 @@ #include #include #include -#include - #ifdef CONFIG_DYNAMIC_FTRACE -/* - * modifying_code is set to notify NMIs that they need to use - * memory barriers when entering or exiting. But we don't want - * to burden NMIs with unnecessary memory barriers when code - * modification is not being done (which is most of the time). - * - * A mutex is already held when ftrace_arch_code_modify_prepare - * and post_process are called. No locks need to be taken here. - * - * Stop machine will make sure currently running NMIs are done - * and new NMIs will see the updated variable before we need - * to worry about NMIs doing memory barriers. 
- */ -static int modifying_code __read_mostly; -static DEFINE_PER_CPU(int, save_modifying_code); - int ftrace_arch_code_modify_prepare(void) { set_kernel_text_rw(); set_all_modules_text_rw(); - modifying_code = 1; return 0; } int ftrace_arch_code_modify_post_process(void) { - modifying_code = 0; set_all_modules_text_ro(); set_kernel_text_ro(); return 0; @@ -91,134 +71,6 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) return calc.code; } -/* - * Modifying code must take extra care. On an SMP machine, if - * the code being modified is also being executed on another CPU - * that CPU will have undefined results and possibly take a GPF. - * We use kstop_machine to stop other CPUS from exectuing code. - * But this does not stop NMIs from happening. We still need - * to protect against that. We separate out the modification of - * the code to take care of this. - * - * Two buffers are added: An IP buffer and a "code" buffer. - * - * 1) Put the instruction pointer into the IP buffer - * and the new code into the "code" buffer. - * 2) Wait for any running NMIs to finish and set a flag that says - * we are modifying code, it is done in an atomic operation. - * 3) Write the code - * 4) clear the flag. - * 5) Wait for any running NMIs to finish. - * - * If an NMI is executed, the first thing it does is to call - * "ftrace_nmi_enter". This will check if the flag is set to write - * and if it is, it will write what is in the IP and "code" buffers. - * - * The trick is, it does not matter if everyone is writing the same - * content to the code location. Also, if a CPU is executing code - * it is OK to write to that code location if the contents being written - * are the same as what exists. 
- */ - -#define MOD_CODE_WRITE_FLAG (1 << 31) /* set when NMI should do the write */ -static atomic_t nmi_running = ATOMIC_INIT(0); -static int mod_code_status; /* holds return value of text write */ -static void *mod_code_ip; /* holds the IP to write to */ -static const void *mod_code_newcode; /* holds the text to write to the IP */ - -static unsigned nmi_wait_count; -static atomic_t nmi_update_count = ATOMIC_INIT(0); - -int ftrace_arch_read_dyn_info(char *buf, int size) -{ - int r; - - r = snprintf(buf, size, "%u %u", - nmi_wait_count, - atomic_read(&nmi_update_count)); - return r; -} - -static void clear_mod_flag(void) -{ - int old = atomic_read(&nmi_running); - - for (;;) { - int new = old & ~MOD_CODE_WRITE_FLAG; - - if (old == new) - break; - - old = atomic_cmpxchg(&nmi_running, old, new); - } -} - -static void ftrace_mod_code(void) -{ - /* - * Yes, more than one CPU process can be writing to mod_code_status. - * (and the code itself) - * But if one were to fail, then they all should, and if one were - * to succeed, then they all should. 
- */ - mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, - MCOUNT_INSN_SIZE); - - /* if we fail, then kill any new writers */ - if (mod_code_status) - clear_mod_flag(); -} - -void ftrace_nmi_enter(void) -{ - __this_cpu_write(save_modifying_code, modifying_code); - - if (!__this_cpu_read(save_modifying_code)) - return; - - if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { - smp_rmb(); - ftrace_mod_code(); - atomic_inc(&nmi_update_count); - } - /* Must have previous changes seen before executions */ - smp_mb(); -} - -void ftrace_nmi_exit(void) -{ - if (!__this_cpu_read(save_modifying_code)) - return; - - /* Finish all executions before clearing nmi_running */ - smp_mb(); - atomic_dec(&nmi_running); -} - -static void wait_for_nmi_and_set_mod_flag(void) -{ - if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG)) - return; - - do { - cpu_relax(); - } while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG)); - - nmi_wait_count++; -} - -static void wait_for_nmi(void) -{ - if (!atomic_read(&nmi_running)) - return; - - do { - cpu_relax(); - } while (atomic_read(&nmi_running)); - - nmi_wait_count++; -} - static inline int within(unsigned long addr, unsigned long start, unsigned long end) { @@ -239,26 +91,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code) if (within(ip, (unsigned long)_text, (unsigned long)_etext)) ip = (unsigned long)__va(__pa(ip)); - mod_code_ip = (void *)ip; - mod_code_newcode = new_code; - - /* The buffers need to be visible before we let NMIs write them */ - smp_mb(); - - wait_for_nmi_and_set_mod_flag(); - - /* Make sure all running NMIs have finished before we write the code */ - smp_mb(); - - ftrace_mod_code(); - - /* Make sure the write happens before clearing the bit */ - smp_mb(); - - clear_mod_flag(); - wait_for_nmi(); - - return mod_code_status; + return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE); } static const unsigned char *ftrace_nop_replace(void) diff --git 
a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 47acaf319165..eb1539eac393 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -84,7 +84,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic); #define nmi_to_desc(type) (&nmi_desc[type]) -static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) +static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) { struct nmi_desc *desc = nmi_to_desc(type); struct nmiaction *a; @@ -209,7 +209,7 @@ void unregister_nmi_handler(unsigned int type, const char *name) EXPORT_SYMBOL_GPL(unregister_nmi_handler); -static notrace __kprobes void +static __kprobes void pci_serr_error(unsigned char reason, struct pt_regs *regs) { pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", @@ -236,7 +236,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) outb(reason, NMI_REASON_PORT); } -static notrace __kprobes void +static __kprobes void io_check_error(unsigned char reason, struct pt_regs *regs) { unsigned long i; @@ -263,7 +263,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs) outb(reason, NMI_REASON_PORT); } -static notrace __kprobes void +static __kprobes void unknown_nmi_error(unsigned char reason, struct pt_regs *regs) { int handled; @@ -305,7 +305,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) static DEFINE_PER_CPU(bool, swallow_nmi); static DEFINE_PER_CPU(unsigned long, last_nmi_rip); -static notrace __kprobes void default_do_nmi(struct pt_regs *regs) +static __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; int handled; -- cgit v1.2.3 From 0749708352fddbe0fa81fc25f96e3b1f77c655f4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 28 Apr 2012 14:27:38 -0700 Subject: x86: make word-at-a-time strncpy_from_user clear bytes at the end This makes the newly optimized x86 strncpy_from_user clear the final bytes in the word past the final NUL character, rather than 
copy them as the word they were in the source. NOTE! Unlike the silly semantics of the libc 'strncpy()' function, the kernel strncpy_from_user() has never cleared all of the end of the destination buffer. And neither does it do so now: it only clears the bytes at the end of the last word it copied. So why make this change at all? It doesn't really cost us anything extra (we have to calculate the mask to get the length anyway), and it means that *if* any user actually cares about zeroing the whole buffer, they can do a "memset()" before the strncpy_from_user(), and we will no longer write random bytes after the NUL character. In particular, the buffer contents will now at no point contain random source data from beyond the end of the string. In other words, it makes behavior a bit more repeatable at no new cost, so it's a small cleanup. I've been carrying this as a patch for the last few weeks or so in my tree (done at the same time the sign error was fixed in commit 12e993b89464), I might as well commit it. Signed-off-by: Linus Torvalds --- arch/x86/lib/usercopy.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index d6ae30bbd7bb..2e4e4b02c37a 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -44,13 +44,6 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) } EXPORT_SYMBOL_GPL(copy_from_user_nmi); -static inline unsigned long count_bytes(unsigned long mask) -{ - mask = (mask - 1) & ~mask; - mask >>= 7; - return count_masked_bytes(mask); -} - /* * Do a strncpy, return length of string without final '\0'. 
* 'count' is the user-supplied count (return 'count' if we @@ -69,16 +62,19 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long max = count; while (max >= sizeof(unsigned long)) { - unsigned long c; + unsigned long c, mask; /* Fall back to byte-at-a-time if we get a page fault */ if (unlikely(__get_user(c,(unsigned long __user *)(src+res)))) break; - /* This can write a few bytes past the NUL character, but that's ok */ + mask = has_zero(c); + if (mask) { + mask = (mask - 1) & ~mask; + mask >>= 7; + *(unsigned long *)(dst+res) = c & mask; + return res + count_masked_bytes(mask); + } *(unsigned long *)(dst+res) = c; - c = has_zero(c); - if (c) - return res + count_bytes(c); res += sizeof(unsigned long); max -= sizeof(unsigned long); } -- cgit v1.2.3 From 42a6bd2006c922143cee8d9ec7c4e27526d4d2a3 Mon Sep 17 00:00:00 2001 From: Kusanagi Kouichi Date: Sun, 1 Apr 2012 17:29:32 +0900 Subject: x86, relocs: Remove an unused variable sh_symtab is set but not used. Signed-off-by: Kusanagi Kouichi Signed-off-by: Jiri Kosina --- arch/x86/boot/compressed/relocs.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c index d3c0b0277666..fb7117a4ade1 100644 --- a/arch/x86/boot/compressed/relocs.c +++ b/arch/x86/boot/compressed/relocs.c @@ -403,13 +403,11 @@ static void print_absolute_symbols(void) for (i = 0; i < ehdr.e_shnum; i++) { struct section *sec = &secs[i]; char *sym_strtab; - Elf32_Sym *sh_symtab; int j; if (sec->shdr.sh_type != SHT_SYMTAB) { continue; } - sh_symtab = sec->symtab; sym_strtab = sec->link->strtab; for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) { Elf32_Sym *sym; -- cgit v1.2.3 From f227d4306cf30e1d5b6f231e8ef9006c34f3d186 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 16 Apr 2012 18:01:53 +0200 Subject: x86, MCE, AMD: Make APIC LVT thresholding interrupt optional Currently, the APIC LVT interrupt for error 
thresholding is implicitly enabled. However, there are models in the F15h range which do not enable it. Make the code machinery which sets up the APIC interrupt support an optional setting and add an ->interrupt_capable member to the bank representation mirroring that capability and enable the interrupt offset programming only if it is true. Simplify code and fixup comment style while at it. Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 56 ++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 99b57179f912..2c1d178be46e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -51,6 +51,7 @@ struct threshold_block { unsigned int cpu; u32 address; u16 interrupt_enable; + bool interrupt_capable; u16 threshold_limit; struct kobject kobj; struct list_head miscj; @@ -83,6 +84,21 @@ struct thresh_restart { u16 old_limit; }; +static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) +{ + /* + * bank 4 supports APIC LVT interrupts implicitly since forever. + */ + if (bank == 4) + return true; + + /* + * IntP: interrupt present; if this bit is set, the thresholding + * bank can generate APIC LVT interrupts + */ + return msr_high_bits & BIT(28); +} + static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) { int msr = (hi & MASK_LVTOFF_HI) >> 20; @@ -104,8 +120,10 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) return 1; }; -/* must be called with correct cpu affinity */ -/* Called via smp_call_function_single() */ +/* + * Called via smp_call_function_single(), must be called with correct + * cpu affinity. 
+ */ static void threshold_restart_bank(void *_tr) { struct thresh_restart *tr = _tr; @@ -128,6 +146,12 @@ static void threshold_restart_bank(void *_tr) (new_count & THRESHOLD_MAX); } + /* clear IntType */ + hi &= ~MASK_INT_TYPE_HI; + + if (!tr->b->interrupt_capable) + goto done; + if (tr->set_lvt_off) { if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) { /* set new lvt offset */ @@ -136,9 +160,10 @@ static void threshold_restart_bank(void *_tr) } } - tr->b->interrupt_enable ? - (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : - (hi &= ~MASK_INT_TYPE_HI); + if (tr->b->interrupt_enable) + hi |= INT_TYPE_APIC; + + done: hi |= MASK_COUNT_EN_HI; wrmsr(tr->b->address, lo, hi); @@ -202,14 +227,17 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (shared_bank[bank] && c->cpu_core_id) break; - offset = setup_APIC_mce(offset, - (high & MASK_LVTOFF_HI) >> 20); - memset(&b, 0, sizeof(b)); - b.cpu = cpu; - b.bank = bank; - b.block = block; - b.address = address; + b.cpu = cpu; + b.bank = bank; + b.block = block; + b.address = address; + b.interrupt_capable = lvt_interrupt_supported(bank, high); + + if (b.interrupt_capable) { + int new = (high & MASK_LVTOFF_HI) >> 20; + offset = setup_APIC_mce(offset, new); + } mce_threshold_block_init(&b, offset); mce_threshold_vector = amd_threshold_interrupt; @@ -309,6 +337,9 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) struct thresh_restart tr; unsigned long new; + if (!b->interrupt_capable) + return -EINVAL; + if (strict_strtoul(buf, 0, &new) < 0) return -EINVAL; @@ -467,6 +498,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, b->cpu = cpu; b->address = address; b->interrupt_enable = 0; + b->interrupt_capable = lvt_interrupt_supported(bank, high); b->threshold_limit = THRESHOLD_MAX; INIT_LIST_HEAD(&b->miscj); -- cgit v1.2.3 From d26ecc4894464318dce51d709e19dd9d88916bee Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 16 Apr 2012 18:20:36 +0200 Subject: x86, MCE, AMD: 
Hide interrupt_enable sysfs node Depending on whether the box supports the APIC LVT interrupt for thresholding, we want to show the 'interrupt_enable' sysfs node or not. Make that the case by adding it to the default sysfs attributes only if it is supported. Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 2c1d178be46e..f4873a64f46d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -421,10 +421,10 @@ RW_ATTR(threshold_limit); RW_ATTR(error_count); static struct attribute *default_attrs[] = { - &interrupt_enable.attr, &threshold_limit.attr, &error_count.attr, - NULL + NULL, /* possibly interrupt_enable if supported, see below */ + NULL, }; #define to_block(k) container_of(k, struct threshold_block, kobj) @@ -501,6 +501,11 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, b->interrupt_capable = lvt_interrupt_supported(bank, high); b->threshold_limit = THRESHOLD_MAX; + if (b->interrupt_capable) + threshold_ktype.default_attrs[2] = &interrupt_enable.attr; + else + threshold_ktype.default_attrs[2] = NULL; + INIT_LIST_HEAD(&b->miscj); if (per_cpu(threshold_banks, cpu)[bank]->blocks) { -- cgit v1.2.3 From 575203b4747c371698dd686b1fa6d0a3a0c47ac6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 Apr 2012 18:01:34 +0200 Subject: x86, MCE, AMD: Disable error thresholding bank 4 on some models Turn off MC4_MISC thresholding banks on models which have them but that particular processor implementation does not supply applicable error sources to be counted. 
Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d086a09c087d..888fbf9d0adf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1423,6 +1423,43 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) */ if (c->x86 == 6 && banks > 0) mce_banks[0].ctl = 0; + + /* + * Turn off MC4_MISC thresholding banks on those models since + * they're not supported there. + */ + if (c->x86 == 0x15 && + (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { + int i; + u64 val, hwcr; + bool need_toggle; + u32 msrs[] = { + 0x00000413, /* MC4_MISC0 */ + 0xc0000408, /* MC4_MISC1 */ + }; + + rdmsrl(MSR_K7_HWCR, hwcr); + + /* McStatusWrEn has to be set */ + need_toggle = !(hwcr & BIT(18)); + + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); + + for (i = 0; i < ARRAY_SIZE(msrs); i++) { + rdmsrl(msrs[i], val); + + /* CntP bit set? */ + if (val & BIT(62)) { + val &= ~BIT(62); + wrmsrl(msrs[i], val); + } + } + + /* restore old settings */ + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr); + } } if (c->x86_vendor == X86_VENDOR_INTEL) { -- cgit v1.2.3 From baa495d9de2af97310128bfc0e365a813b63d5bb Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 2 Apr 2012 18:31:53 -0700 Subject: x86/PCI: fix memleak with get_current_resources() In pci_acpi_scan_root(), when pci_use_crs is set, get_current_resources() is used to get pci_root_info, and it will allocate name and resource array. Later if pci_create_root_bus() cannot create bus (could be already there...) it will only free bus res list, but the name and res array are not freed. Let get_current_resources() take info pointer instead of using local info.
Signed-off-by: Yinghai Lu Signed-off-by: Bjorn Helgaas --- arch/x86/pci/acpi.c | 49 ++++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index ed2835e148b5..a99b7d75f5ca 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -315,49 +315,55 @@ static void add_resources(struct pci_root_info *info) } } +static void free_pci_root_info(struct pci_root_info *info) +{ + kfree(info->name); + kfree(info->res); + memset(info, 0, sizeof(struct pci_root_info)); +} + static void -get_current_resources(struct acpi_device *device, int busnum, +get_current_resources(struct pci_root_info *info, + struct acpi_device *device, int busnum, int domain, struct list_head *resources) { - struct pci_root_info info; size_t size; - info.bridge = device; - info.res_num = 0; - info.resources = resources; + info->bridge = device; + info->res_num = 0; + info->resources = resources; acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource, - &info); - if (!info.res_num) + info); + if (!info->res_num) return; - size = sizeof(*info.res) * info.res_num; - info.res = kmalloc(size, GFP_KERNEL); - if (!info.res) + size = sizeof(*info->res) * info->res_num; + info->res = kmalloc(size, GFP_KERNEL); + if (!info->res) return; - info.name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum); - if (!info.name) + info->name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum); + if (!info->name) goto name_alloc_fail; - info.res_num = 0; + info->res_num = 0; acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, - &info); + info); if (pci_use_crs) { - add_resources(&info); + add_resources(info); return; } - kfree(info.name); - name_alloc_fail: - kfree(info.res); + free_pci_root_info(info); } struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) { struct acpi_device *device = root->device; + struct pci_root_info info; 
int domain = root->segment; int busnum = root->secondary.start; LIST_HEAD(resources); @@ -402,6 +408,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) sd->domain = domain; sd->node = node; + memset(&info, 0, sizeof(struct pci_root_info)); /* * Maybe the desired pci bus has been already scanned. In such case * it is unnecessary to scan the pci bus with the given domain,busnum. @@ -415,7 +422,8 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) memcpy(bus->sysdata, sd, sizeof(*sd)); kfree(sd); } else { - get_current_resources(device, busnum, domain, &resources); + get_current_resources(&info, device, busnum, domain, + &resources); /* * _CRS with no apertures is normal, so only fall back to @@ -429,6 +437,9 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) bus->subordinate = pci_scan_child_bus(bus); else pci_free_resource_list(&resources); + + if (!bus && pci_use_crs) + free_pci_root_info(&info); } /* After the PCI-E bus has been walked and all devices discovered, -- cgit v1.2.3 From 9a03d28d9490b5a04f8b1d98fc08067c6f4f6189 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 2 Apr 2012 18:31:53 -0700 Subject: x86/PCI: refactor get_current_resources() Rename get_current_resources() to probe_pci_root_info. 1. Remove resource list head from pci_root_info 2. Make get_current_resources() not pass resources 3. 
Rename get_current_resources() to probe_pci_root_info() Signed-off-by: Yinghai Lu Signed-off-by: Bjorn Helgaas --- arch/x86/pci/acpi.c | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index a99b7d75f5ca..a858c1d9af53 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -12,7 +12,6 @@ struct pci_root_info { char *name; unsigned int res_num; struct resource *res; - struct list_head *resources; int busnum; }; @@ -287,7 +286,8 @@ static void coalesce_windows(struct pci_root_info *info, unsigned long type) } } -static void add_resources(struct pci_root_info *info) +static void add_resources(struct pci_root_info *info, + struct list_head *resources) { int i; struct resource *res, *root, *conflict; @@ -311,7 +311,7 @@ static void add_resources(struct pci_root_info *info) "ignoring host bridge window %pR (conflicts with %s %pR)\n", res, conflict->name, conflict); else - pci_add_resource(info->resources, res); + pci_add_resource(resources, res); } } @@ -323,41 +323,30 @@ static void free_pci_root_info(struct pci_root_info *info) } static void -get_current_resources(struct pci_root_info *info, - struct acpi_device *device, int busnum, - int domain, struct list_head *resources) +probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device, + int busnum, int domain) { size_t size; info->bridge = device; info->res_num = 0; - info->resources = resources; acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource, info); if (!info->res_num) return; size = sizeof(*info->res) * info->res_num; + info->res_num = 0; info->res = kmalloc(size, GFP_KERNEL); if (!info->res) return; info->name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum); if (!info->name) - goto name_alloc_fail; + return; - info->res_num = 0; acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, info); - - if (pci_use_crs) { - 
add_resources(info); - - return; - } - -name_alloc_fail: - free_pci_root_info(info); } struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) @@ -422,15 +411,18 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) memcpy(bus->sysdata, sd, sizeof(*sd)); kfree(sd); } else { - get_current_resources(&info, device, busnum, domain, - &resources); + probe_pci_root_info(&info, device, busnum, domain); /* * _CRS with no apertures is normal, so only fall back to * defaults or native bridge info if we're ignoring _CRS. */ - if (!pci_use_crs) + if (pci_use_crs) + add_resources(&info, &resources); + else { + free_pci_root_info(&info); x86_pci_root_bus_resources(busnum, &resources); + } bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd, &resources); if (bus) -- cgit v1.2.3 From fd3b0c1ea482e863d6a2556b6686e35bec7a4f1c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 2 Apr 2012 18:31:53 -0700 Subject: x86/PCI: add host bridge resource release for _CRS path 1. Allocate pci_root_info instead of using stack. We need to pass around info for release function. 2. Add release_pci_root_info 3. Set x86 host bridge release function to make sure root bridge related resources get freed during root bus removal. 
Signed-off-by: Yinghai Lu Signed-off-by: Bjorn Helgaas --- arch/x86/pci/acpi.c | 63 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index a858c1d9af53..2b74a161d215 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -315,11 +315,40 @@ static void add_resources(struct pci_root_info *info, } } -static void free_pci_root_info(struct pci_root_info *info) +static void free_pci_root_info_res(struct pci_root_info *info) { kfree(info->name); kfree(info->res); - memset(info, 0, sizeof(struct pci_root_info)); + info->res = NULL; + info->res_num = 0; +} + +static void __release_pci_root_info(struct pci_root_info *info) +{ + int i; + struct resource *res; + + for (i = 0; i < info->res_num; i++) { + res = &info->res[i]; + + if (!res->parent) + continue; + + if (!(res->flags & (IORESOURCE_MEM | IORESOURCE_IO))) + continue; + + release_resource(res); + } + + free_pci_root_info_res(info); + + kfree(info); +} +static void release_pci_root_info(struct pci_host_bridge *bridge) +{ + struct pci_root_info *info = bridge->release_data; + + __release_pci_root_info(info); } static void @@ -352,7 +381,7 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device, struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) { struct acpi_device *device = root->device; - struct pci_root_info info; + struct pci_root_info *info = NULL; int domain = root->segment; int busnum = root->secondary.start; LIST_HEAD(resources); @@ -397,7 +426,13 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) sd->domain = domain; sd->node = node; - memset(&info, 0, sizeof(struct pci_root_info)); + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + kfree(sd); + printk(KERN_WARNING "pci_bus %04x:%02x: " + "ignored (out of memory)\n", domain, busnum); + return NULL; + } /* * Maybe the desired pci bus has been 
already scanned. In such case * it is unnecessary to scan the pci bus with the given domain,busnum. @@ -409,29 +444,33 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) * be replaced by sd. */ memcpy(bus->sysdata, sd, sizeof(*sd)); + kfree(info); kfree(sd); } else { - probe_pci_root_info(&info, device, busnum, domain); + probe_pci_root_info(info, device, busnum, domain); /* * _CRS with no apertures is normal, so only fall back to * defaults or native bridge info if we're ignoring _CRS. */ if (pci_use_crs) - add_resources(&info, &resources); + add_resources(info, &resources); else { - free_pci_root_info(&info); + free_pci_root_info_res(info); x86_pci_root_bus_resources(busnum, &resources); } + bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd, &resources); - if (bus) + if (bus) { bus->subordinate = pci_scan_child_bus(bus); - else + pci_set_host_bridge_release( + to_pci_host_bridge(bus->bridge), + release_pci_root_info, info); + } else { pci_free_resource_list(&resources); - - if (!bus && pci_use_crs) - free_pci_root_info(&info); + __release_pci_root_info(info); + } } /* After the PCI-E bus has been walked and all devices discovered, -- cgit v1.2.3 From fe05725ff97530e26109a0c3d52cef7fff326e15 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 2 Apr 2012 18:31:53 -0700 Subject: x86/PCI: embed name into pci_root_info struct We now keep the pci_root_info struct for the entire lifetime of the host bridge, so just embed the name in the struct rather than allocating it separately. 
Signed-off-by: Yinghai Lu Signed-off-by: Bjorn Helgaas --- arch/x86/pci/acpi.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 2b74a161d215..23e7361b1747 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -9,7 +9,7 @@ struct pci_root_info { struct acpi_device *bridge; - char *name; + char name[16]; unsigned int res_num; struct resource *res; int busnum; @@ -317,7 +317,6 @@ static void add_resources(struct pci_root_info *info, static void free_pci_root_info_res(struct pci_root_info *info) { - kfree(info->name); kfree(info->res); info->res = NULL; info->res_num = 0; @@ -370,9 +369,7 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device, if (!info->res) return; - info->name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum); - if (!info->name) - return; + sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum); acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, info); -- cgit v1.2.3 From 35cb05e5bdac209cfdfafbe50d89ee7069cb6237 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 2 Apr 2012 18:31:53 -0700 Subject: x86/PCI: embed pci_sysdata into pci_root_info on ACPI path Embed the x86 struct pci_sysdata in the struct pci_root_info so it will be automatically freed in the remove path. 
Signed-off-by: Yinghai Lu Signed-off-by: Bjorn Helgaas --- arch/x86/pci/acpi.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 23e7361b1747..8a17b23f8c84 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -13,6 +13,7 @@ struct pci_root_info { unsigned int res_num; struct resource *res; int busnum; + struct pci_sysdata sd; }; static bool pci_use_crs = true; @@ -410,26 +411,16 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) if (node != -1 && !node_online(node)) node = -1; - /* Allocate per-root-bus (not per bus) arch-specific data. - * TODO: leak; this memory is never freed. - * It's arguable whether it's worth the trouble to care. - */ - sd = kzalloc(sizeof(*sd), GFP_KERNEL); - if (!sd) { + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { printk(KERN_WARNING "pci_bus %04x:%02x: " "ignored (out of memory)\n", domain, busnum); return NULL; } + sd = &info->sd; sd->domain = domain; sd->node = node; - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) { - kfree(sd); - printk(KERN_WARNING "pci_bus %04x:%02x: " - "ignored (out of memory)\n", domain, busnum); - return NULL; - } /* * Maybe the desired pci bus has been already scanned. In such case * it is unnecessary to scan the pci bus with the given domain,busnum. 
@@ -442,7 +433,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) */ memcpy(bus->sysdata, sd, sizeof(*sd)); kfree(info); - kfree(sd); } else { probe_pci_root_info(info, device, busnum, domain); @@ -484,9 +474,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) } } - if (!bus) - kfree(sd); - if (bus && node != -1) { #ifdef CONFIG_ACPI_NUMA if (pxm >= 0) -- cgit v1.2.3 From d28e5ac2a07e27638cf5ac061721b7969e17fe78 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 2 Apr 2012 18:31:54 -0700 Subject: x86/PCI: dynamically allocate pci_root_info for native host bridge drivers This dynamically allocates struct pci_root_info instead of using a static array. Signed-off-by: Yinghai Lu Signed-off-by: Bjorn Helgaas --- arch/x86/pci/amd_bus.c | 76 ++++++++++++++++----------------------------- arch/x86/pci/broadcom_bus.c | 12 +++---- arch/x86/pci/bus_numa.c | 69 +++++++++++++++++++++++++++------------- arch/x86/pci/bus_numa.h | 18 ++++++----- 4 files changed, 88 insertions(+), 87 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 0567df3890e1..459a7316375c 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -32,6 +32,18 @@ static struct pci_hostbridge_probe pci_probes[] __initdata = { #define RANGE_NUM 16 +static struct pci_root_info __init *find_pci_root_info(int node, int link) +{ + struct pci_root_info *info; + + /* find the position */ + list_for_each_entry(info, &pci_root_infos, list) + if (info->node == node && info->link == link) + return info; + + return NULL; +} + /** * early_fill_mp_bus_to_node() * called before pcibios_scan_root and pci_scan_bus @@ -50,7 +62,6 @@ static int __init early_fill_mp_bus_info(void) int def_link; struct pci_root_info *info; u32 reg; - struct resource *res; u64 start; u64 end; struct range range[RANGE_NUM]; @@ -86,7 +97,6 @@ static int __init early_fill_mp_bus_info(void) if (!found) return 0; - pci_root_num = 0; for (i 
= 0; i < 4; i++) { int min_bus; int max_bus; @@ -105,13 +115,8 @@ static int __init early_fill_mp_bus_info(void) #endif link = (reg >> 8) & 0x03; - info = &pci_root_info[pci_root_num]; - info->bus_min = min_bus; - info->bus_max = max_bus; - info->node = node; - info->link = link; + info = alloc_pci_root_info(min_bus, max_bus, node, link); sprintf(info->name, "PCI Bus #%02x", min_bus); - pci_root_num++; } /* get the default node and link for left over res */ @@ -134,16 +139,10 @@ static int __init early_fill_mp_bus_info(void) link = (reg >> 4) & 0x03; end = (reg & 0xfff000) | 0xfff; - /* find the position */ - for (j = 0; j < pci_root_num; j++) { - info = &pci_root_info[j]; - if (info->node == node && info->link == link) - break; - } - if (j == pci_root_num) + info = find_pci_root_info(node, link); + if (!info) continue; /* not found */ - info = &pci_root_info[j]; printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n", node, link, start, end); @@ -155,13 +154,8 @@ static int __init early_fill_mp_bus_info(void) } /* add left over io port range to def node/link, [0, 0xffff] */ /* find the position */ - for (j = 0; j < pci_root_num; j++) { - info = &pci_root_info[j]; - if (info->node == def_node && info->link == def_link) - break; - } - if (j < pci_root_num) { - info = &pci_root_info[j]; + info = find_pci_root_info(def_node, def_link); + if (info) { for (i = 0; i < RANGE_NUM; i++) { if (!range[i].end) continue; @@ -214,16 +208,10 @@ static int __init early_fill_mp_bus_info(void) end <<= 8; end |= 0xffff; - /* find the position */ - for (j = 0; j < pci_root_num; j++) { - info = &pci_root_info[j]; - if (info->node == node && info->link == link) - break; - } - if (j == pci_root_num) - continue; /* not found */ + info = find_pci_root_info(node, link); - info = &pci_root_info[j]; + if (!info) + continue; printk(KERN_DEBUG "node %d link %d: mmio [%llx, %llx]", node, link, start, end); @@ -291,14 +279,8 @@ static int __init early_fill_mp_bus_info(void) * add left over 
mmio range to def node/link ? * that is tricky, just record range in from start_min to 4G */ - for (j = 0; j < pci_root_num; j++) { - info = &pci_root_info[j]; - if (info->node == def_node && info->link == def_link) - break; - } - if (j < pci_root_num) { - info = &pci_root_info[j]; - + info = find_pci_root_info(def_node, def_link); + if (info) { for (i = 0; i < RANGE_NUM; i++) { if (!range[i].end) continue; @@ -309,20 +291,16 @@ static int __init early_fill_mp_bus_info(void) } } - for (i = 0; i < pci_root_num; i++) { - int res_num; + list_for_each_entry(info, &pci_root_infos, list) { int busnum; + struct pci_root_res *root_res; - info = &pci_root_info[i]; - res_num = info->res_num; busnum = info->bus_min; printk(KERN_DEBUG "bus: [%02x, %02x] on node %x link %x\n", info->bus_min, info->bus_max, info->node, info->link); - for (j = 0; j < res_num; j++) { - res = &info->res[j]; - printk(KERN_DEBUG "bus: %02x index %x %pR\n", - busnum, j, res); - } + list_for_each_entry(root_res, &info->resources, list) + printk(KERN_DEBUG "bus: %02x %pR\n", + busnum, &root_res->res); } return 0; diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c index f3a7c569a403..614392ced7d6 100644 --- a/arch/x86/pci/broadcom_bus.c +++ b/arch/x86/pci/broadcom_bus.c @@ -22,19 +22,15 @@ static void __init cnb20le_res(u8 bus, u8 slot, u8 func) { struct pci_root_info *info; + struct pci_root_res *root_res; struct resource res; u16 word1, word2; u8 fbus, lbus; - int i; - - info = &pci_root_info[pci_root_num]; - pci_root_num++; /* read the PCI bus numbers */ fbus = read_pci_config_byte(bus, slot, func, 0x44); lbus = read_pci_config_byte(bus, slot, func, 0x45); - info->bus_min = fbus; - info->bus_max = lbus; + info = alloc_pci_root_info(fbus, lbus, 0, 0); /* * Add the legacy IDE ports on bus 0 @@ -86,8 +82,8 @@ static void __init cnb20le_res(u8 bus, u8 slot, u8 func) res.flags = IORESOURCE_BUS; printk(KERN_INFO "CNB20LE PCI Host Bridge (domain 0000 %pR)\n", &res); - for (i = 0; i < 
info->res_num; i++) - printk(KERN_INFO "host bridge window %pR\n", &info->res[i]); + list_for_each_entry(root_res, &info->resources, list) + printk(KERN_INFO "host bridge window %pR\n", &root_res->res); } static int __init broadcom_postcore_init(void) diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c index fd3f65510e9d..306579f7d0fd 100644 --- a/arch/x86/pci/bus_numa.c +++ b/arch/x86/pci/bus_numa.c @@ -4,35 +4,38 @@ #include "bus_numa.h" -int pci_root_num; -struct pci_root_info pci_root_info[PCI_ROOT_NR]; +LIST_HEAD(pci_root_infos); -void x86_pci_root_bus_resources(int bus, struct list_head *resources) +static struct pci_root_info *x86_find_pci_root_info(int bus) { - int i; - int j; struct pci_root_info *info; - if (!pci_root_num) - goto default_resources; + if (list_empty(&pci_root_infos)) + return NULL; - for (i = 0; i < pci_root_num; i++) { - if (pci_root_info[i].bus_min == bus) - break; - } + list_for_each_entry(info, &pci_root_infos, list) + if (info->bus_min == bus) + return info; + + return NULL; +} - if (i == pci_root_num) +void x86_pci_root_bus_resources(int bus, struct list_head *resources) +{ + struct pci_root_info *info = x86_find_pci_root_info(bus); + struct pci_root_res *root_res; + + if (!info) goto default_resources; printk(KERN_DEBUG "PCI: root bus %02x: hardware-probed resources\n", bus); - info = &pci_root_info[i]; - for (j = 0; j < info->res_num; j++) { + list_for_each_entry(root_res, &info->resources, list) { struct resource *res; struct resource *root; - res = &info->res[j]; + res = &root_res->res; pci_add_resource(resources, res); if (res->flags & IORESOURCE_IO) root = &ioport_resource; @@ -53,11 +56,32 @@ default_resources: pci_add_resource(resources, &iomem_resource); } +struct pci_root_info __init *alloc_pci_root_info(int bus_min, int bus_max, + int node, int link) +{ + struct pci_root_info *info; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (!info) + return info; + + INIT_LIST_HEAD(&info->resources); + info->bus_min 
= bus_min; + info->bus_max = bus_max; + info->node = node; + info->link = link; + + list_add_tail(&info->list, &pci_root_infos); + + return info; +} + void __devinit update_res(struct pci_root_info *info, resource_size_t start, resource_size_t end, unsigned long flags, int merge) { - int i; struct resource *res; + struct pci_root_res *root_res; if (start > end) return; @@ -69,11 +93,11 @@ void __devinit update_res(struct pci_root_info *info, resource_size_t start, goto addit; /* try to merge it with old one */ - for (i = 0; i < info->res_num; i++) { + list_for_each_entry(root_res, &info->resources, list) { resource_size_t final_start, final_end; resource_size_t common_start, common_end; - res = &info->res[i]; + res = &root_res->res; if (res->flags != flags) continue; @@ -93,14 +117,15 @@ void __devinit update_res(struct pci_root_info *info, resource_size_t start, addit: /* need to add that */ - if (info->res_num >= RES_NUM) + root_res = kzalloc(sizeof(*root_res), GFP_KERNEL); + if (!root_res) return; - res = &info->res[info->res_num]; + res = &root_res->res; res->name = info->name; res->flags = flags; res->start = start; res->end = end; - res->child = NULL; - info->res_num++; + + list_add_tail(&root_res->list, &info->resources); } diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h index 804a4b40c31a..226a466b2b2b 100644 --- a/arch/x86/pci/bus_numa.h +++ b/arch/x86/pci/bus_numa.h @@ -4,22 +4,24 @@ * sub bus (transparent) will use entres from 3 to store extra from * root, so need to make sure we have enough slot there. 
*/ -#define RES_NUM 16 +struct pci_root_res { + struct list_head list; + struct resource res; +}; + struct pci_root_info { + struct list_head list; char name[12]; - unsigned int res_num; - struct resource res[RES_NUM]; + struct list_head resources; int bus_min; int bus_max; int node; int link; }; -/* 4 at this time, it may become to 32 */ -#define PCI_ROOT_NR 4 -extern int pci_root_num; -extern struct pci_root_info pci_root_info[PCI_ROOT_NR]; - +extern struct list_head pci_root_infos; +struct pci_root_info *alloc_pci_root_info(int bus_min, int bus_max, + int node, int link); extern void update_res(struct pci_root_info *info, resource_size_t start, resource_size_t end, unsigned long flags, int merge); #endif -- cgit v1.2.3 From c57ca65a6ea3171370cbb3010e5a3aea7399a5e1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 2 Apr 2012 18:31:54 -0700 Subject: x86/PCI: merge pcibios_scan_root() and pci_scan_bus_on_node() pcibios_scan_root() and pci_scan_bus_on_node() were almost identical, so this patch merges them. Signed-off-by: Yinghai Lu Signed-off-by: Bjorn Helgaas --- arch/x86/pci/common.c | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 323481e06ef8..8e04ec591543 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -430,9 +430,7 @@ void __init dmi_check_pciprobe(void) struct pci_bus * __devinit pcibios_scan_root(int busnum) { - LIST_HEAD(resources); struct pci_bus *bus = NULL; - struct pci_sysdata *sd; while ((bus = pci_find_next_bus(bus)) != NULL) { if (bus->number == busnum) { @@ -441,28 +439,10 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum) } } - /* Allocate per-root-bus (not per bus) arch-specific data. - * TODO: leak; this memory is never freed. - * It's arguable whether it's worth the trouble to care. 
- */ - sd = kzalloc(sizeof(*sd), GFP_KERNEL); - if (!sd) { - printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum); - return NULL; - } - - sd->node = get_mp_bus_to_node(busnum); - - printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum); - x86_pci_root_bus_resources(busnum, &resources); - bus = pci_scan_root_bus(NULL, busnum, &pci_root_ops, sd, &resources); - if (!bus) { - pci_free_resource_list(&resources); - kfree(sd); - } - - return bus; + return pci_scan_bus_on_node(busnum, &pci_root_ops, + get_mp_bus_to_node(busnum)); } + void __init pcibios_set_cache_line_size(void) { struct cpuinfo_x86 *c = &boot_cpu_data; @@ -656,6 +636,7 @@ struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops, } sd->node = node; x86_pci_root_bus_resources(busno, &resources); + printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busno); bus = pci_scan_root_bus(NULL, busno, ops, sd, &resources); if (!bus) { pci_free_resource_list(&resources); -- cgit v1.2.3 From 284f5f9dbac170b054c1e386ef92cbf654e91bba Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 30 Apr 2012 15:21:02 -0600 Subject: PCI: work around Stratus ftServer broken PCIe hierarchy A PCIe downstream port is a P2P bridge. Its secondary interface is a link that should lead only to device 0 (unless ARI is enabled)[1], so we don't probe for non-zero device numbers. Some Stratus ftServer systems have a PCIe downstream port (02:00.0) that leads to both an upstream port (03:00.0) and a downstream port (03:01.0), and 03:01.0 has important devices below it: [0000:02]-+-00.0-[03-3c]--+-00.0-[04-09]--... \-01.0-[0a-0d]--+-[USB] +-[NIC] +-... Previously, we didn't enumerate device 03:01.0, so USB and the network didn't work. This patch adds a DMI quirk to scan all device numbers, not just 0, below a downstream port. Based on a patch by Prarit Bhargava. 
[1] PCIe spec r3.0, sec 7.3.1 CC: Myron Stowe CC: Don Dutile CC: James Paradis CC: Matthew Wilcox CC: Jesse Barnes CC: Prarit Bhargava Signed-off-by: Bjorn Helgaas --- Documentation/kernel-parameters.txt | 3 +++ arch/x86/pci/common.c | 16 ++++++++++++++++ drivers/pci/pci.c | 3 +++ drivers/pci/probe.c | 8 ++++++-- include/asm-generic/pci-bridge.h | 6 ++++++ 5 files changed, 34 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index c1601e5a8b71..f995195409fd 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2161,6 +2161,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. on: Turn realloc on realloc same as realloc=on noari do not use PCIe ARI. + pcie_scan_all Scan all possible PCIe devices. Otherwise we + only look for one device below a PCIe downstream + port. pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power Management. 
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 323481e06ef8..16c5d7835295 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -229,6 +230,14 @@ static int __devinit assign_all_busses(const struct dmi_system_id *d) } #endif +static int __devinit set_scan_all(const struct dmi_system_id *d) +{ + printk(KERN_INFO "PCI: %s detected, enabling pci=pcie_scan_all\n", + d->ident); + pci_add_flags(PCI_SCAN_ALL_PCIE_DEVS); + return 0; +} + static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = { #ifdef __i386__ /* @@ -420,6 +429,13 @@ static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL585 G2"), }, }, + { + .callback = set_scan_all, + .ident = "Stratus/NEC ftServer", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ftServer"), + }, + }, {} }; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 111569ccab43..8e6c38817036 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "pci.h" @@ -3893,6 +3894,8 @@ static int __init pci_setup(char *str) pcie_bus_config = PCIE_BUS_PERFORMANCE; } else if (!strncmp(str, "pcie_bus_peer2peer", 18)) { pcie_bus_config = PCIE_BUS_PEER2PEER; + } else if (!strncmp(str, "pcie_scan_all", 13)) { + pci_add_flags(PCI_SCAN_ALL_PCIE_DEVS); } else { printk(KERN_ERR "PCI: Unknown option `%s'\n", str); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 5e1ca3c58a7d..2dc8675eea1a 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "pci.h" #define CARDBUS_LATENCY_TIMER 176 /* secondary latency timer */ @@ -1395,10 +1396,13 @@ static unsigned no_next_fn(struct pci_dev *dev, unsigned fn) static int only_one_child(struct pci_bus *bus) { struct pci_dev *parent = bus->self; + if (!parent || !pci_is_pcie(parent)) return 0; - if 
(parent->pcie_type == PCI_EXP_TYPE_ROOT_PORT || - parent->pcie_type == PCI_EXP_TYPE_DOWNSTREAM) + if (parent->pcie_type == PCI_EXP_TYPE_ROOT_PORT) + return 1; + if (parent->pcie_type == PCI_EXP_TYPE_DOWNSTREAM && + !pci_has_flag(PCI_SCAN_ALL_PCIE_DEVS)) return 1; return 0; } diff --git a/include/asm-generic/pci-bridge.h b/include/asm-generic/pci-bridge.h index a5b5d5a89a4f..20db2e5a0a69 100644 --- a/include/asm-generic/pci-bridge.h +++ b/include/asm-generic/pci-bridge.h @@ -30,6 +30,12 @@ enum { PCI_ENABLE_PROC_DOMAINS = 0x00000010, /* ... except for domain 0 */ PCI_COMPAT_DOMAIN_0 = 0x00000020, + + /* PCIe downstream ports are bridges that normally lead to only a + * device 0, but if this is set, we scan all possible devices, not + * just device 0. + */ + PCI_SCAN_ALL_PCIE_DEVS = 0x00000040, }; #ifdef CONFIG_PCI -- cgit v1.2.3 From 4a8e2a3115e7aa4bd2deb4c6483d47c743e0fbb3 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 28 Mar 2012 12:37:36 -0400 Subject: x86/apic: Replace io_apic_ops with x86_io_apic_ops. Which makes the code fit within the rest of the x86_ops functions. Acked-by: Suresh Siddha [v1: Changed x86_apic -> x86_ioapic per Yinghai Lu suggestion] [v2: Rebased on tip/x86/urgent and redid to match Ingo's syntax style] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/io_apic.h | 35 ++++++++++++++++++++----------- arch/x86/include/asm/x86_init.h | 9 +++++++- arch/x86/kernel/apic/io_apic.c | 46 ++++------------------------------------- arch/x86/kernel/setup.c | 2 +- arch/x86/kernel/x86_init.c | 8 +++++++ 5 files changed, 44 insertions(+), 56 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 2c4943de5150..73d8c5398ea9 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -5,7 +5,7 @@ #include #include #include - +#include /* * Intel IO-APIC support for SMP and UP systems. 
* @@ -21,15 +21,6 @@ #define IO_APIC_REDIR_LEVEL_TRIGGER (1 << 15) #define IO_APIC_REDIR_MASKED (1 << 16) -struct io_apic_ops { - void (*init) (void); - unsigned int (*read) (unsigned int apic, unsigned int reg); - void (*write) (unsigned int apic, unsigned int reg, unsigned int value); - void (*modify)(unsigned int apic, unsigned int reg, unsigned int value); -}; - -void __init set_io_apic_ops(const struct io_apic_ops *); - /* * The structure of the IO-APIC: */ @@ -156,7 +147,6 @@ struct io_apic_irq_attr; extern int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr); void setup_IO_APIC_irq_extra(u32 gsi); -extern void ioapic_and_gsi_init(void); extern void ioapic_insert_resources(void); int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr); @@ -185,12 +175,29 @@ extern void mp_save_irq(struct mpc_intsrc *m); extern void disable_ioapic_support(void); +extern void __init native_io_apic_init_mappings(void); +extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg); +extern void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int val); +extern void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val); + +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) +{ + return x86_io_apic_ops.read(apic, reg); +} + +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +{ + x86_io_apic_ops.write(apic, reg, value); +} +static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +{ + x86_io_apic_ops.modify(apic, reg, value); +} #else /* !CONFIG_X86_IO_APIC */ #define io_apic_assign_pci_irqs 0 #define setup_ioapic_ids_from_mpc x86_init_noop static const int timer_through_8259 = 0; -static inline void ioapic_and_gsi_init(void) { } static inline void ioapic_insert_resources(void) { } #define gsi_top (NR_IRQS_LEGACY) static inline int mp_find_ioapic(u32 gsi) { 
return 0; } @@ -212,6 +219,10 @@ static inline int restore_ioapic_entries(void) static inline void mp_save_irq(struct mpc_intsrc *m) { }; static inline void disable_ioapic_support(void) { } +#define native_io_apic_init_mappings NULL +#define native_io_apic_read NULL +#define native_io_apic_write NULL +#define native_io_apic_modify NULL #endif #endif /* _ASM_X86_IO_APIC_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 764b66a4cf89..c090af10ac7d 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -188,11 +188,18 @@ struct x86_msi_ops { void (*restore_msi_irqs)(struct pci_dev *dev, int irq); }; +struct x86_io_apic_ops { + void (*init) (void); + unsigned int (*read) (unsigned int apic, unsigned int reg); + void (*write) (unsigned int apic, unsigned int reg, unsigned int value); + void (*modify)(unsigned int apic, unsigned int reg, unsigned int value); +}; + extern struct x86_init_ops x86_init; extern struct x86_cpuinit_ops x86_cpuinit; extern struct x86_platform_ops x86_platform; extern struct x86_msi_ops x86_msi; - +extern struct x86_io_apic_ops x86_io_apic_ops; extern void x86_init_noop(void); extern void x86_init_uint_noop(unsigned int unused); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e88300d8e80a..973539c128a4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -68,24 +68,6 @@ #define for_each_irq_pin(entry, head) \ for (entry = head; entry; entry = entry->next) -static void __init __ioapic_init_mappings(void); - -static unsigned int __io_apic_read (unsigned int apic, unsigned int reg); -static void __io_apic_write (unsigned int apic, unsigned int reg, unsigned int val); -static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val); - -static struct io_apic_ops io_apic_ops = { - .init = __ioapic_init_mappings, - .read = __io_apic_read, - .write = __io_apic_write, - .modify = __io_apic_modify, -}; 
- -void __init set_io_apic_ops(const struct io_apic_ops *ops) -{ - io_apic_ops = *ops; -} - /* * Is the SiS APIC rmw bug present ? * -1 = don't know, 0 = no, 1 = yes @@ -313,21 +295,6 @@ static void free_irq_at(unsigned int at, struct irq_cfg *cfg) irq_free_desc(at); } -static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) -{ - return io_apic_ops.read(apic, reg); -} - -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -{ - io_apic_ops.write(apic, reg, value); -} - -static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) -{ - io_apic_ops.modify(apic, reg, value); -} - struct io_apic { unsigned int index; @@ -349,14 +316,14 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector) writel(vector, &io_apic->eoi); } -static unsigned int __io_apic_read(unsigned int apic, unsigned int reg) +unsigned int native_io_apic_read(unsigned int apic, unsigned int reg) { struct io_apic __iomem *io_apic = io_apic_base(apic); writel(reg, &io_apic->index); return readl(&io_apic->data); } -static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); @@ -370,7 +337,7 @@ static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int va * * Older SiS APIC requires we rewrite the index register */ -static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); @@ -3931,12 +3898,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics) return res; } -void __init ioapic_and_gsi_init(void) -{ - io_apic_ops.init(); -} - -static void __init __ioapic_init_mappings(void) +void __init native_io_apic_init_mappings(void) { unsigned 
long ioapic_phys, idx = FIX_IO_APIC_BASE_0; struct resource *ioapic_res; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1a2901562059..8526317c5f0b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1012,7 +1012,7 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); init_apic_mappings(); - ioapic_and_gsi_init(); + x86_io_apic_ops.init(); kvm_guest_init(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 9cf71d0b2d37..35c5e543f550 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -119,3 +120,10 @@ struct x86_msi_ops x86_msi = { .teardown_msi_irqs = default_teardown_msi_irqs, .restore_msi_irqs = default_restore_msi_irqs, }; + +struct x86_io_apic_ops x86_io_apic_ops = { + .init = native_io_apic_init_mappings, + .read = native_io_apic_read, + .write = native_io_apic_write, + .modify = native_io_apic_modify, +}; -- cgit v1.2.3 From 31b3c9d723407b395564d1fff3624cc0083ae520 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 20 Mar 2012 18:53:10 -0400 Subject: xen/x86: Implement x86_apic_ops Or rather just implement one different function as opposed to the native one : the read function. We synthesize the values. 
Acked-by: Suresh Siddha [v1: Rebased on top of tip/x86/urgent] [v2: Return 0xfd instead of 0xff in the default case] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/Makefile | 2 +- arch/x86/xen/apic.c | 17 +++++++++++++++++ arch/x86/xen/enlighten.c | 2 ++ arch/x86/xen/xen-ops.h | 4 ++++ 4 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 arch/x86/xen/apic.c (limited to 'arch/x86') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index add2c2d729ce..96ab2c09cb68 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -20,5 +20,5 @@ obj-$(CONFIG_EVENT_TRACING) += trace.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o -obj-$(CONFIG_XEN_DOM0) += vga.o +obj-$(CONFIG_XEN_DOM0) += apic.o vga.o obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c new file mode 100644 index 000000000000..73ade38caa32 --- /dev/null +++ b/arch/x86/xen/apic.c @@ -0,0 +1,17 @@ +#include +#include + +unsigned int xen_io_apic_read(unsigned apic, unsigned reg) +{ + if (reg == 0x1) + return 0x00170020; + else if (reg == 0x0) + return apic << 24; + + return 0xfd; +} + +void __init xen_init_apic(void) +{ + x86_io_apic_ops.read = xen_io_apic_read; +} diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a8f8844b8d32..c2ea9e9f420d 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1362,6 +1362,8 @@ asmlinkage void __init xen_start_kernel(void) xen_start_info->console.domU.mfn = 0; xen_start_info->console.domU.evtchn = 0; + xen_init_apic(); + /* Make sure ACS will be enabled */ pci_request_acs(); } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index b095739ccd4c..45c0c0667bd9 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -92,11 +92,15 @@ struct dom0_vga_console_info; #ifdef CONFIG_XEN_DOM0 void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size); +void 
__init xen_init_apic(void); #else static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) { } +static inline void __init xen_init_apic(void) +{ +} #endif /* Declare an asm function, along with symbols needed to make it -- cgit v1.2.3 From 27abd14bd9f1117dc7bdeee81a2de0557e077b61 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 16 Apr 2012 13:53:40 -0400 Subject: Revert "xen/x86: Workaround 'x86/ioapic: Add register level checks to detect bogus io-apic entries'" This reverts commit 2531d64b6fe2724dc432b67d8dc66bd45621da0b. The two patches: x86/apic: Replace io_apic_ops with x86_io_apic_ops. xen/x86: Implement x86_apic_ops take care of fixing it properly. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index b8e279479a6b..988828b479ed 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1859,7 +1859,6 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, #endif /* CONFIG_X86_64 */ static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; -static unsigned char fake_ioapic_mapping[PAGE_SIZE] __page_aligned_bss; static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) { @@ -1900,7 +1899,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) * We just don't map the IO APIC - all access is via * hypercalls. Keep the address in the pte for reference. */ - pte = pfn_pte(PFN_DOWN(__pa(fake_ioapic_mapping)), PAGE_KERNEL); + pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); break; #endif @@ -2065,7 +2064,6 @@ void __init xen_init_mmu_ops(void) pv_mmu_ops = xen_mmu_ops; memset(dummy_mapping, 0xff, PAGE_SIZE); - memset(fake_ioapic_mapping, 0xfd, PAGE_SIZE); } /* Protected by xen_reservation_lock. 
*/ -- cgit v1.2.3 From ab6ec39a191243b9968bb9ac7f26cc7ec30c618b Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Tue, 1 May 2012 00:16:27 +0800 Subject: xen/apic: implement io apic read with hypercall Implements xen_io_apic_read with a hypercall, so it returns proper IO-APIC information instead of a fabricated one. Fall back to returning emulated IO_APIC values if the hypercall fails. [v2: fallback to return an emulated IO_APIC values if hypercall fails] Signed-off-by: Lin Ming Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/apic.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 73ade38caa32..1913bf2d2a9c 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -1,8 +1,21 @@ #include #include +#include +#include +#include unsigned int xen_io_apic_read(unsigned apic, unsigned reg) { + struct physdev_apic apic_op; + int ret; + + apic_op.apic_physbase = mpc_ioapic_addr(apic); + apic_op.reg = reg; + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); + if (!ret) + return apic_op.value; + + /* fallback to return an emulated IO_APIC values */ if (reg == 0x1) return 0x00170020; else if (reg == 0x0) -- cgit v1.2.3 From 0f1103e40f9186bd2cdac4dde6c5bbd2f5273365 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 1 May 2012 17:25:18 -0600 Subject: x86/PCI: fix unused variable warning in amd_bus.c Fix this warning: arch/x86/pci/amd_bus.c: In function 'early_fill_mp_bus_info': arch/x86/pci/amd_bus.c:56:6: warning: unused variable 'j' [-Wunused-variable] introduced by commit d28e5ac2a07e ("x86/PCI: dynamically allocate pci_root_info for native host bridge drivers"). 
Reported-by: Stephen Rothwell Signed-off-by: Yinghai Lu Signed-off-by: Bjorn Helgaas --- arch/x86/pci/amd_bus.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 459a7316375c..5aed49bff058 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -44,6 +44,15 @@ static struct pci_root_info __init *find_pci_root_info(int node, int link) return NULL; } +static void __init set_mp_bus_range_to_node(int min_bus, int max_bus, int node) +{ +#ifdef CONFIG_NUMA + int j; + + for (j = min_bus; j <= max_bus; j++) + set_mp_bus_to_node(j, node); +#endif +} /** * early_fill_mp_bus_to_node() * called before pcibios_scan_root and pci_scan_bus @@ -53,7 +62,6 @@ static struct pci_root_info __init *find_pci_root_info(int node, int link) static int __init early_fill_mp_bus_info(void) { int i; - int j; unsigned bus; unsigned slot; int node; @@ -109,10 +117,7 @@ static int __init early_fill_mp_bus_info(void) min_bus = (reg >> 16) & 0xff; max_bus = (reg >> 24) & 0xff; node = (reg >> 4) & 0x07; -#ifdef CONFIG_NUMA - for (j = min_bus; j <= max_bus; j++) - set_mp_bus_to_node(j, node); -#endif + set_mp_bus_range_to_node(min_bus, max_bus, node); link = (reg >> 8) & 0x03; info = alloc_pci_root_info(min_bus, max_bus, node, link); -- cgit v1.2.3 From 078de5f706ece36afd73bb4b8283314132d2dfdf Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 8 Feb 2012 07:00:08 -0800 Subject: userns: Store uid and gid values in struct cred with kuid_t and kgid_t types cred.h and a few trivial users of struct cred are changed. The rest of the users of struct cred are left for other patches as there are too many changes to make in one go and leave the change reviewable. If the user namespace is disabled and CONFIG_UIDGID_STRICT_TYPE_CHECKS are disabled the code will continue to compile and behave correctly. Acked-by: Serge Hallyn Signed-off-by: Eric W. 
Biederman --- arch/x86/mm/fault.c | 2 +- fs/ioprio.c | 8 ++------ include/linux/cred.h | 16 ++++++++-------- include/linux/user_namespace.h | 8 ++++---- kernel/cred.c | 36 ++++++++++++++++++++++-------------- kernel/signal.c | 14 ++++++++------ kernel/sys.c | 26 +++++++++----------------- kernel/user_namespace.c | 4 ++-- mm/oom_kill.c | 4 ++-- security/commoncap.c | 3 +-- 10 files changed, 59 insertions(+), 62 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 3ecfd1aaf214..76dcd9d8e0bc 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -582,7 +582,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, pte_t *pte = lookup_address(address, &level); if (pte && pte_present(*pte) && !pte_exec(*pte)) - printk(nx_warning, current_uid()); + printk(nx_warning, from_kuid(&init_user_ns, current_uid())); } printk(KERN_ALERT "BUG: unable to handle kernel "); diff --git a/fs/ioprio.c b/fs/ioprio.c index 8e35e964d9ed..2072e41785d2 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -123,9 +123,7 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) break; do_each_thread(g, p) { - const struct cred *tcred = __task_cred(p); - kuid_t tcred_uid = make_kuid(tcred->user_ns, tcred->uid); - if (!uid_eq(tcred_uid, uid)) + if (!uid_eq(task_uid(p), uid)) continue; ret = set_task_ioprio(p, ioprio); if (ret) @@ -220,9 +218,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) break; do_each_thread(g, p) { - const struct cred *tcred = __task_cred(p); - kuid_t tcred_uid = make_kuid(tcred->user_ns, tcred->uid); - if (!uid_eq(tcred_uid, user->uid)) + if (!uid_eq(task_uid(p), user->uid)) continue; tmpio = get_task_ioprio(p); if (tmpio < 0) diff --git a/include/linux/cred.h b/include/linux/cred.h index 0ab3cda4a774..fac0579258fc 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -123,14 +123,14 @@ struct cred { #define CRED_MAGIC 0x43736564 #define CRED_MAGIC_DEAD 0x44656144 #endif - uid_t uid; /* real UID 
of the task */ - gid_t gid; /* real GID of the task */ - uid_t suid; /* saved UID of the task */ - gid_t sgid; /* saved GID of the task */ - uid_t euid; /* effective UID of the task */ - gid_t egid; /* effective GID of the task */ - uid_t fsuid; /* UID for VFS ops */ - gid_t fsgid; /* GID for VFS ops */ + kuid_t uid; /* real UID of the task */ + kgid_t gid; /* real GID of the task */ + kuid_t suid; /* saved UID of the task */ + kgid_t sgid; /* saved GID of the task */ + kuid_t euid; /* effective UID of the task */ + kgid_t egid; /* effective GID of the task */ + kuid_t fsuid; /* UID for VFS ops */ + kgid_t fsgid; /* GID for VFS ops */ unsigned securebits; /* SUID-less security management */ kernel_cap_t cap_inheritable; /* caps our children can inherit */ kernel_cap_t cap_permitted; /* caps we're permitted */ diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 4c9846d90741..a2c61457cba1 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -70,15 +70,15 @@ static inline void put_user_ns(struct user_namespace *ns) #endif static inline uid_t user_ns_map_uid(struct user_namespace *to, - const struct cred *cred, uid_t uid) + const struct cred *cred, kuid_t uid) { - return from_kuid_munged(to, make_kuid(cred->user_ns, uid)); + return from_kuid_munged(to, uid); } static inline gid_t user_ns_map_gid(struct user_namespace *to, - const struct cred *cred, gid_t gid) + const struct cred *cred, kgid_t gid) { - return from_kgid_munged(to, make_kgid(cred->user_ns, gid)); + return from_kgid_munged(to, gid); } #endif /* _LINUX_USER_H */ diff --git a/kernel/cred.c b/kernel/cred.c index 7a0d80669886..eddc5e2e9587 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -49,6 +49,14 @@ struct cred init_cred = { .subscribers = ATOMIC_INIT(2), .magic = CRED_MAGIC, #endif + .uid = GLOBAL_ROOT_UID, + .gid = GLOBAL_ROOT_GID, + .suid = GLOBAL_ROOT_UID, + .sgid = GLOBAL_ROOT_GID, + .euid = GLOBAL_ROOT_UID, + .egid = GLOBAL_ROOT_GID, + 
.fsuid = GLOBAL_ROOT_UID, + .fsgid = GLOBAL_ROOT_GID, .securebits = SECUREBITS_DEFAULT, .cap_inheritable = CAP_EMPTY_SET, .cap_permitted = CAP_FULL_SET, @@ -488,10 +496,10 @@ int commit_creds(struct cred *new) get_cred(new); /* we will require a ref for the subj creds too */ /* dumpability changes */ - if (old->euid != new->euid || - old->egid != new->egid || - old->fsuid != new->fsuid || - old->fsgid != new->fsgid || + if (!uid_eq(old->euid, new->euid) || + !gid_eq(old->egid, new->egid) || + !uid_eq(old->fsuid, new->fsuid) || + !gid_eq(old->fsgid, new->fsgid) || !cap_issubset(new->cap_permitted, old->cap_permitted)) { if (task->mm) set_dumpable(task->mm, suid_dumpable); @@ -500,9 +508,9 @@ int commit_creds(struct cred *new) } /* alter the thread keyring */ - if (new->fsuid != old->fsuid) + if (!uid_eq(new->fsuid, old->fsuid)) key_fsuid_changed(task); - if (new->fsgid != old->fsgid) + if (!gid_eq(new->fsgid, old->fsgid)) key_fsgid_changed(task); /* do it @@ -519,16 +527,16 @@ int commit_creds(struct cred *new) alter_cred_subscribers(old, -2); /* send notifications */ - if (new->uid != old->uid || - new->euid != old->euid || - new->suid != old->suid || - new->fsuid != old->fsuid) + if (!uid_eq(new->uid, old->uid) || + !uid_eq(new->euid, old->euid) || + !uid_eq(new->suid, old->suid) || + !uid_eq(new->fsuid, old->fsuid)) proc_id_connector(task, PROC_EVENT_UID); - if (new->gid != old->gid || - new->egid != old->egid || - new->sgid != old->sgid || - new->fsgid != old->fsgid) + if (!gid_eq(new->gid, old->gid) || + !gid_eq(new->egid, old->egid) || + !gid_eq(new->sgid, old->sgid) || + !gid_eq(new->fsgid, old->fsgid)) proc_id_connector(task, PROC_EVENT_GID); /* release the old obj and subj refs both */ diff --git a/kernel/signal.c b/kernel/signal.c index e2c5d84f2dac..2734dc965f69 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1038,8 +1038,10 @@ static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_str if (SI_FROMKERNEL(info)) return; - 
info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns), - current_cred(), info->si_uid); + rcu_read_lock(); + info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns), + make_kuid(current_user_ns(), info->si_uid)); + rcu_read_unlock(); } #else static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) @@ -1106,7 +1108,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_code = SI_USER; q->info.si_pid = task_tgid_nr_ns(current, task_active_pid_ns(t)); - q->info.si_uid = current_uid(); + q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); break; case (unsigned long) SEND_SIG_PRIV: q->info.si_signo = sig; @@ -1973,7 +1975,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why) info.si_signo = signr; info.si_code = exit_code; info.si_pid = task_pid_vnr(current); - info.si_uid = current_uid(); + info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); /* Let the debugger run. */ ptrace_stop(exit_code, why, 1, &info); @@ -2828,7 +2830,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) info.si_errno = 0; info.si_code = SI_USER; info.si_pid = task_tgid_vnr(current); - info.si_uid = current_uid(); + info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); return kill_something_info(sig, &info, pid); } @@ -2871,7 +2873,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) info.si_errno = 0; info.si_code = SI_TKILL; info.si_pid = task_tgid_vnr(current); - info.si_uid = current_uid(); + info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); return do_send_specific(tgid, pid, sig, &info); } diff --git a/kernel/sys.c b/kernel/sys.c index f0c43b4b6657..39962818c008 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -175,7 +175,6 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) const struct cred *cred = current_cred(); int error = -EINVAL; struct pid *pgrp; - kuid_t cred_uid; kuid_t uid; if (which > PRIO_USER || which < PRIO_PROCESS) 
@@ -209,22 +208,19 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - cred_uid = make_kuid(cred->user_ns, cred->uid); uid = make_kuid(cred->user_ns, who); user = cred->user; if (!who) - uid = cred_uid; - else if (!uid_eq(uid, cred_uid) && + uid = cred->uid; + else if (!uid_eq(uid, cred->uid) && !(user = find_user(uid))) goto out_unlock; /* No processes for this user */ do_each_thread(g, p) { - const struct cred *tcred = __task_cred(p); - kuid_t tcred_uid = make_kuid(tcred->user_ns, tcred->uid); - if (uid_eq(tcred_uid, uid)) + if (uid_eq(task_uid(p), uid)) error = set_one_prio(p, niceval, error); } while_each_thread(g, p); - if (!uid_eq(uid, cred_uid)) + if (!uid_eq(uid, cred->uid)) free_uid(user); /* For find_user() */ break; } @@ -248,7 +244,6 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) const struct cred *cred = current_cred(); long niceval, retval = -ESRCH; struct pid *pgrp; - kuid_t cred_uid; kuid_t uid; if (which > PRIO_USER || which < PRIO_PROCESS) @@ -280,25 +275,22 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - cred_uid = make_kuid(cred->user_ns, cred->uid); uid = make_kuid(cred->user_ns, who); user = cred->user; if (!who) - uid = cred_uid; - else if (!uid_eq(uid, cred_uid) && + uid = cred->uid; + else if (!uid_eq(uid, cred->uid) && !(user = find_user(uid))) goto out_unlock; /* No processes for this user */ do_each_thread(g, p) { - const struct cred *tcred = __task_cred(p); - kuid_t tcred_uid = make_kuid(tcred->user_ns, tcred->uid); - if (uid_eq(tcred_uid, uid)) { + if (uid_eq(task_uid(p), uid)) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; } } while_each_thread(g, p); - if (!uid_eq(uid, cred_uid)) + if (!uid_eq(uid, cred->uid)) free_uid(user); /* for find_user() */ break; } @@ -641,7 +633,7 @@ static int set_user(struct cred *new) { struct user_struct 
*new_user; - new_user = alloc_uid(make_kuid(new->user_ns, new->uid)); + new_user = alloc_uid(new->uid); if (!new_user) return -EAGAIN; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 7eff867bfac5..86602316422d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -36,8 +36,8 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, int create_user_ns(struct cred *new) { struct user_namespace *ns, *parent_ns = new->user_ns; - kuid_t owner = make_kuid(new->user_ns, new->euid); - kgid_t group = make_kgid(new->user_ns, new->egid); + kuid_t owner = new->euid; + kgid_t group = new->egid; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 46bf2ed5594c..9f09a1fde9f9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -410,8 +410,8 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas } pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", - task->pid, task_uid(task), task->tgid, - task->mm->total_vm, get_mm_rss(task->mm), + task->pid, from_kuid(&init_user_ns, task_uid(task)), + task->tgid, task->mm->total_vm, get_mm_rss(task->mm), task_cpu(task), task->signal->oom_adj, task->signal->oom_score_adj, task->comm); task_unlock(task); diff --git a/security/commoncap.c b/security/commoncap.c index f2399d8afbe0..dbd465a59286 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -77,8 +77,7 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns, { for (;;) { /* The owner of the user namespace has all caps. */ - if (targ_ns != &init_user_ns && uid_eq(targ_ns->owner, - make_kuid(cred->user_ns, cred->euid))) + if (targ_ns != &init_user_ns && uid_eq(targ_ns->owner, cred->euid)) return 0; /* Do we have the necessary capabilities? 
*/ -- cgit v1.2.3 From 59a094c994a138049b41a44bc29cff9407d51c5b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 4 May 2012 09:26:16 -0400 Subject: ftrace/x86: Use asm/kprobes.h instead of linux/kprobes.h If CONFIG_KPROBES is not set, then linux/kprobes.h will not include asm/kprobes.h needed by x86/ftrace.c for the BREAKPOINT macro. The x86/ftrace.c file should just include asm/kprobes.h as it does not need the rest of kprobes. Reported-by: Ingo Molnar Cc: Masami Hiramatsu Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index cf2d03ec1793..4243e8bbdcb1 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -20,11 +20,11 @@ #include #include #include -#include #include #include +#include #include #include -- cgit v1.2.3 From 45046892ef89c1e0caad66a03c8c1e14ad478d23 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 May 2012 09:03:01 +0000 Subject: x86: Use generic init_task Same code. Use the generic version. The special Makefile treatment is pointless anyway as init_task.o contains only data which is handled by the linker script. So no point on being treated like head text. 
Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20120503085035.739963562@linutronix.de Cc: x86@kernel.org --- arch/x86/Kconfig | 1 + arch/x86/Makefile | 1 - arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/init_task.c | 42 ------------------------------------------ arch/x86/kernel/process.c | 9 +++++++++ 5 files changed, 11 insertions(+), 44 deletions(-) delete mode 100644 arch/x86/kernel/init_task.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 046bf4bd2510..224695938400 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -83,6 +83,7 @@ config X86 select GENERIC_IOMAP select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC select GENERIC_SMP_IDLE_THREAD + select HAVE_GENERIC_INIT_TASK config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 41a7237606a3..3e48b26f67d5 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -146,7 +146,6 @@ archheaders: head-y := arch/x86/kernel/head_$(BITS).o head-y += arch/x86/kernel/head$(BITS).o head-y += arch/x86/kernel/head.o -head-y += arch/x86/kernel/init_task.o libs-y += arch/x86/lib/ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 532d2e090e6f..56ebd1f98447 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -2,7 +2,7 @@ # Makefile for the linux kernel. 
# -extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds +extra-y := head_$(BITS).o head$(BITS).o head.o vmlinux.lds CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c deleted file mode 100644 index 43e9ccf44947..000000000000 --- a/arch/x86/kernel/init_task.c +++ /dev/null @@ -1,42 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -static struct signal_struct init_signals = INIT_SIGNALS(init_signals); -static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); - -/* - * Initial thread structure. - * - * We need to make sure that this is THREAD_SIZE aligned due to the - * way process stacks are handled. This is done by having a special - * "init_task" linker map entry.. - */ -union thread_union init_thread_union __init_task_data = - { INIT_THREAD_INFO(init_task) }; - -/* - * Initial task structure. - * - * All other task structs will be allocated on slabs in fork.c - */ -struct task_struct init_task = INIT_TASK(init_task); -EXPORT_SYMBOL(init_task); - -/* - * per-CPU TSS segments. Threads are completely 'soft' on Linux, - * no more per-task TSS's. The TSS size is kept cacheline-aligned - * so they are allowed to end up in the .data..cacheline_aligned - * section. Since TSS's are completely CPU-local, we want them - * on exact cacheline boundaries, to eliminate cacheline ping-pong. - */ -DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; - diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 1d92a5ab6e8b..8aa532fa015d 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -27,6 +27,15 @@ #include #include +/* + * per-CPU TSS segments. Threads are completely 'soft' on Linux, + * no more per-task TSS's. The TSS size is kept cacheline-aligned + * so they are allowed to end up in the .data..cacheline_aligned + * section. 
Since TSS's are completely CPU-local, we want them + * on exact cacheline boundaries, to eliminate cacheline ping-pong. + */ +DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; + #ifdef CONFIG_X86_64 static DEFINE_PER_CPU(unsigned char, is_idle); static ATOMIC_NOTIFIER_HEAD(idle_notifier); -- cgit v1.2.3 From a6359d1eec43d1fd6ffbac958149844873e0084f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 May 2012 09:03:02 +0000 Subject: init_task: Replace CONFIG_HAVE_GENERIC_INIT_TASK Now that all archs except ia64 are converted, replace the config and let the ia64 select CONFIG_ARCH_INIT_TASK Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20120503085035.867948914@linutronix.de --- arch/Kconfig | 3 ++- arch/alpha/Kconfig | 1 - arch/arm/Kconfig | 1 - arch/avr32/Kconfig | 1 - arch/blackfin/Kconfig | 1 - arch/c6x/Kconfig | 1 - arch/cris/Kconfig | 1 - arch/frv/Kconfig | 1 - arch/h8300/Kconfig | 1 - arch/hexagon/Kconfig | 1 - arch/ia64/Kconfig | 1 + arch/m32r/Kconfig | 1 - arch/m68k/Kconfig | 1 - arch/microblaze/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/mn10300/Kconfig | 1 - arch/openrisc/Kconfig | 1 - arch/parisc/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/score/Kconfig | 1 - arch/sh/Kconfig | 1 - arch/sparc/Kconfig | 1 - arch/tile/Kconfig | 1 - arch/um/Kconfig.common | 1 - arch/unicore32/Kconfig | 1 - arch/x86/Kconfig | 1 - arch/xtensa/Kconfig | 1 - init/Makefile | 5 ++++- 29 files changed, 7 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index 2dd8fdd7ea9f..597b132b3902 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -148,7 +148,8 @@ config USE_GENERIC_SMP_HELPERS config GENERIC_SMP_IDLE_THREAD bool -config HAVE_GENERIC_INIT_TASK +# Select if arch init_task initializer is different to init/init_task.c +config ARCH_INIT_TASK bool config HAVE_REGS_AND_STACK_ACCESS_API diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 74d000480b69..991b8bbff4ff 
100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -16,7 +16,6 @@ config ALPHA select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_SMP_IDLE_THREAD - select HAVE_GENERIC_INIT_TASK help The Alpha is a 64-bit general-purpose processor designed and marketed by the Digital Equipment Corporation of blessed memory, diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 8b365353a10d..cb253ce218a0 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -35,7 +35,6 @@ config ARM select GENERIC_PCI_IOMAP select HAVE_BPF_JIT if NET select GENERIC_SMP_IDLE_THREAD - select HAVE_GENERIC_INIT_TASK help The ARM series is a line of low-power-consumption RISC chip designs licensed by ARM Ltd and targeted at embedded applications and diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index f4289ca78b55..3dea7231f637 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -12,7 +12,6 @@ config AVR32 select HARDIRQS_SW_RESEND select GENERIC_IRQ_SHOW select ARCH_HAVE_NMI_SAFE_CMPXCHG - select HAVE_GENERIC_INIT_TASK help AVR32 is a high-performance 32-bit RISC microprocessor core, designed for cost-sensitive embedded applications, with particular diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index 8570d6e21807..779b9c846fd7 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -38,7 +38,6 @@ config BLACKFIN select IRQ_PER_CPU if SMP select HAVE_NMI_WATCHDOG if NMI_WATCHDOG select GENERIC_SMP_IDLE_THREAD - select HAVE_GENERIC_INIT_TASK config GENERIC_CSUM def_bool y diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index 4189fb52d519..1c3ccd416d50 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -10,7 +10,6 @@ config TMS320C6X select HAVE_ARCH_TRACEHOOK select HAVE_DMA_API_DEBUG select HAVE_GENERIC_HARDIRQS - select HAVE_GENERIC_INIT_TASK select HAVE_MEMBLOCK select SPARSE_IRQ select IRQ_DOMAIN diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 15e30a771a72..2995035812ec 100644 --- a/arch/cris/Kconfig +++ 
b/arch/cris/Kconfig @@ -50,7 +50,6 @@ config CRIS select GENERIC_IRQ_SHOW select GENERIC_IOMAP select GENERIC_SMP_IDLE_THREAD if ETRAX_ARCH_V32 - select HAVE_GENERIC_INIT_TASK config HZ int diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index ed6dbd290c42..a685910d2d5c 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -9,7 +9,6 @@ config FRV select GENERIC_IRQ_SHOW select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_CPU_DEVICES - select HAVE_GENERIC_INIT_TASK config ZONE_DMA bool diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 5fac425aece4..56e890df5053 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -5,7 +5,6 @@ config H8300 select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES - select HAVE_GENERIC_INIT_TASK config SYMBOL_PREFIX string diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 6ee5488ed305..d2e4a3330336 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -28,7 +28,6 @@ config HEXAGON select NO_IOPORT select GENERIC_IOMAP select GENERIC_SMP_IDLE_THREAD - select HAVE_GENERIC_INIT_TASK # mostly generic routines, with some accelerated ones ---help--- Qualcomm Hexagon is a processor architecture designed for high diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 11975475516a..022ea3a9d1ab 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -34,6 +34,7 @@ config IA64 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP select GENERIC_SMP_IDLE_THREAD + select ARCH_INIT_TASK default y help The Itanium Processor Family is Intel's 64-bit successor to diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 8b8bd7fa148a..ef80a6546ff2 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -11,7 +11,6 @@ config M32R select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW select GENERIC_ATOMIC64 - select HAVE_GENERIC_INIT_TASK config SBUS bool diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 1891127c7db0..d318c606c888 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig 
@@ -8,7 +8,6 @@ config M68K select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS select GENERIC_CPU_DEVICES select FPU if MMU - select HAVE_GENERIC_INIT_TASK config RWSEM_GENERIC_SPINLOCK bool diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 21ccba6a05f9..ac22dc7f4cab 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -22,7 +22,6 @@ config MICROBLAZE select GENERIC_PCI_IOMAP select GENERIC_CPU_DEVICES select GENERIC_ATOMIC64 - select HAVE_GENERIC_INIT_TASK config SWAP def_bool n diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index d6c78901e5f2..186fc8cf9ee0 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -30,7 +30,6 @@ config MIPS select HAVE_MEMBLOCK_NODE_MAP select ARCH_DISCARD_MEMBLOCK select GENERIC_SMP_IDLE_THREAD - select HAVE_GENERIC_INIT_TASK menu "Machine selection" diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index d28b6eb1b122..3aa3de017159 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -6,7 +6,6 @@ config MN10300 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_KGDB select HAVE_NMI_WATCHDOG if MN10300_WD_TIMER - select HAVE_GENERIC_INIT_TASK config AM33_2 def_bool n diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 6d921936f4ab..a4787197d8fe 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -17,7 +17,6 @@ config OPENRISC select GENERIC_IOMAP select GENERIC_CPU_DEVICES select GENERIC_ATOMIC64 - select HAVE_GENERIC_INIT_TASK config MMU def_bool y diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 4c6ca0de90cc..ddb8b24b823d 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -18,7 +18,6 @@ config PARISC select IRQ_PER_CPU select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_SMP_IDLE_THREAD - select HAVE_GENERIC_INIT_TASK help The PA-RISC microprocessor is designed by Hewlett-Packard and used diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 946e8816ecd3..c81553508366 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig 
@@ -145,7 +145,6 @@ config PPC select HAVE_ARCH_JUMP_LABEL select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_SMP_IDLE_THREAD - select HAVE_GENERIC_INIT_TASK config EARLY_PRINTK bool diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 6c0eb214ab27..15cab3ee44e8 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -123,7 +123,6 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_IRQ select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE select GENERIC_SMP_IDLE_THREAD - select HAVE_GENERIC_INIT_TASK config SCHED_OMIT_FRAME_POINTER def_bool y diff --git a/arch/score/Kconfig b/arch/score/Kconfig index c760bccfad40..4b285779ac05 100644 --- a/arch/score/Kconfig +++ b/arch/score/Kconfig @@ -9,7 +9,6 @@ config SCORE select HAVE_MEMBLOCK_NODE_MAP select ARCH_DISCARD_MEMBLOCK select GENERIC_CPU_DEVICES - select HAVE_GENERIC_INIT_TASK choice prompt "System type" diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index a0cd70be8656..244cfd0dbb7b 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -29,7 +29,6 @@ config SUPERH select GENERIC_ATOMIC64 select GENERIC_IRQ_SHOW select GENERIC_SMP_IDLE_THREAD - select HAVE_GENERIC_INIT_TASK help The SuperH is a RISC processor targeted for use in embedded systems and consumer electronics; it was also used in the Sega Dreamcast diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 99aad7cd0075..e417f35d5912 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -31,7 +31,6 @@ config SPARC select GENERIC_PCI_IOMAP select HAVE_NMI_WATCHDOG if SPARC64 select GENERIC_SMP_IDLE_THREAD - select HAVE_GENERIC_INIT_TASK config SPARC32 def_bool !64BIT diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 4fa3ff5a7bc3..96033e2d6845 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -13,7 +13,6 @@ config TILE select GENERIC_IRQ_SHOW select SYS_HYPERVISOR select ARCH_HAVE_NMI_SAFE_CMPXCHG - select HAVE_GENERIC_INIT_TASK # FIXME: investigate whether we need/want these options. 
# select HAVE_IOREMAP_PROT diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common index f03473cf86df..20a49ba93cb9 100644 --- a/arch/um/Kconfig.common +++ b/arch/um/Kconfig.common @@ -10,7 +10,6 @@ config UML select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES select GENERIC_IO - select HAVE_GENERIC_INIT_TASK config MMU bool diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index e24ca398120e..eeb8054c7cd8 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -13,7 +13,6 @@ config UNICORE32 select GENERIC_IRQ_SHOW select ARCH_WANT_FRAME_POINTERS select GENERIC_IOMAP - select HAVE_GENERIC_INIT_TASK help UniCore-32 is 32-bit Instruction Set Architecture, including a series of low-power-consumption RISC chip diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 224695938400..046bf4bd2510 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -83,7 +83,6 @@ config X86 select GENERIC_IOMAP select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC select GENERIC_SMP_IDLE_THREAD - select HAVE_GENERIC_INIT_TASK config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index d0ab5bb0d582..8a3f8351f438 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -10,7 +10,6 @@ config XTENSA select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES - select HAVE_GENERIC_INIT_TASK help Xtensa processors are 32-bit RISC machines designed by Tensilica primarily for embedded systems. 
These processors are both diff --git a/init/Makefile b/init/Makefile index c55eac955cdc..7bc47ee31c36 100644 --- a/init/Makefile +++ b/init/Makefile @@ -9,7 +9,10 @@ else obj-$(CONFIG_BLK_DEV_INITRD) += initramfs.o endif obj-$(CONFIG_GENERIC_CALIBRATE_DELAY) += calibrate.o -obj-$(CONFIG_HAVE_GENERIC_INIT_TASK) += init_task.o + +ifneq ($(CONFIG_ARCH_INIT_TASK),y) +obj-y += init_task.o +endif mounts-y := do_mounts.o mounts-$(CONFIG_BLK_DEV_RAM) += do_mounts_rd.o -- cgit v1.2.3 From 57c22e5f35aa4b9b2fe11f73f3e62bbf9ef36190 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 2 May 2012 17:55:56 +0300 Subject: KVM: fix cpuid eax for KVM leaf cpuid eax should return the max leaf so that guests can find out the valid range. This matches Xen et al. Update documentation to match. Tested with -cpu host. Signed-off-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/cpuid.txt | 6 +++++- arch/x86/kvm/cpuid.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt index 882068538c9c..83afe65d4966 100644 --- a/Documentation/virtual/kvm/cpuid.txt +++ b/Documentation/virtual/kvm/cpuid.txt @@ -10,11 +10,15 @@ a guest. KVM cpuid functions are: function: KVM_CPUID_SIGNATURE (0x40000000) -returns : eax = 0, +returns : eax = 0x40000001, ebx = 0x4b4d564b, ecx = 0x564b4d56, edx = 0x4d. Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM". +The value in eax corresponds to the maximum cpuid function present in this leaf, +and will be updated if more functions are added in the future. +Note also that old hosts set eax value to 0x0. This should +be interpreted as if the value was 0x40000001. This function queries the presence of KVM cpuid leafs. 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c2134b881033..7df1c6d839fb 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -398,7 +398,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, case KVM_CPUID_SIGNATURE: { char signature[12] = "KVMKVMKVM\0\0"; u32 *sigptr = (u32 *)signature; - entry->eax = 0; + entry->eax = KVM_CPUID_FEATURES; entry->ebx = sigptr[0]; entry->ecx = sigptr[1]; entry->edx = sigptr[2]; -- cgit v1.2.3 From 9b72d3b07dd99ac8ab2b84de5004a295af460536 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 30 Apr 2012 14:45:49 +0300 Subject: KVM guest: make kvm_para_available() check hypervisor bit reading cpuid leaf This cpuid range does not exist on real HW and Intel spec says that "Information returned for highest basic information leaf" will be returned. Not very well defined. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_para.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 99c4bbe0cca2..a7a7a94b94ce 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -178,14 +178,16 @@ static inline int kvm_para_available(void) unsigned int eax, ebx, ecx, edx; char signature[13]; - cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); - memcpy(signature + 0, &ebx, 4); - memcpy(signature + 4, &ecx, 4); - memcpy(signature + 8, &edx, 4); - signature[12] = 0; - - if (strcmp(signature, "KVMKVMKVM") == 0) - return 1; + if (cpu_has_hypervisor) { + cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); + memcpy(signature + 0, &ebx, 4); + memcpy(signature + 4, &ecx, 4); + memcpy(signature + 8, &edx, 4); + signature[12] = 0; + + if (strcmp(signature, "KVMKVMKVM") == 0) + return 1; + } return 0; } -- cgit v1.2.3 From 1c2545be05f436523cabc54087c6a60ea10110d3 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 30 Apr 
2012 17:46:31 +0900 Subject: KVM: x86 emulator: Move ModRM flags for groups to top level opcode tables Needed for the following patch which simplifies ModRM fetching code. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 111 +++++++++++++++++++++++++------------------------ 1 file changed, 56 insertions(+), 55 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0d151e232480..8d2c3d04cfec 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3359,8 +3359,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) .check_perm = (_p) } #define N D(0) #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } -#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } -#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) } +#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } +#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } #define II(_f, _e, _i) \ { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } @@ -3380,25 +3380,25 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) static struct opcode group7_rm1[] = { - DI(SrcNone | ModRM | Priv, monitor), - DI(SrcNone | ModRM | Priv, mwait), + DI(SrcNone | Priv, monitor), + DI(SrcNone | Priv, mwait), N, N, N, N, N, N, }; static struct opcode group7_rm3[] = { - DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa), - II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall), - DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa), - DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa), - DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme), - DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme), - DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme), - DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme), + 
DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), + II(SrcNone | Prot | VendorSpecific, em_vmmcall, vmmcall), + DIP(SrcNone | Prot | Priv, vmload, check_svme_pa), + DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa), + DIP(SrcNone | Prot | Priv, stgi, check_svme), + DIP(SrcNone | Prot | Priv, clgi, check_svme), + DIP(SrcNone | Prot | Priv, skinit, check_svme), + DIP(SrcNone | Prot | Priv, invlpga, check_svme), }; static struct opcode group7_rm7[] = { N, - DIP(SrcNone | ModRM, rdtscp, check_rdtsc), + DIP(SrcNone, rdtscp, check_rdtsc), N, N, N, N, N, N, }; @@ -3414,76 +3414,77 @@ static struct opcode group1[] = { }; static struct opcode group1A[] = { - I(DstMem | SrcNone | ModRM | Mov | Stack, em_pop), N, N, N, N, N, N, N, + I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, }; static struct opcode group3[] = { - I(DstMem | SrcImm | ModRM, em_test), - I(DstMem | SrcImm | ModRM, em_test), - I(DstMem | SrcNone | ModRM | Lock, em_not), - I(DstMem | SrcNone | ModRM | Lock, em_neg), - I(SrcMem | ModRM, em_mul_ex), - I(SrcMem | ModRM, em_imul_ex), - I(SrcMem | ModRM, em_div_ex), - I(SrcMem | ModRM, em_idiv_ex), + I(DstMem | SrcImm, em_test), + I(DstMem | SrcImm, em_test), + I(DstMem | SrcNone | Lock, em_not), + I(DstMem | SrcNone | Lock, em_neg), + I(SrcMem, em_mul_ex), + I(SrcMem, em_imul_ex), + I(SrcMem, em_div_ex), + I(SrcMem, em_idiv_ex), }; static struct opcode group4[] = { - I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45), - I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45), + I(ByteOp | DstMem | SrcNone | Lock, em_grp45), + I(ByteOp | DstMem | SrcNone | Lock, em_grp45), N, N, N, N, N, N, }; static struct opcode group5[] = { - I(DstMem | SrcNone | ModRM | Lock, em_grp45), - I(DstMem | SrcNone | ModRM | Lock, em_grp45), - I(SrcMem | ModRM | Stack, em_grp45), - I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far), - I(SrcMem | ModRM | Stack, em_grp45), - I(SrcMemFAddr | ModRM | ImplicitOps, em_grp45), - I(SrcMem | ModRM | Stack, em_grp45), N, 
+ I(DstMem | SrcNone | Lock, em_grp45), + I(DstMem | SrcNone | Lock, em_grp45), + I(SrcMem | Stack, em_grp45), + I(SrcMemFAddr | ImplicitOps | Stack, em_call_far), + I(SrcMem | Stack, em_grp45), + I(SrcMemFAddr | ImplicitOps, em_grp45), + I(SrcMem | Stack, em_grp45), N, }; static struct opcode group6[] = { - DI(ModRM | Prot, sldt), - DI(ModRM | Prot, str), - DI(ModRM | Prot | Priv, lldt), - DI(ModRM | Prot | Priv, ltr), + DI(Prot, sldt), + DI(Prot, str), + DI(Prot | Priv, lldt), + DI(Prot | Priv, ltr), N, N, N, N, }; static struct group_dual group7 = { { - DI(ModRM | Mov | DstMem | Priv, sgdt), - DI(ModRM | Mov | DstMem | Priv, sidt), - II(ModRM | SrcMem | Priv, em_lgdt, lgdt), - II(ModRM | SrcMem | Priv, em_lidt, lidt), - II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, - II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), - II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg), + DI(Mov | DstMem | Priv, sgdt), + DI(Mov | DstMem | Priv, sidt), + II(SrcMem | Priv, em_lgdt, lgdt), + II(SrcMem | Priv, em_lidt, lidt), + II(SrcNone | DstMem | Mov, em_smsw, smsw), N, + II(SrcMem16 | Mov | Priv, em_lmsw, lmsw), + II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg), }, { - I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall), + I(SrcNone | Priv | VendorSpecific, em_vmcall), EXT(0, group7_rm1), N, EXT(0, group7_rm3), - II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, - II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7), + II(SrcNone | DstMem | Mov, em_smsw, smsw), N, + II(SrcMem16 | Mov | Priv, em_lmsw, lmsw), + EXT(0, group7_rm7), } }; static struct opcode group8[] = { N, N, N, N, - I(DstMem | SrcImmByte | ModRM, em_bt), - I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_bts), - I(DstMem | SrcImmByte | ModRM | Lock, em_btr), - I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_btc), + I(DstMem | SrcImmByte, em_bt), + I(DstMem | SrcImmByte | Lock | PageTable, em_bts), + I(DstMem | SrcImmByte | Lock, em_btr), + I(DstMem | 
SrcImmByte | Lock | PageTable, em_btc), }; static struct group_dual group9 = { { - N, I(DstMem64 | ModRM | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, + N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, }, { N, N, N, N, N, N, N, N, } }; static struct opcode group11[] = { - I(DstMem | SrcImm | ModRM | Mov | PageTable, em_mov), + I(DstMem | SrcImm | Mov | PageTable, em_mov), X7(D(Undefined)), }; @@ -3541,10 +3542,10 @@ static struct opcode opcode_table[256] = { /* 0x70 - 0x7F */ X16(D(SrcImmByte)), /* 0x80 - 0x87 */ - G(ByteOp | DstMem | SrcImm | ModRM | Group, group1), - G(DstMem | SrcImm | ModRM | Group, group1), - G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), - G(DstMem | SrcImmByte | ModRM | Group, group1), + G(ByteOp | DstMem | SrcImm, group1), + G(DstMem | SrcImm, group1), + G(ByteOp | DstMem | SrcImm | No64, group1), + G(DstMem | SrcImmByte, group1), I2bv(DstMem | SrcReg | ModRM, em_test), I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg), /* 0x88 - 0x8F */ -- cgit v1.2.3 From 9f4260e73ac43aaa91eb5de95950e1de7002f467 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 30 Apr 2012 17:48:25 +0900 Subject: KVM: x86 emulator: Avoid pushing back ModRM byte fetched for group decoding Although ModRM byte is fetched for group decoding, it is soon pushed back to make decode_modrm() fetch it later again. Now that ModRM flag can be found in the top level opcode tables, fetch ModRM byte before group decoding to make the code simpler. 
Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8d2c3d04cfec..7fd25763b0e0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -972,7 +972,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */ } - ctxt->modrm = insn_fetch(u8, ctxt); ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6; ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; ctxt->modrm_rm |= (ctxt->modrm & 0x07); @@ -3976,17 +3975,16 @@ done_prefixes: } ctxt->d = opcode.flags; + if (ctxt->d & ModRM) + ctxt->modrm = insn_fetch(u8, ctxt); + while (ctxt->d & GroupMask) { switch (ctxt->d & GroupMask) { case Group: - ctxt->modrm = insn_fetch(u8, ctxt); - --ctxt->_eip; goffset = (ctxt->modrm >> 3) & 7; opcode = opcode.u.group[goffset]; break; case GroupDual: - ctxt->modrm = insn_fetch(u8, ctxt); - --ctxt->_eip; goffset = (ctxt->modrm >> 3) & 7; if ((ctxt->modrm >> 6) == 3) opcode = opcode.u.gdual->mod3[goffset]; -- cgit v1.2.3 From 8529f613b6945f4b5bd8c1b69e42aa1cc51b2eb6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 6 May 2012 18:02:40 -0700 Subject: vfs: don't force a big memset of stat data just to clear padding fields Admittedly this is something that the compiler should be able to just do for us, but gcc just isn't that smart. And trying to use a structure initializer (which would get us the right semantics) ends up resulting in gcc allocating stack space for _two_ 'struct stat', and then copying one into the other. So do it by hand - just have a per-architecture macro that initializes the padding fields. And if the architecture doesn't provide one, fall back to the old behavior of just doing the whole memset() first. 
Signed-off-by: Linus Torvalds --- arch/x86/include/asm/stat.h | 21 +++++++++++++++++++++ fs/stat.c | 12 ++++++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/stat.h b/arch/x86/include/asm/stat.h index e0b1d9bbcbc6..7b3ddc348585 100644 --- a/arch/x86/include/asm/stat.h +++ b/arch/x86/include/asm/stat.h @@ -25,6 +25,12 @@ struct stat { unsigned long __unused5; }; +/* We don't need to memset the whole thing just to initialize the padding */ +#define INIT_STRUCT_STAT_PADDING(st) do { \ + st.__unused4 = 0; \ + st.__unused5 = 0; \ +} while (0) + #define STAT64_HAS_BROKEN_ST_INO 1 /* This matches struct stat64 in glibc2.1, hence the absolutely @@ -63,6 +69,12 @@ struct stat64 { unsigned long long st_ino; }; +/* We don't need to memset the whole thing just to initialize the padding */ +#define INIT_STRUCT_STAT64_PADDING(st) do { \ + memset(&st.__pad0, 0, sizeof(st.__pad0)); \ + memset(&st.__pad3, 0, sizeof(st.__pad3)); \ +} while (0) + #else /* __i386__ */ struct stat { @@ -87,6 +99,15 @@ struct stat { unsigned long st_ctime_nsec; long __unused[3]; }; + +/* We don't need to memset the whole thing just to initialize the padding */ +#define INIT_STRUCT_STAT_PADDING(st) do { \ + st.__pad0 = 0; \ + st.__unused[0] = 0; \ + st.__unused[1] = 0; \ + st.__unused[2] = 0; \ +} while (0) + #endif /* for 32bit emulation and 32 bit kernels */ diff --git a/fs/stat.c b/fs/stat.c index 2b5d55eb9d9a..b30ac60291e2 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -199,6 +199,10 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat #define valid_dev(x) choose_32_64(old_valid_dev,new_valid_dev)(x) #define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x) +#ifndef INIT_STRUCT_STAT_PADDING +# define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st)) +#endif + static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) { struct stat tmp; @@ -210,7 +214,7 @@ static int 
cp_new_stat(struct kstat *stat, struct stat __user *statbuf) return -EOVERFLOW; #endif - memset(&tmp, 0, sizeof(tmp)); + INIT_STRUCT_STAT_PADDING(tmp); tmp.st_dev = encode_dev(stat->dev); tmp.st_ino = stat->ino; if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) @@ -323,11 +327,15 @@ SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf, /* ---------- LFS-64 ----------- */ #ifdef __ARCH_WANT_STAT64 +#ifndef INIT_STRUCT_STAT64_PADDING +# define INIT_STRUCT_STAT64_PADDING(st) memset(&st, 0, sizeof(st)) +#endif + static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf) { struct stat64 tmp; - memset(&tmp, 0, sizeof(struct stat64)); + INIT_STRUCT_STAT64_PADDING(tmp); #ifdef CONFIG_MIPS /* mips has weird padding, so we don't get 64 bits there */ if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev)) -- cgit v1.2.3 From 6ff968cca1dfebd4b6fcade87c11658dbfc96932 Mon Sep 17 00:00:00 2001 From: Betty Dall Date: Fri, 27 Apr 2012 14:40:55 -0600 Subject: x86/nmi: Fix the type of the nmiaction.flags field This patch changes the type of the struct nmiaction flags field to unsigned long from unsigned int. All the usages of the flags field are unsigned long already. There is only one flag used currently, NMI_FLAG_FIRST, but having the wrong size could cause a truncation bug in the future on 64 bit architectures. 
Signed-off-by: Betty Dall Acked-by: Don Zickus Link: http://lkml.kernel.org/r/1335559255-13454-1-git-send-email-betty.dall@hp.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/nmi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index a1a836c8131c..0e3793b821ef 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -40,7 +40,7 @@ typedef int (*nmi_handler_t)(unsigned int, struct pt_regs *); struct nmiaction { struct list_head list; nmi_handler_t handler; - unsigned int flags; + unsigned long flags; const char *name; }; -- cgit v1.2.3 From 736baef4472d00574089f295bc759ac002b9558c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 30 Mar 2012 11:47:00 -0700 Subject: iommu/vt-d: Make intr-remapping initialization generic This patch introduces irq_remap_ops to hold implementation specific function pointer to handle interrupt remapping. As the first part the initialization functions for VT-d are converted to these ops. 
Signed-off-by: Joerg Roedel Acked-by: Yinghai Lu Cc: David Woodhouse Cc: Alex Williamson Signed-off-by: Suresh Siddha Signed-off-by: Joerg Roedel --- arch/ia64/include/asm/intr_remapping.h | 4 ++ arch/x86/include/asm/intr_remapping.h | 45 ++++++++++++++++++++ arch/x86/kernel/apic/apic.c | 14 ++++--- arch/x86/kernel/apic/io_apic.c | 1 + drivers/iommu/Makefile | 2 +- drivers/iommu/dmar.c | 1 + drivers/iommu/intel-iommu.c | 1 + drivers/iommu/intel_intr_remapping.c | 52 ++++++----------------- drivers/iommu/intr_remapping.c | 76 ++++++++++++++++++++++++++++++++++ drivers/iommu/intr_remapping.h | 46 ++++++++++++++++++++ include/linux/dmar.h | 3 -- 11 files changed, 196 insertions(+), 49 deletions(-) create mode 100644 arch/ia64/include/asm/intr_remapping.h create mode 100644 arch/x86/include/asm/intr_remapping.h create mode 100644 drivers/iommu/intr_remapping.c create mode 100644 drivers/iommu/intr_remapping.h (limited to 'arch/x86') diff --git a/arch/ia64/include/asm/intr_remapping.h b/arch/ia64/include/asm/intr_remapping.h new file mode 100644 index 000000000000..095aa0d46c58 --- /dev/null +++ b/arch/ia64/include/asm/intr_remapping.h @@ -0,0 +1,4 @@ +#ifndef __IA64_INTR_REMAPPING_H +#define __IA64_INTR_REMAPPING_H +#define intr_remapping_enabled 0 +#endif diff --git a/arch/x86/include/asm/intr_remapping.h b/arch/x86/include/asm/intr_remapping.h new file mode 100644 index 000000000000..207c605dbdf5 --- /dev/null +++ b/arch/x86/include/asm/intr_remapping.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2012 Advanced Micro Devices, Inc. + * Author: Joerg Roedel + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * This header file contains the interface of the interrupt remapping code to + * the x86 interrupt management code. + */ + +#ifndef __X86_INTR_REMAPPING_H +#define __X86_INTR_REMAPPING_H + +#ifdef CONFIG_IRQ_REMAP + +extern int intr_remapping_enabled; + +extern void setup_intr_remapping(void); +extern int intr_remapping_supported(void); +extern int intr_hardware_init(void); +extern int intr_hardware_enable(void); + +#else /* CONFIG_IRQ_REMAP */ + +#define intr_remapping_enabled 0 + +static inline void setup_intr_remapping(void) { } +static inline int intr_remapping_supported(void) { return 0; } +static inline int intr_hardware_init(void) { return -ENODEV; } +static inline int intr_hardware_enable(void) { return -ENODEV; } + +#endif /* CONFIG_IRQ_REMAP */ + +#endif /* __X86_INTR_REMAPPING_H */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index edc24480469f..1db6f63a22ff 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -35,6 +35,7 @@ #include #include +#include #include #include #include @@ -1528,7 +1529,7 @@ int __init enable_IR(void) return -1; } - return enable_intr_remapping(); + return intr_hardware_enable(); #endif return -1; } @@ -1537,10 +1538,13 @@ void __init enable_IR_x2apic(void) { unsigned long flags; int ret, x2apic_enabled = 0; - int dmar_table_init_ret; + int hardware_init_ret; - dmar_table_init_ret = dmar_table_init(); - if (dmar_table_init_ret && !x2apic_supported()) + /* Make sure irq_remap_ops are initialized */ + setup_intr_remapping(); + + hardware_init_ret = intr_hardware_init(); + if (hardware_init_ret && !x2apic_supported()) return; ret = save_ioapic_entries(); @@ -1556,7 +1560,7 @@ void __init enable_IR_x2apic(void) 
if (x2apic_preenabled && nox2apic) disable_x2apic(); - if (dmar_table_init_ret) + if (hardware_init_ret) ret = -1; else ret = enable_IR(); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e88300d8e80a..1151fdccaad6 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 1533ebf1d68e..823e1cf8708f 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o obj-$(CONFIG_DMAR_TABLE) += dmar.o obj-$(CONFIG_INTEL_IOMMU) += iova.o intel-iommu.o -obj-$(CONFIG_IRQ_REMAP) += intel_intr_remapping.o +obj-$(CONFIG_IRQ_REMAP) += intel_intr_remapping.o intr_remapping.o obj-$(CONFIG_OMAP_IOMMU) += omap-iommu.o obj-$(CONFIG_OMAP_IOVMM) += omap-iovmm.o obj-$(CONFIG_OMAP_IOMMU_DEBUG) += omap-iommu-debug.o diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 35c1e17fce1d..647e366403dc 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #define PREFIX "DMAR: " diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index f93d5ac8f81c..e1439808192c 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include diff --git a/drivers/iommu/intel_intr_remapping.c b/drivers/iommu/intel_intr_remapping.c index 212fff0c24b5..9c742fb111b6 100644 --- a/drivers/iommu/intel_intr_remapping.c +++ b/drivers/iommu/intel_intr_remapping.c @@ -11,8 +11,11 @@ #include #include #include +#include #include +#include "intr_remapping.h" + struct ioapic_scope { struct intel_iommu *iommu; unsigned int id; @@ -32,42 +35,6 @@ struct hpet_scope { static struct ioapic_scope ir_ioapic[MAX_IO_APICS]; static struct 
hpet_scope ir_hpet[MAX_HPET_TBS]; static int ir_ioapic_num, ir_hpet_num; -int intr_remapping_enabled; - -static int disable_intremap; -static int disable_sourceid_checking; -static int no_x2apic_optout; - -static __init int setup_nointremap(char *str) -{ - disable_intremap = 1; - return 0; -} -early_param("nointremap", setup_nointremap); - -static __init int setup_intremap(char *str) -{ - if (!str) - return -EINVAL; - - while (*str) { - if (!strncmp(str, "on", 2)) - disable_intremap = 0; - else if (!strncmp(str, "off", 3)) - disable_intremap = 1; - else if (!strncmp(str, "nosid", 5)) - disable_sourceid_checking = 1; - else if (!strncmp(str, "no_x2apic_optout", 16)) - no_x2apic_optout = 1; - - str += strcspn(str, ","); - while (*str == ',') - str++; - } - - return 0; -} -early_param("intremap", setup_intremap); static DEFINE_RAW_SPINLOCK(irq_2_ir_lock); @@ -465,7 +432,7 @@ static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode) } -static int setup_intr_remapping(struct intel_iommu *iommu, int mode) +static int intel_setup_intr_remapping(struct intel_iommu *iommu, int mode) { struct ir_table *ir_table; struct page *pages; @@ -534,7 +501,7 @@ static int __init dmar_x2apic_optout(void) return dmar->flags & DMAR_X2APIC_OPT_OUT; } -int __init intr_remapping_supported(void) +static int __init intel_intr_remapping_supported(void) { struct dmar_drhd_unit *drhd; @@ -554,7 +521,7 @@ int __init intr_remapping_supported(void) return 1; } -int __init enable_intr_remapping(void) +static int __init intel_enable_intr_remapping(void) { struct dmar_drhd_unit *drhd; int setup = 0; @@ -638,7 +605,7 @@ int __init enable_intr_remapping(void) if (!ecap_ir_support(iommu->ecap)) continue; - if (setup_intr_remapping(iommu, eim)) + if (intel_setup_intr_remapping(iommu, eim)) goto error; setup = 1; @@ -847,3 +814,8 @@ error: return -1; } +struct irq_remap_ops intel_irq_remap_ops = { + .supported = intel_intr_remapping_supported, + .hardware_init = dmar_table_init, + 
.hardware_enable = intel_enable_intr_remapping, +}; diff --git a/drivers/iommu/intr_remapping.c b/drivers/iommu/intr_remapping.c new file mode 100644 index 000000000000..670c69a80afd --- /dev/null +++ b/drivers/iommu/intr_remapping.c @@ -0,0 +1,76 @@ +#include +#include +#include + +#include "intr_remapping.h" + +int intr_remapping_enabled; + +int disable_intremap; +int disable_sourceid_checking; +int no_x2apic_optout; + +static struct irq_remap_ops *remap_ops; + +static __init int setup_nointremap(char *str) +{ + disable_intremap = 1; + return 0; +} +early_param("nointremap", setup_nointremap); + +static __init int setup_intremap(char *str) +{ + if (!str) + return -EINVAL; + + while (*str) { + if (!strncmp(str, "on", 2)) + disable_intremap = 0; + else if (!strncmp(str, "off", 3)) + disable_intremap = 1; + else if (!strncmp(str, "nosid", 5)) + disable_sourceid_checking = 1; + else if (!strncmp(str, "no_x2apic_optout", 16)) + no_x2apic_optout = 1; + + str += strcspn(str, ","); + while (*str == ',') + str++; + } + + return 0; +} +early_param("intremap", setup_intremap); + +void __init setup_intr_remapping(void) +{ + remap_ops = &intel_irq_remap_ops; +} + +int intr_remapping_supported(void) +{ + if (disable_intremap) + return 0; + + if (!remap_ops || !remap_ops->supported) + return 0; + + return remap_ops->supported(); +} + +int __init intr_hardware_init(void) +{ + if (!remap_ops || !remap_ops->hardware_init) + return -ENODEV; + + return remap_ops->hardware_init(); +} + +int __init intr_hardware_enable(void) +{ + if (!remap_ops || !remap_ops->hardware_enable) + return -ENODEV; + + return remap_ops->hardware_enable(); +} diff --git a/drivers/iommu/intr_remapping.h b/drivers/iommu/intr_remapping.h new file mode 100644 index 000000000000..d6df732e001f --- /dev/null +++ b/drivers/iommu/intr_remapping.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2012 Advanced Micro Devices, Inc. 
+ * Author: Joerg Roedel + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * This header file contains stuff that is shared between different interrupt + * remapping drivers but with no need to be visible outside of the IOMMU layer. + */ + +#ifndef __INTR_REMAPPING_H +#define __INTR_REMAPPING_H + +#ifdef CONFIG_IRQ_REMAP + +extern int disable_intremap; +extern int disable_sourceid_checking; +extern int no_x2apic_optout; + +struct irq_remap_ops { + /* Check whether Interrupt Remapping is supported */ + int (*supported)(void); + + /* Initializes hardware and makes it ready for remapping interrupts */ + int (*hardware_init)(void); + + /* Enables the remapping hardware */ + int (*hardware_enable)(void); +}; + +extern struct irq_remap_ops intel_irq_remap_ops; + +#endif /* CONFIG_IRQ_REMAP */ + +#endif /* __INTR_REMAPPING_H */ diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 731a60975101..6d66c9c76e0a 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -115,9 +115,6 @@ struct irte { }; #ifdef CONFIG_IRQ_REMAP -extern int intr_remapping_enabled; -extern int intr_remapping_supported(void); -extern int enable_intr_remapping(void); extern void disable_intr_remapping(void); extern int reenable_intr_remapping(int); -- cgit v1.2.3 From 4f3d8b67ad3090f9fb72f8235d21cde53cd24b79 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 30 Mar 2012 
11:47:01 -0700 Subject: iommu/vt-d: Convert missing apic.c intr-remapping call to remap_ops Convert these calls too: * Disable of remapping hardware * Reenable of remapping hardware * Enable fault handling With that all of arch/x86/kernel/apic/apic.c is converted to use the generic intr-remapping interface. Signed-off-by: Joerg Roedel Acked-by: Yinghai Lu Cc: David Woodhouse Cc: Alex Williamson Signed-off-by: Suresh Siddha Signed-off-by: Joerg Roedel --- arch/x86/include/asm/intr_remapping.h | 6 ++++++ arch/x86/kernel/apic/apic.c | 6 +++--- drivers/iommu/intel_intr_remapping.c | 7 +++++-- drivers/iommu/intr_remapping.c | 24 ++++++++++++++++++++++++ drivers/iommu/intr_remapping.h | 9 +++++++++ include/linux/dmar.h | 18 ------------------ 6 files changed, 47 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intr_remapping.h b/arch/x86/include/asm/intr_remapping.h index 207c605dbdf5..55aa892a53e3 100644 --- a/arch/x86/include/asm/intr_remapping.h +++ b/arch/x86/include/asm/intr_remapping.h @@ -30,6 +30,9 @@ extern void setup_intr_remapping(void); extern int intr_remapping_supported(void); extern int intr_hardware_init(void); extern int intr_hardware_enable(void); +extern void intr_hardware_disable(void); +extern int intr_hardware_reenable(int); +extern int intr_enable_fault_handling(void); #else /* CONFIG_IRQ_REMAP */ @@ -39,6 +42,9 @@ static inline void setup_intr_remapping(void) { } static inline int intr_remapping_supported(void) { return 0; } static inline int intr_hardware_init(void) { return -ENODEV; } static inline int intr_hardware_enable(void) { return -ENODEV; } +static inline void intr_hardware_disable(void) { } +static inline int intr_hardware_reenable(int eim) { return -ENODEV; } +static inline int intr_enable_fault_handling(void) { return -ENODEV; } #endif /* CONFIG_IRQ_REMAP */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 1db6f63a22ff..a2762687e2ee 100644 --- 
a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1443,7 +1443,7 @@ void __init bsp_end_local_APIC_setup(void) * handling for interrupt remapping. */ if (intr_remapping_enabled) - enable_drhd_fault_handling(); + intr_enable_fault_handling(); } @@ -2181,7 +2181,7 @@ static int lapic_suspend(void) disable_local_APIC(); if (intr_remapping_enabled) - disable_intr_remapping(); + intr_hardware_disable(); local_irq_restore(flags); return 0; @@ -2250,7 +2250,7 @@ static void lapic_resume(void) apic_read(APIC_ESR); if (intr_remapping_enabled) - reenable_intr_remapping(x2apic_mode); + intr_hardware_reenable(x2apic_mode); local_irq_restore(flags); } diff --git a/drivers/iommu/intel_intr_remapping.c b/drivers/iommu/intel_intr_remapping.c index 9c742fb111b6..610b75b66c07 100644 --- a/drivers/iommu/intel_intr_remapping.c +++ b/drivers/iommu/intel_intr_remapping.c @@ -764,7 +764,7 @@ int __init ir_dev_scope_init(void) } rootfs_initcall(ir_dev_scope_init); -void disable_intr_remapping(void) +static void disable_intr_remapping(void) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu = NULL; @@ -780,7 +780,7 @@ void disable_intr_remapping(void) } } -int reenable_intr_remapping(int eim) +static int reenable_intr_remapping(int eim) { struct dmar_drhd_unit *drhd; int setup = 0; @@ -818,4 +818,7 @@ struct irq_remap_ops intel_irq_remap_ops = { .supported = intel_intr_remapping_supported, .hardware_init = dmar_table_init, .hardware_enable = intel_enable_intr_remapping, + .hardware_disable = disable_intr_remapping, + .hardware_reenable = reenable_intr_remapping, + .enable_faulting = enable_drhd_fault_handling, }; diff --git a/drivers/iommu/intr_remapping.c b/drivers/iommu/intr_remapping.c index 670c69a80afd..9aabed7c0320 100644 --- a/drivers/iommu/intr_remapping.c +++ b/drivers/iommu/intr_remapping.c @@ -74,3 +74,27 @@ int __init intr_hardware_enable(void) return remap_ops->hardware_enable(); } + +void intr_hardware_disable(void) +{ + if (!remap_ops || 
!remap_ops->hardware_disable) + return; + + remap_ops->hardware_disable(); +} + +int intr_hardware_reenable(int mode) +{ + if (!remap_ops || !remap_ops->hardware_reenable) + return 0; + + return remap_ops->hardware_reenable(mode); +} + +int __init intr_enable_fault_handling(void) +{ + if (!remap_ops || !remap_ops->enable_faulting) + return -ENODEV; + + return remap_ops->enable_faulting(); +} diff --git a/drivers/iommu/intr_remapping.h b/drivers/iommu/intr_remapping.h index d6df732e001f..2744c9ae4aec 100644 --- a/drivers/iommu/intr_remapping.h +++ b/drivers/iommu/intr_remapping.h @@ -37,6 +37,15 @@ struct irq_remap_ops { /* Enables the remapping hardware */ int (*hardware_enable)(void); + + /* Disables the remapping hardware */ + void (*hardware_disable)(void); + + /* Reenables the remapping hardware */ + int (*hardware_reenable)(int); + + /* Enable fault handling */ + int (*enable_faulting)(void); }; extern struct irq_remap_ops intel_irq_remap_ops; diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 6d66c9c76e0a..f2bd87f52a8d 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -115,9 +115,6 @@ struct irte { }; #ifdef CONFIG_IRQ_REMAP -extern void disable_intr_remapping(void); -extern int reenable_intr_remapping(int); - extern int get_irte(int irq, struct irte *entry); extern int modify_irte(int irq, struct irte *irte_modified); extern int alloc_irte(struct intel_iommu *iommu, int irq, u16 count); @@ -179,21 +176,6 @@ static inline int set_msi_sid(struct irte *irte, struct pci_dev *dev) return 0; } -#define intr_remapping_enabled (0) - -static inline int enable_intr_remapping(void) -{ - return -1; -} - -static inline void disable_intr_remapping(void) -{ -} - -static inline int reenable_intr_remapping(int eim) -{ - return 0; -} #endif enum { -- cgit v1.2.3 From 0c3f173a88c4ae3e4253427cf574a59ad5352918 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 30 Mar 2012 11:47:02 -0700 Subject: iommu/vt-d: Convert IR ioapic-setup to use remap_ops 
The IOAPIC setup routine for interrupt remapping is VT-d specific. Move it to the irq_remap_ops and add a call helper function. Signed-off-by: Joerg Roedel Acked-by: Yinghai Lu Cc: David Woodhouse Cc: Alex Williamson Signed-off-by: Suresh Siddha Signed-off-by: Joerg Roedel --- arch/x86/include/asm/intr_remapping.h | 15 +++++- arch/x86/kernel/apic/io_apic.c | 68 +------------------------- drivers/iommu/intel_intr_remapping.c | 89 +++++++++++++++++++++++++++++++++++ drivers/iommu/intr_remapping.c | 12 +++++ drivers/iommu/intr_remapping.h | 8 ++++ 5 files changed, 125 insertions(+), 67 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intr_remapping.h b/arch/x86/include/asm/intr_remapping.h index 55aa892a53e3..a22e1f1ac7ec 100644 --- a/arch/x86/include/asm/intr_remapping.h +++ b/arch/x86/include/asm/intr_remapping.h @@ -24,6 +24,9 @@ #ifdef CONFIG_IRQ_REMAP +struct IO_APIC_route_entry; +struct io_apic_irq_attr; + extern int intr_remapping_enabled; extern void setup_intr_remapping(void); @@ -33,6 +36,10 @@ extern int intr_hardware_enable(void); extern void intr_hardware_disable(void); extern int intr_hardware_reenable(int); extern int intr_enable_fault_handling(void); +extern int intr_setup_ioapic_entry(int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, int vector, + struct io_apic_irq_attr *attr); #else /* CONFIG_IRQ_REMAP */ @@ -45,7 +52,13 @@ static inline int intr_hardware_enable(void) { return -ENODEV; } static inline void intr_hardware_disable(void) { } static inline int intr_hardware_reenable(int eim) { return -ENODEV; } static inline int intr_enable_fault_handling(void) { return -ENODEV; } - +static inline int intr_setup_ioapic_entry(int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, int vector, + struct io_apic_irq_attr *attr) +{ + return -ENODEV; +} #endif /* CONFIG_IRQ_REMAP */ #endif /* __X86_INTR_REMAPPING_H */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c 
index 1151fdccaad6..e1ab625fb9ca 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1362,77 +1362,13 @@ static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg, fasteoi ? "fasteoi" : "edge"); } - -static int setup_ir_ioapic_entry(int irq, - struct IR_IO_APIC_route_entry *entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr) -{ - int index; - struct irte irte; - int ioapic_id = mpc_ioapic_id(attr->ioapic); - struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id); - - if (!iommu) { - pr_warn("No mapping iommu for ioapic %d\n", ioapic_id); - return -ENODEV; - } - - index = alloc_irte(iommu, irq, 1); - if (index < 0) { - pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id); - return -ENOMEM; - } - - prepare_irte(&irte, vector, destination); - - /* Set source-id of interrupt request */ - set_ioapic_sid(&irte, ioapic_id); - - modify_irte(irq, &irte); - - apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: " - "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d " - "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X " - "Avail:%X Vector:%02X Dest:%08X " - "SID:%04X SQ:%X SVT:%X)\n", - attr->ioapic, irte.present, irte.fpd, irte.dst_mode, - irte.redir_hint, irte.trigger_mode, irte.dlvry_mode, - irte.avail, irte.vector, irte.dest_id, - irte.sid, irte.sq, irte.svt); - - memset(entry, 0, sizeof(*entry)); - - entry->index2 = (index >> 15) & 0x1; - entry->zero = 0; - entry->format = 1; - entry->index = (index & 0x7fff); - /* - * IO-APIC RTE will be configured with virtual vector. - * irq handler will do the explicit EOI to the io-apic. - */ - entry->vector = attr->ioapic_pin; - entry->mask = 0; /* enable IRQ */ - entry->trigger = attr->trigger; - entry->polarity = attr->polarity; - - /* Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. 
- */ - if (attr->trigger) - entry->mask = 1; - - return 0; -} - static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, unsigned int destination, int vector, struct io_apic_irq_attr *attr) { if (intr_remapping_enabled) - return setup_ir_ioapic_entry(irq, - (struct IR_IO_APIC_route_entry *)entry, - destination, vector, attr); + return intr_setup_ioapic_entry(irq, entry, destination, + vector, attr); memset(entry, 0, sizeof(*entry)); diff --git a/drivers/iommu/intel_intr_remapping.c b/drivers/iommu/intel_intr_remapping.c index 610b75b66c07..f495eba4b6ab 100644 --- a/drivers/iommu/intel_intr_remapping.c +++ b/drivers/iommu/intel_intr_remapping.c @@ -31,6 +31,7 @@ struct hpet_scope { }; #define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0) +#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8) static struct ioapic_scope ir_ioapic[MAX_IO_APICS]; static struct hpet_scope ir_hpet[MAX_HPET_TBS]; @@ -814,6 +815,93 @@ error: return -1; } +static void prepare_irte(struct irte *irte, int vector, + unsigned int dest) +{ + memset(irte, 0, sizeof(*irte)); + + irte->present = 1; + irte->dst_mode = apic->irq_dest_mode; + /* + * Trigger mode in the IRTE will always be edge, and for IO-APIC, the + * actual level or edge trigger will be setup in the IO-APIC + * RTE. This will help simplify level triggered irq migration. + * For more details, see the comments (in io_apic.c) explainig IO-APIC + * irq migration in the presence of interrupt-remapping. 
+ */ + irte->trigger_mode = 0; + irte->dlvry_mode = apic->irq_delivery_mode; + irte->vector = vector; + irte->dest_id = IRTE_DEST(dest); + irte->redir_hint = 1; +} + +static int intel_setup_ioapic_entry(int irq, + struct IO_APIC_route_entry *route_entry, + unsigned int destination, int vector, + struct io_apic_irq_attr *attr) +{ + int ioapic_id = mpc_ioapic_id(attr->ioapic); + struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id); + struct IR_IO_APIC_route_entry *entry; + struct irte irte; + int index; + + if (!iommu) { + pr_warn("No mapping iommu for ioapic %d\n", ioapic_id); + return -ENODEV; + } + + entry = (struct IR_IO_APIC_route_entry *)route_entry; + + index = alloc_irte(iommu, irq, 1); + if (index < 0) { + pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id); + return -ENOMEM; + } + + prepare_irte(&irte, vector, destination); + + /* Set source-id of interrupt request */ + set_ioapic_sid(&irte, ioapic_id); + + modify_irte(irq, &irte); + + apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: " + "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d " + "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X " + "Avail:%X Vector:%02X Dest:%08X " + "SID:%04X SQ:%X SVT:%X)\n", + attr->ioapic, irte.present, irte.fpd, irte.dst_mode, + irte.redir_hint, irte.trigger_mode, irte.dlvry_mode, + irte.avail, irte.vector, irte.dest_id, + irte.sid, irte.sq, irte.svt); + + memset(entry, 0, sizeof(*entry)); + + entry->index2 = (index >> 15) & 0x1; + entry->zero = 0; + entry->format = 1; + entry->index = (index & 0x7fff); + /* + * IO-APIC RTE will be configured with virtual vector. + * irq handler will do the explicit EOI to the io-apic. + */ + entry->vector = attr->ioapic_pin; + entry->mask = 0; /* enable IRQ */ + entry->trigger = attr->trigger; + entry->polarity = attr->polarity; + + /* Mask level triggered irqs. + * Use IRQ_DELAYED_DISABLE for edge triggered irqs. 
+ */ + if (attr->trigger) + entry->mask = 1; + + return 0; +} + + struct irq_remap_ops intel_irq_remap_ops = { .supported = intel_intr_remapping_supported, .hardware_init = dmar_table_init, @@ -821,4 +909,5 @@ struct irq_remap_ops intel_irq_remap_ops = { .hardware_disable = disable_intr_remapping, .hardware_reenable = reenable_intr_remapping, .enable_faulting = enable_drhd_fault_handling, + .setup_ioapic_entry = intel_setup_ioapic_entry, }; diff --git a/drivers/iommu/intr_remapping.c b/drivers/iommu/intr_remapping.c index 9aabed7c0320..739148ab2538 100644 --- a/drivers/iommu/intr_remapping.c +++ b/drivers/iommu/intr_remapping.c @@ -98,3 +98,15 @@ int __init intr_enable_fault_handling(void) return remap_ops->enable_faulting(); } + +int intr_setup_ioapic_entry(int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, int vector, + struct io_apic_irq_attr *attr) +{ + if (!remap_ops || !remap_ops->setup_ioapic_entry) + return -ENODEV; + + return remap_ops->setup_ioapic_entry(irq, entry, destination, + vector, attr); +} diff --git a/drivers/iommu/intr_remapping.h b/drivers/iommu/intr_remapping.h index 2744c9ae4aec..e8994f2b3bbe 100644 --- a/drivers/iommu/intr_remapping.h +++ b/drivers/iommu/intr_remapping.h @@ -24,6 +24,9 @@ #ifdef CONFIG_IRQ_REMAP +struct IO_APIC_route_entry; +struct io_apic_irq_attr; + extern int disable_intremap; extern int disable_sourceid_checking; extern int no_x2apic_optout; @@ -46,6 +49,11 @@ struct irq_remap_ops { /* Enable fault handling */ int (*enable_faulting)(void); + + /* IO-APIC setup routine */ + int (*setup_ioapic_entry)(int irq, struct IO_APIC_route_entry *, + unsigned int, int, + struct io_apic_irq_attr *); }; extern struct irq_remap_ops intel_irq_remap_ops; -- cgit v1.2.3 From 4c1bad6a0af1e297c8d05365e65af89d8c7bf9d1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 30 Mar 2012 11:47:03 -0700 Subject: iommu/vt-d: Convert IR set_affinity function to remap_ops The function to set interrupt affinity with interrupt 
remapping enabled is Intel specific too. So move it to the irq_remap_ops too. Signed-off-by: Joerg Roedel Acked-by: Yinghai Lu Cc: David Woodhouse Cc: Alex Williamson Signed-off-by: Suresh Siddha Signed-off-by: Joerg Roedel --- arch/x86/include/asm/intr_remapping.h | 9 +++++ arch/x86/kernel/apic/io_apic.c | 69 +---------------------------------- drivers/iommu/intel_intr_remapping.c | 54 +++++++++++++++++++++++++++ drivers/iommu/intr_remapping.c | 9 +++++ drivers/iommu/intr_remapping.h | 6 +++ 5 files changed, 80 insertions(+), 67 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intr_remapping.h b/arch/x86/include/asm/intr_remapping.h index a22e1f1ac7ec..ae933ecfd8f0 100644 --- a/arch/x86/include/asm/intr_remapping.h +++ b/arch/x86/include/asm/intr_remapping.h @@ -40,6 +40,9 @@ extern int intr_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, unsigned int destination, int vector, struct io_apic_irq_attr *attr); +extern int intr_set_affinity(struct irq_data *data, + const struct cpumask *mask, + bool force); #else /* CONFIG_IRQ_REMAP */ @@ -59,6 +62,12 @@ static inline int intr_setup_ioapic_entry(int irq, { return -ENODEV; } +static inline int intr_set_affinity(struct irq_data *data, + const struct cpumask *mask, + bool force) +{ + return 0; +} #endif /* CONFIG_IRQ_REMAP */ #endif /* __X86_INTR_REMAPPING_H */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e1ab625fb9ca..a97c79aa25cf 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2327,71 +2327,6 @@ ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, return ret; } -#ifdef CONFIG_IRQ_REMAP - -/* - * Migrate the IO-APIC irq in the presence of intr-remapping. - * - * For both level and edge triggered, irq migration is a simple atomic - * update(of vector and cpu destination) of IRTE and flush the hardware cache. 
- * - * For level triggered, we eliminate the io-apic RTE modification (with the - * updated vector information), by using a virtual vector (io-apic pin number). - * Real vector that is used for interrupting cpu will be coming from - * the interrupt-remapping table entry. - * - * As the migration is a simple atomic update of IRTE, the same mechanism - * is used to migrate MSI irq's in the presence of interrupt-remapping. - */ -static int -ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - struct irq_cfg *cfg = data->chip_data; - unsigned int dest, irq = data->irq; - struct irte irte; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return -EINVAL; - - if (get_irte(irq, &irte)) - return -EBUSY; - - if (assign_irq_vector(irq, cfg, mask)) - return -EBUSY; - - dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); - - irte.vector = cfg->vector; - irte.dest_id = IRTE_DEST(dest); - - /* - * Atomically updates the IRTE with the new destination, vector - * and flushes the interrupt entry cache. - */ - modify_irte(irq, &irte); - - /* - * After this point, all the interrupts will start arriving - * at the new destination. So, time to cleanup the previous - * vector allocation. 
- */ - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - cpumask_copy(data->affinity, mask); - return 0; -} - -#else -static inline int -ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - return 0; -} -#endif - asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; @@ -2636,7 +2571,7 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip) chip->irq_eoi = ir_ack_apic_level; #ifdef CONFIG_SMP - chip->irq_set_affinity = ir_ioapic_set_affinity; + chip->irq_set_affinity = intr_set_affinity; #endif } #endif /* CONFIG_IRQ_REMAP */ @@ -3826,7 +3761,7 @@ void __init setup_ioapic_dest(void) mask = apic->target_cpus(); if (intr_remapping_enabled) - ir_ioapic_set_affinity(idata, mask, false); + intr_set_affinity(idata, mask, false); else ioapic_set_affinity(idata, mask, false); } diff --git a/drivers/iommu/intel_intr_remapping.c b/drivers/iommu/intel_intr_remapping.c index f495eba4b6ab..25372c1f3c8c 100644 --- a/drivers/iommu/intel_intr_remapping.c +++ b/drivers/iommu/intel_intr_remapping.c @@ -901,6 +901,59 @@ static int intel_setup_ioapic_entry(int irq, return 0; } +/* + * Migrate the IO-APIC irq in the presence of intr-remapping. + * + * For both level and edge triggered, irq migration is a simple atomic + * update(of vector and cpu destination) of IRTE and flush the hardware cache. + * + * For level triggered, we eliminate the io-apic RTE modification (with the + * updated vector information), by using a virtual vector (io-apic pin number). + * Real vector that is used for interrupting cpu will be coming from + * the interrupt-remapping table entry. + * + * As the migration is a simple atomic update of IRTE, the same mechanism + * is used to migrate MSI irq's in the presence of interrupt-remapping. 
+ */ +static int +intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + struct irq_cfg *cfg = data->chip_data; + unsigned int dest, irq = data->irq; + struct irte irte; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return -EINVAL; + + if (get_irte(irq, &irte)) + return -EBUSY; + + if (assign_irq_vector(irq, cfg, mask)) + return -EBUSY; + + dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); + + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + /* + * Atomically updates the IRTE with the new destination, vector + * and flushes the interrupt entry cache. + */ + modify_irte(irq, &irte); + + /* + * After this point, all the interrupts will start arriving + * at the new destination. So, time to cleanup the previous + * vector allocation. + */ + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + + cpumask_copy(data->affinity, mask); + return 0; +} struct irq_remap_ops intel_irq_remap_ops = { .supported = intel_intr_remapping_supported, @@ -910,4 +963,5 @@ struct irq_remap_ops intel_irq_remap_ops = { .hardware_reenable = reenable_intr_remapping, .enable_faulting = enable_drhd_fault_handling, .setup_ioapic_entry = intel_setup_ioapic_entry, + .set_affinity = intel_ioapic_set_affinity, }; diff --git a/drivers/iommu/intr_remapping.c b/drivers/iommu/intr_remapping.c index 739148ab2538..2f4f27ffb861 100644 --- a/drivers/iommu/intr_remapping.c +++ b/drivers/iommu/intr_remapping.c @@ -110,3 +110,12 @@ int intr_setup_ioapic_entry(int irq, return remap_ops->setup_ioapic_entry(irq, entry, destination, vector, attr); } + +int intr_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + if (!remap_ops || !remap_ops->set_affinity) + return 0; + + return remap_ops->set_affinity(data, mask, force); +} diff --git a/drivers/iommu/intr_remapping.h b/drivers/iommu/intr_remapping.h index e8994f2b3bbe..e0bc6e0ba1fb 100644 --- a/drivers/iommu/intr_remapping.h +++ 
b/drivers/iommu/intr_remapping.h @@ -26,6 +26,8 @@ struct IO_APIC_route_entry; struct io_apic_irq_attr; +struct irq_data; +struct cpumask; extern int disable_intremap; extern int disable_sourceid_checking; @@ -54,6 +56,10 @@ struct irq_remap_ops { int (*setup_ioapic_entry)(int irq, struct IO_APIC_route_entry *, unsigned int, int, struct io_apic_irq_attr *); + + /* Set the CPU affinity of a remapped interrupt */ + int (*set_affinity)(struct irq_data *data, const struct cpumask *mask, + bool force); }; extern struct irq_remap_ops intel_irq_remap_ops; -- cgit v1.2.3 From 9d619f65722236e0e0c35467d1528caed206e439 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 30 Mar 2012 11:47:04 -0700 Subject: iommu/vt-d: Convert free_irte into a remap_ops callback The operation for releasing a remapping entry is iommu specific too. Signed-off-by: Joerg Roedel Acked-by: Yinghai Lu Cc: David Woodhouse Cc: Alex Williamson Signed-off-by: Suresh Siddha Signed-off-by: Joerg Roedel --- arch/x86/include/asm/intr_remapping.h | 2 ++ arch/x86/kernel/apic/io_apic.c | 2 +- drivers/iommu/intel_intr_remapping.c | 3 ++- drivers/iommu/intr_remapping.c | 8 ++++++++ drivers/iommu/intr_remapping.h | 3 +++ include/linux/dmar.h | 5 ----- 6 files changed, 16 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intr_remapping.h b/arch/x86/include/asm/intr_remapping.h index ae933ecfd8f0..a195b7d6995c 100644 --- a/arch/x86/include/asm/intr_remapping.h +++ b/arch/x86/include/asm/intr_remapping.h @@ -43,6 +43,7 @@ extern int intr_setup_ioapic_entry(int irq, extern int intr_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force); +extern void intr_free_irq(int irq); #else /* CONFIG_IRQ_REMAP */ @@ -68,6 +69,7 @@ static inline int intr_set_affinity(struct irq_data *data, { return 0; } +static inline void intr_free_irq(int irq) { } #endif /* CONFIG_IRQ_REMAP */ #endif /* __X86_INTR_REMAPPING_H */ diff --git a/arch/x86/kernel/apic/io_apic.c 
b/arch/x86/kernel/apic/io_apic.c index a97c79aa25cf..5690469555fb 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3041,7 +3041,7 @@ void destroy_irq(unsigned int irq) irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); if (irq_remapped(cfg)) - free_irte(irq); + intr_free_irq(irq); raw_spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq, cfg); raw_spin_unlock_irqrestore(&vector_lock, flags); diff --git a/drivers/iommu/intel_intr_remapping.c b/drivers/iommu/intel_intr_remapping.c index 25372c1f3c8c..44a6e04a070b 100644 --- a/drivers/iommu/intel_intr_remapping.c +++ b/drivers/iommu/intel_intr_remapping.c @@ -253,7 +253,7 @@ static int clear_entries(struct irq_2_iommu *irq_iommu) return qi_flush_iec(iommu, index, irq_iommu->irte_mask); } -int free_irte(int irq) +static int free_irte(int irq) { struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); unsigned long flags; @@ -964,4 +964,5 @@ struct irq_remap_ops intel_irq_remap_ops = { .enable_faulting = enable_drhd_fault_handling, .setup_ioapic_entry = intel_setup_ioapic_entry, .set_affinity = intel_ioapic_set_affinity, + .free_irq = free_irte, }; diff --git a/drivers/iommu/intr_remapping.c b/drivers/iommu/intr_remapping.c index 2f4f27ffb861..a68d304f9729 100644 --- a/drivers/iommu/intr_remapping.c +++ b/drivers/iommu/intr_remapping.c @@ -119,3 +119,11 @@ int intr_set_affinity(struct irq_data *data, const struct cpumask *mask, return remap_ops->set_affinity(data, mask, force); } + +void intr_free_irq(int irq) +{ + if (!remap_ops || !remap_ops->free_irq) + return; + + remap_ops->free_irq(irq); +} diff --git a/drivers/iommu/intr_remapping.h b/drivers/iommu/intr_remapping.h index e0bc6e0ba1fb..57485539383d 100644 --- a/drivers/iommu/intr_remapping.h +++ b/drivers/iommu/intr_remapping.h @@ -60,6 +60,9 @@ struct irq_remap_ops { /* Set the CPU affinity of a remapped interrupt */ int (*set_affinity)(struct irq_data *data, const struct cpumask *mask, bool force); + + /* Free an IRQ */ 
+ int (*free_irq)(int); }; extern struct irq_remap_ops intel_irq_remap_ops; diff --git a/include/linux/dmar.h b/include/linux/dmar.h index f2bd87f52a8d..7a207a39f879 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -121,7 +121,6 @@ extern int alloc_irte(struct intel_iommu *iommu, int irq, u16 count); extern int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 sub_handle); extern int map_irq_to_irte_handle(int irq, u16 *sub_handle); -extern int free_irte(int irq); extern struct intel_iommu *map_dev_to_ir(struct pci_dev *dev); extern struct intel_iommu *map_ioapic_to_ir(int apic); @@ -138,10 +137,6 @@ static inline int modify_irte(int irq, struct irte *irte_modified) { return -1; } -static inline int free_irte(int irq) -{ - return -1; -} static inline int map_irq_to_irte_handle(int irq, u16 *sub_handle) { return -1; -- cgit v1.2.3 From 5e2b930b0784a30c98dee8e9d79c1f84c31f7209 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 30 Mar 2012 11:47:05 -0700 Subject: iommu/vt-d: Convert MSI remapping setup to remap_ops This patch introduces remapping-ops for setting up MSI interrupts. 
Signed-off-by: Joerg Roedel Acked-by: Yinghai Lu Cc: David Woodhouse Cc: Alex Williamson Signed-off-by: Suresh Siddha Signed-off-by: Joerg Roedel --- arch/x86/include/asm/intr_remapping.h | 26 ++++++++ arch/x86/include/asm/irq_remapping.h | 23 ------- arch/x86/kernel/apic/io_apic.c | 119 ++++++++-------------------------- drivers/iommu/intel_intr_remapping.c | 97 +++++++++++++++++++++++++++ drivers/iommu/intr_remapping.c | 35 ++++++++++ drivers/iommu/intr_remapping.h | 16 +++++ 6 files changed, 202 insertions(+), 114 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intr_remapping.h b/arch/x86/include/asm/intr_remapping.h index a195b7d6995c..a6afd6efa6c6 100644 --- a/arch/x86/include/asm/intr_remapping.h +++ b/arch/x86/include/asm/intr_remapping.h @@ -26,6 +26,7 @@ struct IO_APIC_route_entry; struct io_apic_irq_attr; +struct pci_dev; extern int intr_remapping_enabled; @@ -44,6 +45,13 @@ extern int intr_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force); extern void intr_free_irq(int irq); +extern void intr_compose_msi_msg(struct pci_dev *pdev, + unsigned int irq, unsigned int dest, + struct msi_msg *msg, u8 hpet_id); +extern int intr_msi_alloc_irq(struct pci_dev *pdev, int irq, int nvec); +extern int intr_msi_setup_irq(struct pci_dev *pdev, unsigned int irq, + int index, int sub_handle); +extern int intr_setup_hpet_msi(unsigned int irq, unsigned int id); #else /* CONFIG_IRQ_REMAP */ @@ -70,6 +78,24 @@ static inline int intr_set_affinity(struct irq_data *data, return 0; } static inline void intr_free_irq(int irq) { } +static inline void intr_compose_msi_msg(struct pci_dev *pdev, + unsigned int irq, unsigned int dest, + struct msi_msg *msg, u8 hpet_id) +{ +} +static inline int intr_msi_alloc_irq(struct pci_dev *pdev, int irq, int nvec) +{ + return -ENODEV; +} +static inline int intr_msi_setup_irq(struct pci_dev *pdev, unsigned int irq, + int index, int sub_handle) +{ + return -ENODEV; +} +static inline int 
intr_setup_hpet_msi(unsigned int irq, unsigned int id) +{ + return -ENODEV; +} #endif /* CONFIG_IRQ_REMAP */ #endif /* __X86_INTR_REMAPPING_H */ diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 47d99934580f..0ddfc0b90adb 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -5,34 +5,11 @@ #ifdef CONFIG_IRQ_REMAP static void irq_remap_modify_chip_defaults(struct irq_chip *chip); -static inline void prepare_irte(struct irte *irte, int vector, - unsigned int dest) -{ - memset(irte, 0, sizeof(*irte)); - - irte->present = 1; - irte->dst_mode = apic->irq_dest_mode; - /* - * Trigger mode in the IRTE will always be edge, and for IO-APIC, the - * actual level or edge trigger will be setup in the IO-APIC - * RTE. This will help simplify level triggered irq migration. - * For more details, see the comments (in io_apic.c) explainig IO-APIC - * irq migration in the presence of interrupt-remapping. - */ - irte->trigger_mode = 0; - irte->dlvry_mode = apic->irq_delivery_mode; - irte->vector = vector; - irte->dest_id = IRTE_DEST(dest); - irte->redir_hint = 1; -} static inline bool irq_remapped(struct irq_cfg *cfg) { return cfg->irq_2_iommu.iommu != NULL; } #else -static void prepare_irte(struct irte *irte, int vector, unsigned int dest) -{ -} static inline bool irq_remapped(struct irq_cfg *cfg) { return false; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5690469555fb..3db693bda91d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3070,54 +3070,34 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); if (irq_remapped(cfg)) { - struct irte irte; - int ir_index; - u16 sub_handle; - - ir_index = map_irq_to_irte_handle(irq, &sub_handle); - BUG_ON(ir_index == -1); - - prepare_irte(&irte, cfg->vector, dest); - - /* Set source-id of interrupt 
request */ - if (pdev) - set_msi_sid(&irte, pdev); - else - set_hpet_sid(&irte, hpet_id); - - modify_irte(irq, &irte); + intr_compose_msi_msg(pdev, irq, dest, msg, hpet_id); + return err; + } + if (x2apic_enabled()) + msg->address_hi = MSI_ADDR_BASE_HI | + MSI_ADDR_EXT_DEST_ID(dest); + else msg->address_hi = MSI_ADDR_BASE_HI; - msg->data = sub_handle; - msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | - MSI_ADDR_IR_SHV | - MSI_ADDR_IR_INDEX1(ir_index) | - MSI_ADDR_IR_INDEX2(ir_index); - } else { - if (x2apic_enabled()) - msg->address_hi = MSI_ADDR_BASE_HI | - MSI_ADDR_EXT_DEST_ID(dest); - else - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = - MSI_ADDR_BASE_LO | - ((apic->irq_dest_mode == 0) ? - MSI_ADDR_DEST_MODE_PHYSICAL: - MSI_ADDR_DEST_MODE_LOGICAL) | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU: - MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); + msg->address_lo = + MSI_ADDR_BASE_LO | + ((apic->irq_dest_mode == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL: + MSI_ADDR_DEST_MODE_LOGICAL) | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(cfg->vector); - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_DATA_DELIVERY_FIXED: - MSI_DATA_DELIVERY_LOWPRI) | - MSI_DATA_VECTOR(cfg->vector); - } return err; } @@ -3160,33 +3140,6 @@ static struct irq_chip msi_chip = { .irq_retrigger = ioapic_retrigger_irq, }; -/* - * Map the PCI dev to the corresponding remapping hardware unit - * and allocate 'nvec' consecutive interrupt-remapping table entries - * in it. 
- */ -static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) -{ - struct intel_iommu *iommu; - int index; - - iommu = map_dev_to_ir(dev); - if (!iommu) { - printk(KERN_ERR - "Unable to map PCI %s to iommu\n", pci_name(dev)); - return -ENOENT; - } - - index = alloc_irte(iommu, irq, nvec); - if (index < 0) { - printk(KERN_ERR - "Unable to allocate %d IRTE for PCI %s\n", nvec, - pci_name(dev)); - return -ENOSPC; - } - return index; -} - static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) { struct irq_chip *chip = &msi_chip; @@ -3217,7 +3170,6 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int node, ret, sub_handle, index = 0; unsigned int irq, irq_want; struct msi_desc *msidesc; - struct intel_iommu *iommu = NULL; /* x86 doesn't support multiple MSI yet */ if (type == PCI_CAP_ID_MSI && nvec > 1) @@ -3239,23 +3191,15 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) * allocate the consecutive block of IRTE's * for 'nvec' */ - index = msi_alloc_irte(dev, irq, nvec); + index = intr_msi_alloc_irq(dev, irq, nvec); if (index < 0) { ret = index; goto error; } } else { - iommu = map_dev_to_ir(dev); - if (!iommu) { - ret = -ENOENT; + ret = intr_msi_setup_irq(dev, irq, index, sub_handle); + if (ret < 0) goto error; - } - /* - * setup the mapping between the irq and the IRTE - * base index, the sub_handle pointing to the - * appropriate interrupt remap table entry. 
- */ - set_irte_irq(irq, iommu, index, sub_handle); } no_ir: ret = setup_msi_irq(dev, msidesc, irq); @@ -3374,14 +3318,7 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) int ret; if (intr_remapping_enabled) { - struct intel_iommu *iommu = map_hpet_to_ir(id); - int index; - - if (!iommu) - return -1; - - index = alloc_irte(iommu, irq, 1); - if (index < 0) + if (!intr_setup_hpet_msi(irq, id)) return -1; } diff --git a/drivers/iommu/intel_intr_remapping.c b/drivers/iommu/intel_intr_remapping.c index 44a6e04a070b..a3bae67ec43c 100644 --- a/drivers/iommu/intel_intr_remapping.c +++ b/drivers/iommu/intel_intr_remapping.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "intr_remapping.h" @@ -955,6 +956,98 @@ intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, return 0; } +static void intel_compose_msi_msg(struct pci_dev *pdev, + unsigned int irq, unsigned int dest, + struct msi_msg *msg, u8 hpet_id) +{ + struct irq_cfg *cfg; + struct irte irte; + u16 sub_handle; + int ir_index; + + cfg = irq_get_chip_data(irq); + + ir_index = map_irq_to_irte_handle(irq, &sub_handle); + BUG_ON(ir_index == -1); + + prepare_irte(&irte, cfg->vector, dest); + + /* Set source-id of interrupt request */ + if (pdev) + set_msi_sid(&irte, pdev); + else + set_hpet_sid(&irte, hpet_id); + + modify_irte(irq, &irte); + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->data = sub_handle; + msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | + MSI_ADDR_IR_SHV | + MSI_ADDR_IR_INDEX1(ir_index) | + MSI_ADDR_IR_INDEX2(ir_index); +} + +/* + * Map the PCI dev to the corresponding remapping hardware unit + * and allocate 'nvec' consecutive interrupt-remapping table entries + * in it. 
+ */ +static int intel_msi_alloc_irq(struct pci_dev *dev, int irq, int nvec) +{ + struct intel_iommu *iommu; + int index; + + iommu = map_dev_to_ir(dev); + if (!iommu) { + printk(KERN_ERR + "Unable to map PCI %s to iommu\n", pci_name(dev)); + return -ENOENT; + } + + index = alloc_irte(iommu, irq, nvec); + if (index < 0) { + printk(KERN_ERR + "Unable to allocate %d IRTE for PCI %s\n", nvec, + pci_name(dev)); + return -ENOSPC; + } + return index; +} + +static int intel_msi_setup_irq(struct pci_dev *pdev, unsigned int irq, + int index, int sub_handle) +{ + struct intel_iommu *iommu; + + iommu = map_dev_to_ir(pdev); + if (!iommu) + return -ENOENT; + /* + * setup the mapping between the irq and the IRTE + * base index, the sub_handle pointing to the + * appropriate interrupt remap table entry. + */ + set_irte_irq(irq, iommu, index, sub_handle); + + return 0; +} + +static int intel_setup_hpet_msi(unsigned int irq, unsigned int id) +{ + struct intel_iommu *iommu = map_hpet_to_ir(id); + int index; + + if (!iommu) + return -1; + + index = alloc_irte(iommu, irq, 1); + if (index < 0) + return -1; + + return 0; +} + struct irq_remap_ops intel_irq_remap_ops = { .supported = intel_intr_remapping_supported, .hardware_init = dmar_table_init, @@ -965,4 +1058,8 @@ struct irq_remap_ops intel_irq_remap_ops = { .setup_ioapic_entry = intel_setup_ioapic_entry, .set_affinity = intel_ioapic_set_affinity, .free_irq = free_irte, + .compose_msi_msg = intel_compose_msi_msg, + .msi_alloc_irq = intel_msi_alloc_irq, + .msi_setup_irq = intel_msi_setup_irq, + .setup_hpet_msi = intel_setup_hpet_msi, }; diff --git a/drivers/iommu/intr_remapping.c b/drivers/iommu/intr_remapping.c index a68d304f9729..9dc179316ba1 100644 --- a/drivers/iommu/intr_remapping.c +++ b/drivers/iommu/intr_remapping.c @@ -127,3 +127,38 @@ void intr_free_irq(int irq) remap_ops->free_irq(irq); } + +void intr_compose_msi_msg(struct pci_dev *pdev, + unsigned int irq, unsigned int dest, + struct msi_msg *msg, u8 hpet_id) +{ + if 
(!remap_ops || !remap_ops->compose_msi_msg) + return; + + remap_ops->compose_msi_msg(pdev, irq, dest, msg, hpet_id); +} + +int intr_msi_alloc_irq(struct pci_dev *pdev, int irq, int nvec) +{ + if (!remap_ops || !remap_ops->msi_alloc_irq) + return -ENODEV; + + return remap_ops->msi_alloc_irq(pdev, irq, nvec); +} + +int intr_msi_setup_irq(struct pci_dev *pdev, unsigned int irq, + int index, int sub_handle) +{ + if (!remap_ops || !remap_ops->msi_setup_irq) + return -ENODEV; + + return remap_ops->msi_setup_irq(pdev, irq, index, sub_handle); +} + +int intr_setup_hpet_msi(unsigned int irq, unsigned int id) +{ + if (!remap_ops || !remap_ops->setup_hpet_msi) + return -ENODEV; + + return remap_ops->setup_hpet_msi(irq, id); +} diff --git a/drivers/iommu/intr_remapping.h b/drivers/iommu/intr_remapping.h index 57485539383d..6f4ea0a387b1 100644 --- a/drivers/iommu/intr_remapping.h +++ b/drivers/iommu/intr_remapping.h @@ -28,6 +28,8 @@ struct IO_APIC_route_entry; struct io_apic_irq_attr; struct irq_data; struct cpumask; +struct pci_dev; +struct msi_msg; extern int disable_intremap; extern int disable_sourceid_checking; @@ -63,6 +65,20 @@ struct irq_remap_ops { /* Free an IRQ */ int (*free_irq)(int); + + /* Create MSI msg to use for interrupt remapping */ + void (*compose_msi_msg)(struct pci_dev *, + unsigned int, unsigned int, + struct msi_msg *, u8); + + /* Allocate remapping resources for MSI */ + int (*msi_alloc_irq)(struct pci_dev *, int, int); + + /* Setup the remapped MSI irq */ + int (*msi_setup_irq)(struct pci_dev *, unsigned int, int, int); + + /* Setup interrupt remapping for an HPET MSI */ + int (*setup_hpet_msi)(unsigned int, unsigned int); }; extern struct irq_remap_ops intel_irq_remap_ops; -- cgit v1.2.3 From 263b5e8629c9ce21c9cd4c0e29c097afb1c10ef3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 30 Mar 2012 11:47:06 -0700 Subject: x86, iommu/vt-d: Clean up interfaces for interrupt remapping Remove the Intel specific interfaces from dmar.h and remove 
asm/irq_remapping.h which is only used for io_apic.c anyway. Signed-off-by: Joerg Roedel Acked-by: Yinghai Lu Cc: David Woodhouse Cc: Alex Williamson Signed-off-by: Suresh Siddha Signed-off-by: Joerg Roedel --- arch/x86/include/asm/irq_remapping.h | 22 -------------- arch/x86/kernel/apic/io_apic.c | 17 ++++++++++- drivers/iommu/intel_intr_remapping.c | 20 ++++++------ include/linux/dmar.h | 59 ------------------------------------ 4 files changed, 26 insertions(+), 92 deletions(-) delete mode 100644 arch/x86/include/asm/irq_remapping.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h deleted file mode 100644 index 0ddfc0b90adb..000000000000 --- a/arch/x86/include/asm/irq_remapping.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef _ASM_X86_IRQ_REMAPPING_H -#define _ASM_X86_IRQ_REMAPPING_H - -#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8) - -#ifdef CONFIG_IRQ_REMAP -static void irq_remap_modify_chip_defaults(struct irq_chip *chip); -static inline bool irq_remapped(struct irq_cfg *cfg) -{ - return cfg->irq_2_iommu.iommu != NULL; -} -#else -static inline bool irq_remapped(struct irq_cfg *cfg) -{ - return false; -} -static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip) -{ -} -#endif - -#endif /* _ASM_X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3db693bda91d..073edd1d3c66 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -58,7 +58,6 @@ #include #include #include -#include #include #include @@ -87,6 +86,22 @@ void __init set_io_apic_ops(const struct io_apic_ops *ops) io_apic_ops = *ops; } +#ifdef CONFIG_IRQ_REMAP +static void irq_remap_modify_chip_defaults(struct irq_chip *chip); +static inline bool irq_remapped(struct irq_cfg *cfg) +{ + return cfg->irq_2_iommu.iommu != NULL; +} +#else +static inline bool irq_remapped(struct irq_cfg *cfg) +{ + return false; +} +static inline void 
irq_remap_modify_chip_defaults(struct irq_chip *chip) +{ +} +#endif + /* * Is the SiS APIC rmw bug present ? * -1 = don't know, 0 = no, 1 = yes diff --git a/drivers/iommu/intel_intr_remapping.c b/drivers/iommu/intel_intr_remapping.c index a3bae67ec43c..7472634df350 100644 --- a/drivers/iommu/intel_intr_remapping.c +++ b/drivers/iommu/intel_intr_remapping.c @@ -64,7 +64,7 @@ int get_irte(int irq, struct irte *entry) return 0; } -int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) +static int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) { struct ir_table *table = iommu->ir_table; struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); @@ -136,7 +136,7 @@ static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask) return qi_submit_sync(&desc, iommu); } -int map_irq_to_irte_handle(int irq, u16 *sub_handle) +static int map_irq_to_irte_handle(int irq, u16 *sub_handle) { struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); unsigned long flags; @@ -152,7 +152,7 @@ int map_irq_to_irte_handle(int irq, u16 *sub_handle) return index; } -int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle) +static int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle) { struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); unsigned long flags; @@ -172,7 +172,7 @@ int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle) return 0; } -int modify_irte(int irq, struct irte *irte_modified) +static int modify_irte(int irq, struct irte *irte_modified) { struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); struct intel_iommu *iommu; @@ -200,7 +200,7 @@ int modify_irte(int irq, struct irte *irte_modified) return rc; } -struct intel_iommu *map_hpet_to_ir(u8 hpet_id) +static struct intel_iommu *map_hpet_to_ir(u8 hpet_id) { int i; @@ -210,7 +210,7 @@ struct intel_iommu *map_hpet_to_ir(u8 hpet_id) return NULL; } -struct intel_iommu *map_ioapic_to_ir(int apic) +static struct intel_iommu *map_ioapic_to_ir(int apic) { int 
i; @@ -220,7 +220,7 @@ struct intel_iommu *map_ioapic_to_ir(int apic) return NULL; } -struct intel_iommu *map_dev_to_ir(struct pci_dev *dev) +static struct intel_iommu *map_dev_to_ir(struct pci_dev *dev) { struct dmar_drhd_unit *drhd; @@ -312,7 +312,7 @@ static void set_irte_sid(struct irte *irte, unsigned int svt, irte->sid = sid; } -int set_ioapic_sid(struct irte *irte, int apic) +static int set_ioapic_sid(struct irte *irte, int apic) { int i; u16 sid = 0; @@ -337,7 +337,7 @@ int set_ioapic_sid(struct irte *irte, int apic) return 0; } -int set_hpet_sid(struct irte *irte, u8 id) +static int set_hpet_sid(struct irte *irte, u8 id) { int i; u16 sid = 0; @@ -367,7 +367,7 @@ int set_hpet_sid(struct irte *irte, u8 id) return 0; } -int set_msi_sid(struct irte *irte, struct pci_dev *dev) +static int set_msi_sid(struct irte *irte, struct pci_dev *dev) { struct pci_dev *bridge; diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 7a207a39f879..b029d1aa2d12 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -114,65 +114,6 @@ struct irte { }; }; -#ifdef CONFIG_IRQ_REMAP -extern int get_irte(int irq, struct irte *entry); -extern int modify_irte(int irq, struct irte *irte_modified); -extern int alloc_irte(struct intel_iommu *iommu, int irq, u16 count); -extern int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, - u16 sub_handle); -extern int map_irq_to_irte_handle(int irq, u16 *sub_handle); - -extern struct intel_iommu *map_dev_to_ir(struct pci_dev *dev); -extern struct intel_iommu *map_ioapic_to_ir(int apic); -extern struct intel_iommu *map_hpet_to_ir(u8 id); -extern int set_ioapic_sid(struct irte *irte, int apic); -extern int set_hpet_sid(struct irte *irte, u8 id); -extern int set_msi_sid(struct irte *irte, struct pci_dev *dev); -#else -static inline int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) -{ - return -1; -} -static inline int modify_irte(int irq, struct irte *irte_modified) -{ - return -1; -} -static inline int 
map_irq_to_irte_handle(int irq, u16 *sub_handle) -{ - return -1; -} -static inline int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, - u16 sub_handle) -{ - return -1; -} -static inline struct intel_iommu *map_dev_to_ir(struct pci_dev *dev) -{ - return NULL; -} -static inline struct intel_iommu *map_ioapic_to_ir(int apic) -{ - return NULL; -} -static inline struct intel_iommu *map_hpet_to_ir(unsigned int hpet_id) -{ - return NULL; -} -static inline int set_ioapic_sid(struct irte *irte, int apic) -{ - return 0; -} -static inline int set_hpet_sid(struct irte *irte, u8 id) -{ - return -1; -} -static inline int set_msi_sid(struct irte *irte, struct pci_dev *dev) -{ - return 0; -} - -#endif - enum { IRQ_REMAP_XAPIC_MODE, IRQ_REMAP_X2APIC_MODE, -- cgit v1.2.3 From 95a02e976c39d63716b8c7c226bc530a2041536f Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 30 Mar 2012 11:47:07 -0700 Subject: iommu: rename intr_remapping references to irq_remapping Make the code consistent with the naming conventions of irq subsystem. 
Signed-off-by: Suresh Siddha Cc: Joerg Roedel Cc: Yinghai Lu Cc: David Woodhouse Cc: Alex Williamson Signed-off-by: Joerg Roedel --- arch/ia64/include/asm/intr_remapping.h | 2 +- arch/x86/include/asm/intr_remapping.h | 94 +++++++++++++++++----------------- arch/x86/kernel/apic/apic.c | 22 ++++---- arch/x86/kernel/apic/io_apic.c | 41 +++++++-------- drivers/iommu/dmar.c | 8 +-- drivers/iommu/intel-iommu.c | 2 +- drivers/iommu/intel_intr_remapping.c | 40 +++++++-------- drivers/iommu/intr_remapping.c | 74 +++++++++++++------------- drivers/iommu/intr_remapping.h | 10 ++-- 9 files changed, 148 insertions(+), 145 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/include/asm/intr_remapping.h b/arch/ia64/include/asm/intr_remapping.h index 095aa0d46c58..a8687b1d8906 100644 --- a/arch/ia64/include/asm/intr_remapping.h +++ b/arch/ia64/include/asm/intr_remapping.h @@ -1,4 +1,4 @@ #ifndef __IA64_INTR_REMAPPING_H #define __IA64_INTR_REMAPPING_H -#define intr_remapping_enabled 0 +#define irq_remapping_enabled 0 #endif diff --git a/arch/x86/include/asm/intr_remapping.h b/arch/x86/include/asm/intr_remapping.h index a6afd6efa6c6..f9cbbcb2956e 100644 --- a/arch/x86/include/asm/intr_remapping.h +++ b/arch/x86/include/asm/intr_remapping.h @@ -28,71 +28,73 @@ struct IO_APIC_route_entry; struct io_apic_irq_attr; struct pci_dev; -extern int intr_remapping_enabled; +extern int irq_remapping_enabled; -extern void setup_intr_remapping(void); -extern int intr_remapping_supported(void); -extern int intr_hardware_init(void); -extern int intr_hardware_enable(void); -extern void intr_hardware_disable(void); -extern int intr_hardware_reenable(int); -extern int intr_enable_fault_handling(void); -extern int intr_setup_ioapic_entry(int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr); -extern int intr_set_affinity(struct irq_data *data, - const struct cpumask *mask, - bool force); -extern void intr_free_irq(int irq); 
-extern void intr_compose_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id); -extern int intr_msi_alloc_irq(struct pci_dev *pdev, int irq, int nvec); -extern int intr_msi_setup_irq(struct pci_dev *pdev, unsigned int irq, - int index, int sub_handle); -extern int intr_setup_hpet_msi(unsigned int irq, unsigned int id); +extern void setup_irq_remapping_ops(void); +extern int irq_remapping_supported(void); +extern int irq_remapping_prepare(void); +extern int irq_remapping_enable(void); +extern void irq_remapping_disable(void); +extern int irq_remapping_reenable(int); +extern int irq_remap_enable_fault_handling(void); +extern int setup_ioapic_remapped_entry(int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, + int vector, + struct io_apic_irq_attr *attr); +extern int set_remapped_irq_affinity(struct irq_data *data, + const struct cpumask *mask, + bool force); +extern void free_remapped_irq(int irq); +extern void compose_remapped_msi_msg(struct pci_dev *pdev, + unsigned int irq, unsigned int dest, + struct msi_msg *msg, u8 hpet_id); +extern int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec); +extern int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq, + int index, int sub_handle); +extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id); #else /* CONFIG_IRQ_REMAP */ -#define intr_remapping_enabled 0 +#define irq_remapping_enabled 0 -static inline void setup_intr_remapping(void) { } -static inline int intr_remapping_supported(void) { return 0; } -static inline int intr_hardware_init(void) { return -ENODEV; } -static inline int intr_hardware_enable(void) { return -ENODEV; } -static inline void intr_hardware_disable(void) { } -static inline int intr_hardware_reenable(int eim) { return -ENODEV; } -static inline int intr_enable_fault_handling(void) { return -ENODEV; } -static inline int intr_setup_ioapic_entry(int irq, - struct IO_APIC_route_entry *entry, - 
unsigned int destination, int vector, - struct io_apic_irq_attr *attr) +static inline void setup_irq_remapping_ops(void) { } +static inline int irq_remapping_supported(void) { return 0; } +static inline int irq_remapping_prepare(void) { return -ENODEV; } +static inline int irq_remapping_enable(void) { return -ENODEV; } +static inline void irq_remapping_disable(void) { } +static inline int irq_remapping_reenable(int eim) { return -ENODEV; } +static inline int irq_remap_enable_fault_handling(void) { return -ENODEV; } +static inline int setup_ioapic_remapped_entry(int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, + int vector, + struct io_apic_irq_attr *attr) { return -ENODEV; } -static inline int intr_set_affinity(struct irq_data *data, - const struct cpumask *mask, - bool force) +static inline int set_remapped_irq_affinity(struct irq_data *data, + const struct cpumask *mask, + bool force) { return 0; } -static inline void intr_free_irq(int irq) { } -static inline void intr_compose_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id) +static inline void free_remapped_irq(int irq) { } +static inline void compose_remapped_msi_msg(struct pci_dev *pdev, + unsigned int irq, unsigned int dest, + struct msi_msg *msg, u8 hpet_id) { } -static inline int intr_msi_alloc_irq(struct pci_dev *pdev, int irq, int nvec) +static inline int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec) { return -ENODEV; } -static inline int intr_msi_setup_irq(struct pci_dev *pdev, unsigned int irq, - int index, int sub_handle) +static inline int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq, + int index, int sub_handle) { return -ENODEV; } -static inline int intr_setup_hpet_msi(unsigned int irq, unsigned int id) +static inline int setup_hpet_msi_remapped(unsigned int irq, unsigned int id) { return -ENODEV; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 
a2762687e2ee..c02c666c4628 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1442,8 +1442,8 @@ void __init bsp_end_local_APIC_setup(void) * Now that local APIC setup is completed for BP, configure the fault * handling for interrupt remapping. */ - if (intr_remapping_enabled) - intr_enable_fault_handling(); + if (irq_remapping_enabled) + irq_remap_enable_fault_handling(); } @@ -1518,7 +1518,7 @@ void enable_x2apic(void) int __init enable_IR(void) { #ifdef CONFIG_IRQ_REMAP - if (!intr_remapping_supported()) { + if (!irq_remapping_supported()) { pr_debug("intr-remapping not supported\n"); return -1; } @@ -1529,7 +1529,7 @@ int __init enable_IR(void) return -1; } - return intr_hardware_enable(); + return irq_remapping_enable(); #endif return -1; } @@ -1541,9 +1541,9 @@ void __init enable_IR_x2apic(void) int hardware_init_ret; /* Make sure irq_remap_ops are initialized */ - setup_intr_remapping(); + setup_irq_remapping_ops(); - hardware_init_ret = intr_hardware_init(); + hardware_init_ret = irq_remapping_prepare(); if (hardware_init_ret && !x2apic_supported()) return; @@ -2180,8 +2180,8 @@ static int lapic_suspend(void) local_irq_save(flags); disable_local_APIC(); - if (intr_remapping_enabled) - intr_hardware_disable(); + if (irq_remapping_enabled) + irq_remapping_disable(); local_irq_restore(flags); return 0; @@ -2197,7 +2197,7 @@ static void lapic_resume(void) return; local_irq_save(flags); - if (intr_remapping_enabled) { + if (irq_remapping_enabled) { /* * IO-APIC and PIC have their own resume routines. 
* We just mask them here to make sure the interrupt @@ -2249,8 +2249,8 @@ static void lapic_resume(void) apic_write(APIC_ESR, 0); apic_read(APIC_ESR); - if (intr_remapping_enabled) - intr_hardware_reenable(x2apic_mode); + if (irq_remapping_enabled) + irq_remapping_reenable(x2apic_mode); local_irq_restore(flags); } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 073edd1d3c66..abbbcd4d1d71 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1381,9 +1381,9 @@ static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, unsigned int destination, int vector, struct io_apic_irq_attr *attr) { - if (intr_remapping_enabled) - return intr_setup_ioapic_entry(irq, entry, destination, - vector, attr); + if (irq_remapping_enabled) + return setup_ioapic_remapped_entry(irq, entry, destination, + vector, attr); memset(entry, 0, sizeof(*entry)); @@ -1540,7 +1540,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, { struct IO_APIC_route_entry entry; - if (intr_remapping_enabled) + if (irq_remapping_enabled) return; memset(&entry, 0, sizeof(entry)); @@ -1626,7 +1626,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx) printk(KERN_DEBUG ".... IRQ redirection table:\n"); - if (intr_remapping_enabled) { + if (irq_remapping_enabled) { printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR" " Pol Stat Indx2 Zero Vect:\n"); } else { @@ -1635,7 +1635,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx) } for (i = 0; i <= reg_01.bits.entries; i++) { - if (intr_remapping_enabled) { + if (irq_remapping_enabled) { struct IO_APIC_route_entry entry; struct IR_IO_APIC_route_entry *ir_entry; @@ -2002,7 +2002,7 @@ void disable_IO_APIC(void) * IOAPIC RTE as well as interrupt-remapping table entry). * As this gets called during crash dump, keep this simple for now. 
*/ - if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) { + if (ioapic_i8259.pin != -1 && !irq_remapping_enabled) { struct IO_APIC_route_entry entry; memset(&entry, 0, sizeof(entry)); @@ -2026,7 +2026,7 @@ void disable_IO_APIC(void) * Use virtual wire A mode when interrupt remapping is enabled. */ if (cpu_has_apic || apic_from_smp_config()) - disconnect_bsp_APIC(!intr_remapping_enabled && + disconnect_bsp_APIC(!irq_remapping_enabled && ioapic_i8259.pin != -1); } @@ -2586,7 +2586,7 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip) chip->irq_eoi = ir_ack_apic_level; #ifdef CONFIG_SMP - chip->irq_set_affinity = intr_set_affinity; + chip->irq_set_affinity = set_remapped_irq_affinity; #endif } #endif /* CONFIG_IRQ_REMAP */ @@ -2799,7 +2799,7 @@ static inline void __init check_timer(void) * 8259A. */ if (pin1 == -1) { - if (intr_remapping_enabled) + if (irq_remapping_enabled) panic("BIOS bug: timer not connected to IO-APIC"); pin1 = pin2; apic1 = apic2; @@ -2832,7 +2832,7 @@ static inline void __init check_timer(void) clear_IO_APIC_pin(0, pin1); goto out; } - if (intr_remapping_enabled) + if (irq_remapping_enabled) panic("timer doesn't work through Interrupt-remapped IO-APIC"); local_irq_disable(); clear_IO_APIC_pin(apic1, pin1); @@ -3056,7 +3056,7 @@ void destroy_irq(unsigned int irq) irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); if (irq_remapped(cfg)) - intr_free_irq(irq); + free_remapped_irq(irq); raw_spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq, cfg); raw_spin_unlock_irqrestore(&vector_lock, flags); @@ -3085,7 +3085,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); if (irq_remapped(cfg)) { - intr_compose_msi_msg(pdev, irq, dest, msg, hpet_id); + compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id); return err; } @@ -3198,7 +3198,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (irq == 0) return -1; 
irq_want = irq + 1; - if (!intr_remapping_enabled) + if (!irq_remapping_enabled) goto no_ir; if (!sub_handle) { @@ -3206,13 +3206,14 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) * allocate the consecutive block of IRTE's * for 'nvec' */ - index = intr_msi_alloc_irq(dev, irq, nvec); + index = msi_alloc_remapped_irq(dev, irq, nvec); if (index < 0) { ret = index; goto error; } } else { - ret = intr_msi_setup_irq(dev, irq, index, sub_handle); + ret = msi_setup_remapped_irq(dev, irq, index, + sub_handle); if (ret < 0) goto error; } @@ -3332,8 +3333,8 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) struct msi_msg msg; int ret; - if (intr_remapping_enabled) { - if (!intr_setup_hpet_msi(irq, id)) + if (irq_remapping_enabled) { + if (!setup_hpet_msi_remapped(irq, id)) return -1; } @@ -3712,8 +3713,8 @@ void __init setup_ioapic_dest(void) else mask = apic->target_cpus(); - if (intr_remapping_enabled) - intr_set_affinity(idata, mask, false); + if (irq_remapping_enabled) + set_remapped_irq_affinity(idata, mask, false); else ioapic_set_affinity(idata, mask, false); } diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 647e366403dc..ee74f698eef8 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -556,7 +556,7 @@ int __init detect_intel_iommu(void) dmar = (struct acpi_table_dmar *) dmar_tbl; - if (ret && intr_remapping_enabled && cpu_has_x2apic && + if (ret && irq_remapping_enabled && cpu_has_x2apic && dmar->flags & 0x1) printk(KERN_INFO "Queued invalidation will be enabled to support x2apic and Intr-remapping.\n"); @@ -1042,7 +1042,7 @@ static const char *dma_remap_fault_reasons[] = "non-zero reserved fields in PTE", }; -static const char *intr_remap_fault_reasons[] = +static const char *irq_remap_fault_reasons[] = { "Detected reserved fields in the decoded interrupt-remapped request", "Interrupt index exceeded the interrupt-remapping table size", @@ -1058,9 +1058,9 @@ static const char *intr_remap_fault_reasons[] = 
const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type) { if (fault_reason >= 0x20 && (fault_reason <= 0x20 + - ARRAY_SIZE(intr_remap_fault_reasons))) { + ARRAY_SIZE(irq_remap_fault_reasons))) { *fault_type = INTR_REMAP; - return intr_remap_fault_reasons[fault_reason - 0x20]; + return irq_remap_fault_reasons[fault_reason - 0x20]; } else if (fault_reason < ARRAY_SIZE(dma_remap_fault_reasons)) { *fault_type = DMA_REMAP; return dma_remap_fault_reasons[fault_reason]; diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index e1439808192c..cef5b8226f3d 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -4083,7 +4083,7 @@ static int intel_iommu_domain_has_cap(struct iommu_domain *domain, if (cap == IOMMU_CAP_CACHE_COHERENCY) return dmar_domain->iommu_snooping; if (cap == IOMMU_CAP_INTR_REMAP) - return intr_remapping_enabled; + return irq_remapping_enabled; return 0; } diff --git a/drivers/iommu/intel_intr_remapping.c b/drivers/iommu/intel_intr_remapping.c index 7472634df350..efeb601c782f 100644 --- a/drivers/iommu/intel_intr_remapping.c +++ b/drivers/iommu/intel_intr_remapping.c @@ -394,7 +394,7 @@ static int set_msi_sid(struct irte *irte, struct pci_dev *dev) return 0; } -static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode) +static void iommu_set_irq_remapping(struct intel_iommu *iommu, int mode) { u64 addr; u32 sts; @@ -434,7 +434,7 @@ static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode) } -static int intel_setup_intr_remapping(struct intel_iommu *iommu, int mode) +static int intel_setup_irq_remapping(struct intel_iommu *iommu, int mode) { struct ir_table *ir_table; struct page *pages; @@ -457,14 +457,14 @@ static int intel_setup_intr_remapping(struct intel_iommu *iommu, int mode) ir_table->base = page_address(pages); - iommu_set_intr_remapping(iommu, mode); + iommu_set_irq_remapping(iommu, mode); return 0; } /* * Disable Interrupt Remapping. 
*/ -static void iommu_disable_intr_remapping(struct intel_iommu *iommu) +static void iommu_disable_irq_remapping(struct intel_iommu *iommu) { unsigned long flags; u32 sts; @@ -503,11 +503,11 @@ static int __init dmar_x2apic_optout(void) return dmar->flags & DMAR_X2APIC_OPT_OUT; } -static int __init intel_intr_remapping_supported(void) +static int __init intel_irq_remapping_supported(void) { struct dmar_drhd_unit *drhd; - if (disable_intremap) + if (disable_irq_remap) return 0; if (!dmar_ir_support()) @@ -523,7 +523,7 @@ static int __init intel_intr_remapping_supported(void) return 1; } -static int __init intel_enable_intr_remapping(void) +static int __init intel_enable_irq_remapping(void) { struct dmar_drhd_unit *drhd; int setup = 0; @@ -561,7 +561,7 @@ static int __init intel_enable_intr_remapping(void) * Disable intr remapping and queued invalidation, if already * enabled prior to OS handover. */ - iommu_disable_intr_remapping(iommu); + iommu_disable_irq_remapping(iommu); dmar_disable_qi(iommu); } @@ -607,7 +607,7 @@ static int __init intel_enable_intr_remapping(void) if (!ecap_ir_support(iommu->ecap)) continue; - if (intel_setup_intr_remapping(iommu, eim)) + if (intel_setup_irq_remapping(iommu, eim)) goto error; setup = 1; @@ -616,7 +616,7 @@ static int __init intel_enable_intr_remapping(void) if (!setup) goto error; - intr_remapping_enabled = 1; + irq_remapping_enabled = 1; pr_info("Enabled IRQ remapping in %s mode\n", eim ? "x2apic" : "xapic"); return eim ? 
IRQ_REMAP_X2APIC_MODE : IRQ_REMAP_XAPIC_MODE; @@ -759,14 +759,14 @@ int __init parse_ioapics_under_ir(void) int __init ir_dev_scope_init(void) { - if (!intr_remapping_enabled) + if (!irq_remapping_enabled) return 0; return dmar_dev_scope_init(); } rootfs_initcall(ir_dev_scope_init); -static void disable_intr_remapping(void) +static void disable_irq_remapping(void) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu = NULL; @@ -778,11 +778,11 @@ static void disable_intr_remapping(void) if (!ecap_ir_support(iommu->ecap)) continue; - iommu_disable_intr_remapping(iommu); + iommu_disable_irq_remapping(iommu); } } -static int reenable_intr_remapping(int eim) +static int reenable_irq_remapping(int eim) { struct dmar_drhd_unit *drhd; int setup = 0; @@ -800,7 +800,7 @@ static int reenable_intr_remapping(int eim) continue; /* Set up interrupt remapping for iommu.*/ - iommu_set_intr_remapping(iommu, eim); + iommu_set_irq_remapping(iommu, eim); setup = 1; } @@ -1049,11 +1049,11 @@ static int intel_setup_hpet_msi(unsigned int irq, unsigned int id) } struct irq_remap_ops intel_irq_remap_ops = { - .supported = intel_intr_remapping_supported, - .hardware_init = dmar_table_init, - .hardware_enable = intel_enable_intr_remapping, - .hardware_disable = disable_intr_remapping, - .hardware_reenable = reenable_intr_remapping, + .supported = intel_irq_remapping_supported, + .prepare = dmar_table_init, + .enable = intel_enable_irq_remapping, + .disable = disable_irq_remapping, + .reenable = reenable_irq_remapping, .enable_faulting = enable_drhd_fault_handling, .setup_ioapic_entry = intel_setup_ioapic_entry, .set_affinity = intel_ioapic_set_affinity, diff --git a/drivers/iommu/intr_remapping.c b/drivers/iommu/intr_remapping.c index 9dc179316ba1..523a7b3a1205 100644 --- a/drivers/iommu/intr_remapping.c +++ b/drivers/iommu/intr_remapping.c @@ -4,9 +4,9 @@ #include "intr_remapping.h" -int intr_remapping_enabled; +int irq_remapping_enabled; -int disable_intremap; +int disable_irq_remap; 
int disable_sourceid_checking; int no_x2apic_optout; @@ -14,21 +14,21 @@ static struct irq_remap_ops *remap_ops; static __init int setup_nointremap(char *str) { - disable_intremap = 1; + disable_irq_remap = 1; return 0; } early_param("nointremap", setup_nointremap); -static __init int setup_intremap(char *str) +static __init int setup_irqremap(char *str) { if (!str) return -EINVAL; while (*str) { if (!strncmp(str, "on", 2)) - disable_intremap = 0; + disable_irq_remap = 0; else if (!strncmp(str, "off", 3)) - disable_intremap = 1; + disable_irq_remap = 1; else if (!strncmp(str, "nosid", 5)) disable_sourceid_checking = 1; else if (!strncmp(str, "no_x2apic_optout", 16)) @@ -41,16 +41,16 @@ static __init int setup_intremap(char *str) return 0; } -early_param("intremap", setup_intremap); +early_param("intremap", setup_irqremap); -void __init setup_intr_remapping(void) +void __init setup_irq_remapping_ops(void) { remap_ops = &intel_irq_remap_ops; } -int intr_remapping_supported(void) +int irq_remapping_supported(void) { - if (disable_intremap) + if (disable_irq_remap) return 0; if (!remap_ops || !remap_ops->supported) @@ -59,39 +59,39 @@ int intr_remapping_supported(void) return remap_ops->supported(); } -int __init intr_hardware_init(void) +int __init irq_remapping_prepare(void) { - if (!remap_ops || !remap_ops->hardware_init) + if (!remap_ops || !remap_ops->prepare) return -ENODEV; - return remap_ops->hardware_init(); + return remap_ops->prepare(); } -int __init intr_hardware_enable(void) +int __init irq_remapping_enable(void) { - if (!remap_ops || !remap_ops->hardware_enable) + if (!remap_ops || !remap_ops->enable) return -ENODEV; - return remap_ops->hardware_enable(); + return remap_ops->enable(); } -void intr_hardware_disable(void) +void irq_remapping_disable(void) { - if (!remap_ops || !remap_ops->hardware_disable) + if (!remap_ops || !remap_ops->disable) return; - remap_ops->hardware_disable(); + remap_ops->disable(); } -int intr_hardware_reenable(int mode) +int 
irq_remapping_reenable(int mode) { - if (!remap_ops || !remap_ops->hardware_reenable) + if (!remap_ops || !remap_ops->reenable) return 0; - return remap_ops->hardware_reenable(mode); + return remap_ops->reenable(mode); } -int __init intr_enable_fault_handling(void) +int __init irq_remap_enable_fault_handling(void) { if (!remap_ops || !remap_ops->enable_faulting) return -ENODEV; @@ -99,10 +99,10 @@ int __init intr_enable_fault_handling(void) return remap_ops->enable_faulting(); } -int intr_setup_ioapic_entry(int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr) +int setup_ioapic_remapped_entry(int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, int vector, + struct io_apic_irq_attr *attr) { if (!remap_ops || !remap_ops->setup_ioapic_entry) return -ENODEV; @@ -111,8 +111,8 @@ int intr_setup_ioapic_entry(int irq, vector, attr); } -int intr_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) +int set_remapped_irq_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) { if (!remap_ops || !remap_ops->set_affinity) return 0; @@ -120,7 +120,7 @@ int intr_set_affinity(struct irq_data *data, const struct cpumask *mask, return remap_ops->set_affinity(data, mask, force); } -void intr_free_irq(int irq) +void free_remapped_irq(int irq) { if (!remap_ops || !remap_ops->free_irq) return; @@ -128,9 +128,9 @@ void intr_free_irq(int irq) remap_ops->free_irq(irq); } -void intr_compose_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id) +void compose_remapped_msi_msg(struct pci_dev *pdev, + unsigned int irq, unsigned int dest, + struct msi_msg *msg, u8 hpet_id) { if (!remap_ops || !remap_ops->compose_msi_msg) return; @@ -138,7 +138,7 @@ void intr_compose_msi_msg(struct pci_dev *pdev, remap_ops->compose_msi_msg(pdev, irq, dest, msg, hpet_id); } -int intr_msi_alloc_irq(struct pci_dev *pdev, int irq, int 
nvec) +int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec) { if (!remap_ops || !remap_ops->msi_alloc_irq) return -ENODEV; @@ -146,8 +146,8 @@ int intr_msi_alloc_irq(struct pci_dev *pdev, int irq, int nvec) return remap_ops->msi_alloc_irq(pdev, irq, nvec); } -int intr_msi_setup_irq(struct pci_dev *pdev, unsigned int irq, - int index, int sub_handle) +int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq, + int index, int sub_handle) { if (!remap_ops || !remap_ops->msi_setup_irq) return -ENODEV; @@ -155,7 +155,7 @@ int intr_msi_setup_irq(struct pci_dev *pdev, unsigned int irq, return remap_ops->msi_setup_irq(pdev, irq, index, sub_handle); } -int intr_setup_hpet_msi(unsigned int irq, unsigned int id) +int setup_hpet_msi_remapped(unsigned int irq, unsigned int id) { if (!remap_ops || !remap_ops->setup_hpet_msi) return -ENODEV; diff --git a/drivers/iommu/intr_remapping.h b/drivers/iommu/intr_remapping.h index 6f4ea0a387b1..bd5d98fec148 100644 --- a/drivers/iommu/intr_remapping.h +++ b/drivers/iommu/intr_remapping.h @@ -31,7 +31,7 @@ struct cpumask; struct pci_dev; struct msi_msg; -extern int disable_intremap; +extern int disable_irq_remap; extern int disable_sourceid_checking; extern int no_x2apic_optout; @@ -40,16 +40,16 @@ struct irq_remap_ops { int (*supported)(void); /* Initializes hardware and makes it ready for remapping interrupts */ - int (*hardware_init)(void); + int (*prepare)(void); /* Enables the remapping hardware */ - int (*hardware_enable)(void); + int (*enable)(void); /* Disables the remapping hardware */ - void (*hardware_disable)(void); + void (*disable)(void); /* Reenables the remapping hardware */ - int (*hardware_reenable)(int); + int (*reenable)(int); /* Enable fault handling */ int (*enable_faulting)(void); -- cgit v1.2.3 From 8a8f422d3b4f2cde8e0e1d31638279a26a886a82 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 30 Mar 2012 11:47:08 -0700 Subject: iommu: rename intr_remapping.[ch] to irq_remapping.[ch] Make 
the file names consistent with the naming conventions of irq subsystem. Signed-off-by: Suresh Siddha Cc: Joerg Roedel Cc: Yinghai Lu Cc: David Woodhouse Cc: Alex Williamson Signed-off-by: Joerg Roedel --- arch/ia64/include/asm/intr_remapping.h | 4 - arch/ia64/include/asm/irq_remapping.h | 4 + arch/x86/include/asm/intr_remapping.h | 103 --- arch/x86/include/asm/irq_remapping.h | 103 +++ arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/io_apic.c | 2 +- drivers/iommu/Makefile | 2 +- drivers/iommu/dmar.c | 2 +- drivers/iommu/intel-iommu.c | 2 +- drivers/iommu/intel_intr_remapping.c | 1065 -------------------------------- drivers/iommu/intel_irq_remapping.c | 1065 ++++++++++++++++++++++++++++++++ drivers/iommu/intr_remapping.c | 164 ----- drivers/iommu/intr_remapping.h | 88 --- drivers/iommu/irq_remapping.c | 164 +++++ drivers/iommu/irq_remapping.h | 88 +++ 15 files changed, 1429 insertions(+), 1429 deletions(-) delete mode 100644 arch/ia64/include/asm/intr_remapping.h create mode 100644 arch/ia64/include/asm/irq_remapping.h delete mode 100644 arch/x86/include/asm/intr_remapping.h create mode 100644 arch/x86/include/asm/irq_remapping.h delete mode 100644 drivers/iommu/intel_intr_remapping.c create mode 100644 drivers/iommu/intel_irq_remapping.c delete mode 100644 drivers/iommu/intr_remapping.c delete mode 100644 drivers/iommu/intr_remapping.h create mode 100644 drivers/iommu/irq_remapping.c create mode 100644 drivers/iommu/irq_remapping.h (limited to 'arch/x86') diff --git a/arch/ia64/include/asm/intr_remapping.h b/arch/ia64/include/asm/intr_remapping.h deleted file mode 100644 index a8687b1d8906..000000000000 --- a/arch/ia64/include/asm/intr_remapping.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef __IA64_INTR_REMAPPING_H -#define __IA64_INTR_REMAPPING_H -#define irq_remapping_enabled 0 -#endif diff --git a/arch/ia64/include/asm/irq_remapping.h b/arch/ia64/include/asm/irq_remapping.h new file mode 100644 index 000000000000..a8687b1d8906 --- /dev/null +++ 
b/arch/ia64/include/asm/irq_remapping.h @@ -0,0 +1,4 @@ +#ifndef __IA64_INTR_REMAPPING_H +#define __IA64_INTR_REMAPPING_H +#define irq_remapping_enabled 0 +#endif diff --git a/arch/x86/include/asm/intr_remapping.h b/arch/x86/include/asm/intr_remapping.h deleted file mode 100644 index f9cbbcb2956e..000000000000 --- a/arch/x86/include/asm/intr_remapping.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (C) 2012 Advanced Micro Devices, Inc. - * Author: Joerg Roedel - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * This header file contains the interface of the interrupt remapping code to - * the x86 interrupt management code. 
- */ - -#ifndef __X86_INTR_REMAPPING_H -#define __X86_INTR_REMAPPING_H - -#ifdef CONFIG_IRQ_REMAP - -struct IO_APIC_route_entry; -struct io_apic_irq_attr; -struct pci_dev; - -extern int irq_remapping_enabled; - -extern void setup_irq_remapping_ops(void); -extern int irq_remapping_supported(void); -extern int irq_remapping_prepare(void); -extern int irq_remapping_enable(void); -extern void irq_remapping_disable(void); -extern int irq_remapping_reenable(int); -extern int irq_remap_enable_fault_handling(void); -extern int setup_ioapic_remapped_entry(int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, - int vector, - struct io_apic_irq_attr *attr); -extern int set_remapped_irq_affinity(struct irq_data *data, - const struct cpumask *mask, - bool force); -extern void free_remapped_irq(int irq); -extern void compose_remapped_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id); -extern int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec); -extern int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq, - int index, int sub_handle); -extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id); - -#else /* CONFIG_IRQ_REMAP */ - -#define irq_remapping_enabled 0 - -static inline void setup_irq_remapping_ops(void) { } -static inline int irq_remapping_supported(void) { return 0; } -static inline int irq_remapping_prepare(void) { return -ENODEV; } -static inline int irq_remapping_enable(void) { return -ENODEV; } -static inline void irq_remapping_disable(void) { } -static inline int irq_remapping_reenable(int eim) { return -ENODEV; } -static inline int irq_remap_enable_fault_handling(void) { return -ENODEV; } -static inline int setup_ioapic_remapped_entry(int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, - int vector, - struct io_apic_irq_attr *attr) -{ - return -ENODEV; -} -static inline int set_remapped_irq_affinity(struct irq_data *data, - const 
struct cpumask *mask, - bool force) -{ - return 0; -} -static inline void free_remapped_irq(int irq) { } -static inline void compose_remapped_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id) -{ -} -static inline int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec) -{ - return -ENODEV; -} -static inline int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq, - int index, int sub_handle) -{ - return -ENODEV; -} -static inline int setup_hpet_msi_remapped(unsigned int irq, unsigned int id) -{ - return -ENODEV; -} -#endif /* CONFIG_IRQ_REMAP */ - -#endif /* __X86_INTR_REMAPPING_H */ diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h new file mode 100644 index 000000000000..dcb0c7231028 --- /dev/null +++ b/arch/x86/include/asm/irq_remapping.h @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2012 Advanced Micro Devices, Inc. + * Author: Joerg Roedel + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * This header file contains the interface of the interrupt remapping code to + * the x86 interrupt management code. 
+ */ + +#ifndef __X86_IRQ_REMAPPING_H +#define __X86_IRQ_REMAPPING_H + +#ifdef CONFIG_IRQ_REMAP + +struct IO_APIC_route_entry; +struct io_apic_irq_attr; +struct pci_dev; + +extern int irq_remapping_enabled; + +extern void setup_irq_remapping_ops(void); +extern int irq_remapping_supported(void); +extern int irq_remapping_prepare(void); +extern int irq_remapping_enable(void); +extern void irq_remapping_disable(void); +extern int irq_remapping_reenable(int); +extern int irq_remap_enable_fault_handling(void); +extern int setup_ioapic_remapped_entry(int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, + int vector, + struct io_apic_irq_attr *attr); +extern int set_remapped_irq_affinity(struct irq_data *data, + const struct cpumask *mask, + bool force); +extern void free_remapped_irq(int irq); +extern void compose_remapped_msi_msg(struct pci_dev *pdev, + unsigned int irq, unsigned int dest, + struct msi_msg *msg, u8 hpet_id); +extern int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec); +extern int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq, + int index, int sub_handle); +extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id); + +#else /* CONFIG_IRQ_REMAP */ + +#define irq_remapping_enabled 0 + +static inline void setup_irq_remapping_ops(void) { } +static inline int irq_remapping_supported(void) { return 0; } +static inline int irq_remapping_prepare(void) { return -ENODEV; } +static inline int irq_remapping_enable(void) { return -ENODEV; } +static inline void irq_remapping_disable(void) { } +static inline int irq_remapping_reenable(int eim) { return -ENODEV; } +static inline int irq_remap_enable_fault_handling(void) { return -ENODEV; } +static inline int setup_ioapic_remapped_entry(int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, + int vector, + struct io_apic_irq_attr *attr) +{ + return -ENODEV; +} +static inline int set_remapped_irq_affinity(struct irq_data *data, + const 
struct cpumask *mask, + bool force) +{ + return 0; +} +static inline void free_remapped_irq(int irq) { } +static inline void compose_remapped_msi_msg(struct pci_dev *pdev, + unsigned int irq, unsigned int dest, + struct msi_msg *msg, u8 hpet_id) +{ +} +static inline int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec) +{ + return -ENODEV; +} +static inline int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq, + int index, int sub_handle) +{ + return -ENODEV; +} +static inline int setup_hpet_msi_remapped(unsigned int irq, unsigned int id) +{ + return -ENODEV; +} +#endif /* CONFIG_IRQ_REMAP */ + +#endif /* __X86_IRQ_REMAPPING_H */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index c02c666c4628..3722179a49db 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -35,7 +35,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index abbbcd4d1d71..ef0648cd7084 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -57,7 +57,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 823e1cf8708f..3e5e82ae9f0d 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o obj-$(CONFIG_DMAR_TABLE) += dmar.o obj-$(CONFIG_INTEL_IOMMU) += iova.o intel-iommu.o -obj-$(CONFIG_IRQ_REMAP) += intel_intr_remapping.o intr_remapping.o +obj-$(CONFIG_IRQ_REMAP) += intel_irq_remapping.o irq_remapping.o obj-$(CONFIG_OMAP_IOMMU) += omap-iommu.o obj-$(CONFIG_OMAP_IOVMM) += omap-iovmm.o obj-$(CONFIG_OMAP_IOMMU_DEBUG) += omap-iommu-debug.o diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index ee74f698eef8..5ef65cf66152 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -36,7 +36,7 @@ 
#include #include #include -#include +#include #include #define PREFIX "DMAR: " diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index cef5b8226f3d..bf2fbaad5e22 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -42,7 +42,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/iommu/intel_intr_remapping.c b/drivers/iommu/intel_intr_remapping.c deleted file mode 100644 index efeb601c782f..000000000000 --- a/drivers/iommu/intel_intr_remapping.c +++ /dev/null @@ -1,1065 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "intr_remapping.h" - -struct ioapic_scope { - struct intel_iommu *iommu; - unsigned int id; - unsigned int bus; /* PCI bus number */ - unsigned int devfn; /* PCI devfn number */ -}; - -struct hpet_scope { - struct intel_iommu *iommu; - u8 id; - unsigned int bus; - unsigned int devfn; -}; - -#define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0) -#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8) - -static struct ioapic_scope ir_ioapic[MAX_IO_APICS]; -static struct hpet_scope ir_hpet[MAX_HPET_TBS]; -static int ir_ioapic_num, ir_hpet_num; - -static DEFINE_RAW_SPINLOCK(irq_2_ir_lock); - -static struct irq_2_iommu *irq_2_iommu(unsigned int irq) -{ - struct irq_cfg *cfg = irq_get_chip_data(irq); - return cfg ? 
&cfg->irq_2_iommu : NULL; -} - -int get_irte(int irq, struct irte *entry) -{ - struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); - unsigned long flags; - int index; - - if (!entry || !irq_iommu) - return -1; - - raw_spin_lock_irqsave(&irq_2_ir_lock, flags); - - index = irq_iommu->irte_index + irq_iommu->sub_handle; - *entry = *(irq_iommu->iommu->ir_table->base + index); - - raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); - return 0; -} - -static int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) -{ - struct ir_table *table = iommu->ir_table; - struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); - u16 index, start_index; - unsigned int mask = 0; - unsigned long flags; - int i; - - if (!count || !irq_iommu) - return -1; - - /* - * start the IRTE search from index 0. - */ - index = start_index = 0; - - if (count > 1) { - count = __roundup_pow_of_two(count); - mask = ilog2(count); - } - - if (mask > ecap_max_handle_mask(iommu->ecap)) { - printk(KERN_ERR - "Requested mask %x exceeds the max invalidation handle" - " mask value %Lx\n", mask, - ecap_max_handle_mask(iommu->ecap)); - return -1; - } - - raw_spin_lock_irqsave(&irq_2_ir_lock, flags); - do { - for (i = index; i < index + count; i++) - if (table->base[i].present) - break; - /* empty index found */ - if (i == index + count) - break; - - index = (index + count) % INTR_REMAP_TABLE_ENTRIES; - - if (index == start_index) { - raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); - printk(KERN_ERR "can't allocate an IRTE\n"); - return -1; - } - } while (1); - - for (i = index; i < index + count; i++) - table->base[i].present = 1; - - irq_iommu->iommu = iommu; - irq_iommu->irte_index = index; - irq_iommu->sub_handle = 0; - irq_iommu->irte_mask = mask; - - raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); - - return index; -} - -static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask) -{ - struct qi_desc desc; - - desc.low = QI_IEC_IIDEX(index) | QI_IEC_TYPE | QI_IEC_IM(mask) - | QI_IEC_SELECTIVE; 
- desc.high = 0; - - return qi_submit_sync(&desc, iommu); -} - -static int map_irq_to_irte_handle(int irq, u16 *sub_handle) -{ - struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); - unsigned long flags; - int index; - - if (!irq_iommu) - return -1; - - raw_spin_lock_irqsave(&irq_2_ir_lock, flags); - *sub_handle = irq_iommu->sub_handle; - index = irq_iommu->irte_index; - raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); - return index; -} - -static int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle) -{ - struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); - unsigned long flags; - - if (!irq_iommu) - return -1; - - raw_spin_lock_irqsave(&irq_2_ir_lock, flags); - - irq_iommu->iommu = iommu; - irq_iommu->irte_index = index; - irq_iommu->sub_handle = subhandle; - irq_iommu->irte_mask = 0; - - raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); - - return 0; -} - -static int modify_irte(int irq, struct irte *irte_modified) -{ - struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); - struct intel_iommu *iommu; - unsigned long flags; - struct irte *irte; - int rc, index; - - if (!irq_iommu) - return -1; - - raw_spin_lock_irqsave(&irq_2_ir_lock, flags); - - iommu = irq_iommu->iommu; - - index = irq_iommu->irte_index + irq_iommu->sub_handle; - irte = &iommu->ir_table->base[index]; - - set_64bit(&irte->low, irte_modified->low); - set_64bit(&irte->high, irte_modified->high); - __iommu_flush_cache(iommu, irte, sizeof(*irte)); - - rc = qi_flush_iec(iommu, index, 0); - raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); - - return rc; -} - -static struct intel_iommu *map_hpet_to_ir(u8 hpet_id) -{ - int i; - - for (i = 0; i < MAX_HPET_TBS; i++) - if (ir_hpet[i].id == hpet_id) - return ir_hpet[i].iommu; - return NULL; -} - -static struct intel_iommu *map_ioapic_to_ir(int apic) -{ - int i; - - for (i = 0; i < MAX_IO_APICS; i++) - if (ir_ioapic[i].id == apic) - return ir_ioapic[i].iommu; - return NULL; -} - -static struct intel_iommu *map_dev_to_ir(struct pci_dev 
*dev) -{ - struct dmar_drhd_unit *drhd; - - drhd = dmar_find_matched_drhd_unit(dev); - if (!drhd) - return NULL; - - return drhd->iommu; -} - -static int clear_entries(struct irq_2_iommu *irq_iommu) -{ - struct irte *start, *entry, *end; - struct intel_iommu *iommu; - int index; - - if (irq_iommu->sub_handle) - return 0; - - iommu = irq_iommu->iommu; - index = irq_iommu->irte_index + irq_iommu->sub_handle; - - start = iommu->ir_table->base + index; - end = start + (1 << irq_iommu->irte_mask); - - for (entry = start; entry < end; entry++) { - set_64bit(&entry->low, 0); - set_64bit(&entry->high, 0); - } - - return qi_flush_iec(iommu, index, irq_iommu->irte_mask); -} - -static int free_irte(int irq) -{ - struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); - unsigned long flags; - int rc; - - if (!irq_iommu) - return -1; - - raw_spin_lock_irqsave(&irq_2_ir_lock, flags); - - rc = clear_entries(irq_iommu); - - irq_iommu->iommu = NULL; - irq_iommu->irte_index = 0; - irq_iommu->sub_handle = 0; - irq_iommu->irte_mask = 0; - - raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); - - return rc; -} - -/* - * source validation type - */ -#define SVT_NO_VERIFY 0x0 /* no verification is required */ -#define SVT_VERIFY_SID_SQ 0x1 /* verify using SID and SQ fields */ -#define SVT_VERIFY_BUS 0x2 /* verify bus of request-id */ - -/* - * source-id qualifier - */ -#define SQ_ALL_16 0x0 /* verify all 16 bits of request-id */ -#define SQ_13_IGNORE_1 0x1 /* verify most significant 13 bits, ignore - * the third least significant bit - */ -#define SQ_13_IGNORE_2 0x2 /* verify most significant 13 bits, ignore - * the second and third least significant bits - */ -#define SQ_13_IGNORE_3 0x3 /* verify most significant 13 bits, ignore - * the least three significant bits - */ - -/* - * set SVT, SQ and SID fields of irte to verify - * source ids of interrupt requests - */ -static void set_irte_sid(struct irte *irte, unsigned int svt, - unsigned int sq, unsigned int sid) -{ - if 
(disable_sourceid_checking) - svt = SVT_NO_VERIFY; - irte->svt = svt; - irte->sq = sq; - irte->sid = sid; -} - -static int set_ioapic_sid(struct irte *irte, int apic) -{ - int i; - u16 sid = 0; - - if (!irte) - return -1; - - for (i = 0; i < MAX_IO_APICS; i++) { - if (ir_ioapic[i].id == apic) { - sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn; - break; - } - } - - if (sid == 0) { - pr_warning("Failed to set source-id of IOAPIC (%d)\n", apic); - return -1; - } - - set_irte_sid(irte, 1, 0, sid); - - return 0; -} - -static int set_hpet_sid(struct irte *irte, u8 id) -{ - int i; - u16 sid = 0; - - if (!irte) - return -1; - - for (i = 0; i < MAX_HPET_TBS; i++) { - if (ir_hpet[i].id == id) { - sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn; - break; - } - } - - if (sid == 0) { - pr_warning("Failed to set source-id of HPET block (%d)\n", id); - return -1; - } - - /* - * Should really use SQ_ALL_16. Some platforms are broken. - * While we figure out the right quirks for these broken platforms, use - * SQ_13_IGNORE_3 for now. 
- */ - set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_13_IGNORE_3, sid); - - return 0; -} - -static int set_msi_sid(struct irte *irte, struct pci_dev *dev) -{ - struct pci_dev *bridge; - - if (!irte || !dev) - return -1; - - /* PCIe device or Root Complex integrated PCI device */ - if (pci_is_pcie(dev) || !dev->bus->parent) { - set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16, - (dev->bus->number << 8) | dev->devfn); - return 0; - } - - bridge = pci_find_upstream_pcie_bridge(dev); - if (bridge) { - if (pci_is_pcie(bridge))/* this is a PCIe-to-PCI/PCIX bridge */ - set_irte_sid(irte, SVT_VERIFY_BUS, SQ_ALL_16, - (bridge->bus->number << 8) | dev->bus->number); - else /* this is a legacy PCI bridge */ - set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16, - (bridge->bus->number << 8) | bridge->devfn); - } - - return 0; -} - -static void iommu_set_irq_remapping(struct intel_iommu *iommu, int mode) -{ - u64 addr; - u32 sts; - unsigned long flags; - - addr = virt_to_phys((void *)iommu->ir_table->base); - - raw_spin_lock_irqsave(&iommu->register_lock, flags); - - dmar_writeq(iommu->reg + DMAR_IRTA_REG, - (addr) | IR_X2APIC_MODE(mode) | INTR_REMAP_TABLE_REG_SIZE); - - /* Set interrupt-remapping table pointer */ - iommu->gcmd |= DMA_GCMD_SIRTP; - writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); - - IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, - readl, (sts & DMA_GSTS_IRTPS), sts); - raw_spin_unlock_irqrestore(&iommu->register_lock, flags); - - /* - * global invalidation of interrupt entry cache before enabling - * interrupt-remapping. 
- */ - qi_global_iec(iommu); - - raw_spin_lock_irqsave(&iommu->register_lock, flags); - - /* Enable interrupt-remapping */ - iommu->gcmd |= DMA_GCMD_IRE; - writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); - - IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, - readl, (sts & DMA_GSTS_IRES), sts); - - raw_spin_unlock_irqrestore(&iommu->register_lock, flags); -} - - -static int intel_setup_irq_remapping(struct intel_iommu *iommu, int mode) -{ - struct ir_table *ir_table; - struct page *pages; - - ir_table = iommu->ir_table = kzalloc(sizeof(struct ir_table), - GFP_ATOMIC); - - if (!iommu->ir_table) - return -ENOMEM; - - pages = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO, - INTR_REMAP_PAGE_ORDER); - - if (!pages) { - printk(KERN_ERR "failed to allocate pages of order %d\n", - INTR_REMAP_PAGE_ORDER); - kfree(iommu->ir_table); - return -ENOMEM; - } - - ir_table->base = page_address(pages); - - iommu_set_irq_remapping(iommu, mode); - return 0; -} - -/* - * Disable Interrupt Remapping. - */ -static void iommu_disable_irq_remapping(struct intel_iommu *iommu) -{ - unsigned long flags; - u32 sts; - - if (!ecap_ir_support(iommu->ecap)) - return; - - /* - * global invalidation of interrupt entry cache before disabling - * interrupt-remapping. 
- */ - qi_global_iec(iommu); - - raw_spin_lock_irqsave(&iommu->register_lock, flags); - - sts = dmar_readq(iommu->reg + DMAR_GSTS_REG); - if (!(sts & DMA_GSTS_IRES)) - goto end; - - iommu->gcmd &= ~DMA_GCMD_IRE; - writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); - - IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, - readl, !(sts & DMA_GSTS_IRES), sts); - -end: - raw_spin_unlock_irqrestore(&iommu->register_lock, flags); -} - -static int __init dmar_x2apic_optout(void) -{ - struct acpi_table_dmar *dmar; - dmar = (struct acpi_table_dmar *)dmar_tbl; - if (!dmar || no_x2apic_optout) - return 0; - return dmar->flags & DMAR_X2APIC_OPT_OUT; -} - -static int __init intel_irq_remapping_supported(void) -{ - struct dmar_drhd_unit *drhd; - - if (disable_irq_remap) - return 0; - - if (!dmar_ir_support()) - return 0; - - for_each_drhd_unit(drhd) { - struct intel_iommu *iommu = drhd->iommu; - - if (!ecap_ir_support(iommu->ecap)) - return 0; - } - - return 1; -} - -static int __init intel_enable_irq_remapping(void) -{ - struct dmar_drhd_unit *drhd; - int setup = 0; - int eim = 0; - - if (parse_ioapics_under_ir() != 1) { - printk(KERN_INFO "Not enable interrupt remapping\n"); - return -1; - } - - if (x2apic_supported()) { - eim = !dmar_x2apic_optout(); - WARN(!eim, KERN_WARNING - "Your BIOS is broken and requested that x2apic be disabled\n" - "This will leave your machine vulnerable to irq-injection attacks\n" - "Use 'intremap=no_x2apic_optout' to override BIOS request\n"); - } - - for_each_drhd_unit(drhd) { - struct intel_iommu *iommu = drhd->iommu; - - /* - * If the queued invalidation is already initialized, - * shouldn't disable it. - */ - if (iommu->qi) - continue; - - /* - * Clear previous faults. - */ - dmar_fault(-1, iommu); - - /* - * Disable intr remapping and queued invalidation, if already - * enabled prior to OS handover. 
- */ - iommu_disable_irq_remapping(iommu); - - dmar_disable_qi(iommu); - } - - /* - * check for the Interrupt-remapping support - */ - for_each_drhd_unit(drhd) { - struct intel_iommu *iommu = drhd->iommu; - - if (!ecap_ir_support(iommu->ecap)) - continue; - - if (eim && !ecap_eim_support(iommu->ecap)) { - printk(KERN_INFO "DRHD %Lx: EIM not supported by DRHD, " - " ecap %Lx\n", drhd->reg_base_addr, iommu->ecap); - return -1; - } - } - - /* - * Enable queued invalidation for all the DRHD's. - */ - for_each_drhd_unit(drhd) { - int ret; - struct intel_iommu *iommu = drhd->iommu; - ret = dmar_enable_qi(iommu); - - if (ret) { - printk(KERN_ERR "DRHD %Lx: failed to enable queued, " - " invalidation, ecap %Lx, ret %d\n", - drhd->reg_base_addr, iommu->ecap, ret); - return -1; - } - } - - /* - * Setup Interrupt-remapping for all the DRHD's now. - */ - for_each_drhd_unit(drhd) { - struct intel_iommu *iommu = drhd->iommu; - - if (!ecap_ir_support(iommu->ecap)) - continue; - - if (intel_setup_irq_remapping(iommu, eim)) - goto error; - - setup = 1; - } - - if (!setup) - goto error; - - irq_remapping_enabled = 1; - pr_info("Enabled IRQ remapping in %s mode\n", eim ? "x2apic" : "xapic"); - - return eim ? IRQ_REMAP_X2APIC_MODE : IRQ_REMAP_XAPIC_MODE; - -error: - /* - * handle error condition gracefully here! - */ - return -1; -} - -static void ir_parse_one_hpet_scope(struct acpi_dmar_device_scope *scope, - struct intel_iommu *iommu) -{ - struct acpi_dmar_pci_path *path; - u8 bus; - int count; - - bus = scope->bus; - path = (struct acpi_dmar_pci_path *)(scope + 1); - count = (scope->length - sizeof(struct acpi_dmar_device_scope)) - / sizeof(struct acpi_dmar_pci_path); - - while (--count > 0) { - /* - * Access PCI directly due to the PCI - * subsystem isn't initialized yet. 
- */ - bus = read_pci_config_byte(bus, path->dev, path->fn, - PCI_SECONDARY_BUS); - path++; - } - ir_hpet[ir_hpet_num].bus = bus; - ir_hpet[ir_hpet_num].devfn = PCI_DEVFN(path->dev, path->fn); - ir_hpet[ir_hpet_num].iommu = iommu; - ir_hpet[ir_hpet_num].id = scope->enumeration_id; - ir_hpet_num++; -} - -static void ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope *scope, - struct intel_iommu *iommu) -{ - struct acpi_dmar_pci_path *path; - u8 bus; - int count; - - bus = scope->bus; - path = (struct acpi_dmar_pci_path *)(scope + 1); - count = (scope->length - sizeof(struct acpi_dmar_device_scope)) - / sizeof(struct acpi_dmar_pci_path); - - while (--count > 0) { - /* - * Access PCI directly due to the PCI - * subsystem isn't initialized yet. - */ - bus = read_pci_config_byte(bus, path->dev, path->fn, - PCI_SECONDARY_BUS); - path++; - } - - ir_ioapic[ir_ioapic_num].bus = bus; - ir_ioapic[ir_ioapic_num].devfn = PCI_DEVFN(path->dev, path->fn); - ir_ioapic[ir_ioapic_num].iommu = iommu; - ir_ioapic[ir_ioapic_num].id = scope->enumeration_id; - ir_ioapic_num++; -} - -static int ir_parse_ioapic_hpet_scope(struct acpi_dmar_header *header, - struct intel_iommu *iommu) -{ - struct acpi_dmar_hardware_unit *drhd; - struct acpi_dmar_device_scope *scope; - void *start, *end; - - drhd = (struct acpi_dmar_hardware_unit *)header; - - start = (void *)(drhd + 1); - end = ((void *)drhd) + header->length; - - while (start < end) { - scope = start; - if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_IOAPIC) { - if (ir_ioapic_num == MAX_IO_APICS) { - printk(KERN_WARNING "Exceeded Max IO APICS\n"); - return -1; - } - - printk(KERN_INFO "IOAPIC id %d under DRHD base " - " 0x%Lx IOMMU %d\n", scope->enumeration_id, - drhd->address, iommu->seq_id); - - ir_parse_one_ioapic_scope(scope, iommu); - } else if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_HPET) { - if (ir_hpet_num == MAX_HPET_TBS) { - printk(KERN_WARNING "Exceeded Max HPET blocks\n"); - return -1; - } - - printk(KERN_INFO "HPET id %d 
under DRHD base" - " 0x%Lx\n", scope->enumeration_id, - drhd->address); - - ir_parse_one_hpet_scope(scope, iommu); - } - start += scope->length; - } - - return 0; -} - -/* - * Finds the assocaition between IOAPIC's and its Interrupt-remapping - * hardware unit. - */ -int __init parse_ioapics_under_ir(void) -{ - struct dmar_drhd_unit *drhd; - int ir_supported = 0; - - for_each_drhd_unit(drhd) { - struct intel_iommu *iommu = drhd->iommu; - - if (ecap_ir_support(iommu->ecap)) { - if (ir_parse_ioapic_hpet_scope(drhd->hdr, iommu)) - return -1; - - ir_supported = 1; - } - } - - if (ir_supported && ir_ioapic_num != nr_ioapics) { - printk(KERN_WARNING - "Not all IO-APIC's listed under remapping hardware\n"); - return -1; - } - - return ir_supported; -} - -int __init ir_dev_scope_init(void) -{ - if (!irq_remapping_enabled) - return 0; - - return dmar_dev_scope_init(); -} -rootfs_initcall(ir_dev_scope_init); - -static void disable_irq_remapping(void) -{ - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu = NULL; - - /* - * Disable Interrupt-remapping for all the DRHD's now. - */ - for_each_iommu(iommu, drhd) { - if (!ecap_ir_support(iommu->ecap)) - continue; - - iommu_disable_irq_remapping(iommu); - } -} - -static int reenable_irq_remapping(int eim) -{ - struct dmar_drhd_unit *drhd; - int setup = 0; - struct intel_iommu *iommu = NULL; - - for_each_iommu(iommu, drhd) - if (iommu->qi) - dmar_reenable_qi(iommu); - - /* - * Setup Interrupt-remapping for all the DRHD's now. - */ - for_each_iommu(iommu, drhd) { - if (!ecap_ir_support(iommu->ecap)) - continue; - - /* Set up interrupt remapping for iommu.*/ - iommu_set_irq_remapping(iommu, eim); - setup = 1; - } - - if (!setup) - goto error; - - return 0; - -error: - /* - * handle error condition gracefully here! 
- */ - return -1; -} - -static void prepare_irte(struct irte *irte, int vector, - unsigned int dest) -{ - memset(irte, 0, sizeof(*irte)); - - irte->present = 1; - irte->dst_mode = apic->irq_dest_mode; - /* - * Trigger mode in the IRTE will always be edge, and for IO-APIC, the - * actual level or edge trigger will be setup in the IO-APIC - * RTE. This will help simplify level triggered irq migration. - * For more details, see the comments (in io_apic.c) explainig IO-APIC - * irq migration in the presence of interrupt-remapping. - */ - irte->trigger_mode = 0; - irte->dlvry_mode = apic->irq_delivery_mode; - irte->vector = vector; - irte->dest_id = IRTE_DEST(dest); - irte->redir_hint = 1; -} - -static int intel_setup_ioapic_entry(int irq, - struct IO_APIC_route_entry *route_entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr) -{ - int ioapic_id = mpc_ioapic_id(attr->ioapic); - struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id); - struct IR_IO_APIC_route_entry *entry; - struct irte irte; - int index; - - if (!iommu) { - pr_warn("No mapping iommu for ioapic %d\n", ioapic_id); - return -ENODEV; - } - - entry = (struct IR_IO_APIC_route_entry *)route_entry; - - index = alloc_irte(iommu, irq, 1); - if (index < 0) { - pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id); - return -ENOMEM; - } - - prepare_irte(&irte, vector, destination); - - /* Set source-id of interrupt request */ - set_ioapic_sid(&irte, ioapic_id); - - modify_irte(irq, &irte); - - apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: " - "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d " - "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X " - "Avail:%X Vector:%02X Dest:%08X " - "SID:%04X SQ:%X SVT:%X)\n", - attr->ioapic, irte.present, irte.fpd, irte.dst_mode, - irte.redir_hint, irte.trigger_mode, irte.dlvry_mode, - irte.avail, irte.vector, irte.dest_id, - irte.sid, irte.sq, irte.svt); - - memset(entry, 0, sizeof(*entry)); - - entry->index2 = (index >> 15) & 0x1; - entry->zero = 0; - 
entry->format = 1; - entry->index = (index & 0x7fff); - /* - * IO-APIC RTE will be configured with virtual vector. - * irq handler will do the explicit EOI to the io-apic. - */ - entry->vector = attr->ioapic_pin; - entry->mask = 0; /* enable IRQ */ - entry->trigger = attr->trigger; - entry->polarity = attr->polarity; - - /* Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. - */ - if (attr->trigger) - entry->mask = 1; - - return 0; -} - -/* - * Migrate the IO-APIC irq in the presence of intr-remapping. - * - * For both level and edge triggered, irq migration is a simple atomic - * update(of vector and cpu destination) of IRTE and flush the hardware cache. - * - * For level triggered, we eliminate the io-apic RTE modification (with the - * updated vector information), by using a virtual vector (io-apic pin number). - * Real vector that is used for interrupting cpu will be coming from - * the interrupt-remapping table entry. - * - * As the migration is a simple atomic update of IRTE, the same mechanism - * is used to migrate MSI irq's in the presence of interrupt-remapping. - */ -static int -intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - struct irq_cfg *cfg = data->chip_data; - unsigned int dest, irq = data->irq; - struct irte irte; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return -EINVAL; - - if (get_irte(irq, &irte)) - return -EBUSY; - - if (assign_irq_vector(irq, cfg, mask)) - return -EBUSY; - - dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); - - irte.vector = cfg->vector; - irte.dest_id = IRTE_DEST(dest); - - /* - * Atomically updates the IRTE with the new destination, vector - * and flushes the interrupt entry cache. - */ - modify_irte(irq, &irte); - - /* - * After this point, all the interrupts will start arriving - * at the new destination. So, time to cleanup the previous - * vector allocation. 
- */ - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - cpumask_copy(data->affinity, mask); - return 0; -} - -static void intel_compose_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id) -{ - struct irq_cfg *cfg; - struct irte irte; - u16 sub_handle; - int ir_index; - - cfg = irq_get_chip_data(irq); - - ir_index = map_irq_to_irte_handle(irq, &sub_handle); - BUG_ON(ir_index == -1); - - prepare_irte(&irte, cfg->vector, dest); - - /* Set source-id of interrupt request */ - if (pdev) - set_msi_sid(&irte, pdev); - else - set_hpet_sid(&irte, hpet_id); - - modify_irte(irq, &irte); - - msg->address_hi = MSI_ADDR_BASE_HI; - msg->data = sub_handle; - msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | - MSI_ADDR_IR_SHV | - MSI_ADDR_IR_INDEX1(ir_index) | - MSI_ADDR_IR_INDEX2(ir_index); -} - -/* - * Map the PCI dev to the corresponding remapping hardware unit - * and allocate 'nvec' consecutive interrupt-remapping table entries - * in it. - */ -static int intel_msi_alloc_irq(struct pci_dev *dev, int irq, int nvec) -{ - struct intel_iommu *iommu; - int index; - - iommu = map_dev_to_ir(dev); - if (!iommu) { - printk(KERN_ERR - "Unable to map PCI %s to iommu\n", pci_name(dev)); - return -ENOENT; - } - - index = alloc_irte(iommu, irq, nvec); - if (index < 0) { - printk(KERN_ERR - "Unable to allocate %d IRTE for PCI %s\n", nvec, - pci_name(dev)); - return -ENOSPC; - } - return index; -} - -static int intel_msi_setup_irq(struct pci_dev *pdev, unsigned int irq, - int index, int sub_handle) -{ - struct intel_iommu *iommu; - - iommu = map_dev_to_ir(pdev); - if (!iommu) - return -ENOENT; - /* - * setup the mapping between the irq and the IRTE - * base index, the sub_handle pointing to the - * appropriate interrupt remap table entry. 
- */ - set_irte_irq(irq, iommu, index, sub_handle); - - return 0; -} - -static int intel_setup_hpet_msi(unsigned int irq, unsigned int id) -{ - struct intel_iommu *iommu = map_hpet_to_ir(id); - int index; - - if (!iommu) - return -1; - - index = alloc_irte(iommu, irq, 1); - if (index < 0) - return -1; - - return 0; -} - -struct irq_remap_ops intel_irq_remap_ops = { - .supported = intel_irq_remapping_supported, - .prepare = dmar_table_init, - .enable = intel_enable_irq_remapping, - .disable = disable_irq_remapping, - .reenable = reenable_irq_remapping, - .enable_faulting = enable_drhd_fault_handling, - .setup_ioapic_entry = intel_setup_ioapic_entry, - .set_affinity = intel_ioapic_set_affinity, - .free_irq = free_irte, - .compose_msi_msg = intel_compose_msi_msg, - .msi_alloc_irq = intel_msi_alloc_irq, - .msi_setup_irq = intel_msi_setup_irq, - .setup_hpet_msi = intel_setup_hpet_msi, -}; diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c new file mode 100644 index 000000000000..b4d39507681a --- /dev/null +++ b/drivers/iommu/intel_irq_remapping.c @@ -0,0 +1,1065 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "irq_remapping.h" + +struct ioapic_scope { + struct intel_iommu *iommu; + unsigned int id; + unsigned int bus; /* PCI bus number */ + unsigned int devfn; /* PCI devfn number */ +}; + +struct hpet_scope { + struct intel_iommu *iommu; + u8 id; + unsigned int bus; + unsigned int devfn; +}; + +#define IR_X2APIC_MODE(mode) (mode ? (1 << 11) : 0) +#define IRTE_DEST(dest) ((x2apic_mode) ? 
dest : dest << 8) + +static struct ioapic_scope ir_ioapic[MAX_IO_APICS]; +static struct hpet_scope ir_hpet[MAX_HPET_TBS]; +static int ir_ioapic_num, ir_hpet_num; + +static DEFINE_RAW_SPINLOCK(irq_2_ir_lock); + +static struct irq_2_iommu *irq_2_iommu(unsigned int irq) +{ + struct irq_cfg *cfg = irq_get_chip_data(irq); + return cfg ? &cfg->irq_2_iommu : NULL; +} + +int get_irte(int irq, struct irte *entry) +{ + struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); + unsigned long flags; + int index; + + if (!entry || !irq_iommu) + return -1; + + raw_spin_lock_irqsave(&irq_2_ir_lock, flags); + + index = irq_iommu->irte_index + irq_iommu->sub_handle; + *entry = *(irq_iommu->iommu->ir_table->base + index); + + raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); + return 0; +} + +static int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) +{ + struct ir_table *table = iommu->ir_table; + struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); + u16 index, start_index; + unsigned int mask = 0; + unsigned long flags; + int i; + + if (!count || !irq_iommu) + return -1; + + /* + * start the IRTE search from index 0. 
+ */ + index = start_index = 0; + + if (count > 1) { + count = __roundup_pow_of_two(count); + mask = ilog2(count); + } + + if (mask > ecap_max_handle_mask(iommu->ecap)) { + printk(KERN_ERR + "Requested mask %x exceeds the max invalidation handle" + " mask value %Lx\n", mask, + ecap_max_handle_mask(iommu->ecap)); + return -1; + } + + raw_spin_lock_irqsave(&irq_2_ir_lock, flags); + do { + for (i = index; i < index + count; i++) + if (table->base[i].present) + break; + /* empty index found */ + if (i == index + count) + break; + + index = (index + count) % INTR_REMAP_TABLE_ENTRIES; + + if (index == start_index) { + raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); + printk(KERN_ERR "can't allocate an IRTE\n"); + return -1; + } + } while (1); + + for (i = index; i < index + count; i++) + table->base[i].present = 1; + + irq_iommu->iommu = iommu; + irq_iommu->irte_index = index; + irq_iommu->sub_handle = 0; + irq_iommu->irte_mask = mask; + + raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); + + return index; +} + +static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask) +{ + struct qi_desc desc; + + desc.low = QI_IEC_IIDEX(index) | QI_IEC_TYPE | QI_IEC_IM(mask) + | QI_IEC_SELECTIVE; + desc.high = 0; + + return qi_submit_sync(&desc, iommu); +} + +static int map_irq_to_irte_handle(int irq, u16 *sub_handle) +{ + struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); + unsigned long flags; + int index; + + if (!irq_iommu) + return -1; + + raw_spin_lock_irqsave(&irq_2_ir_lock, flags); + *sub_handle = irq_iommu->sub_handle; + index = irq_iommu->irte_index; + raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); + return index; +} + +static int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle) +{ + struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); + unsigned long flags; + + if (!irq_iommu) + return -1; + + raw_spin_lock_irqsave(&irq_2_ir_lock, flags); + + irq_iommu->iommu = iommu; + irq_iommu->irte_index = index; + irq_iommu->sub_handle = 
subhandle; + irq_iommu->irte_mask = 0; + + raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); + + return 0; +} + +static int modify_irte(int irq, struct irte *irte_modified) +{ + struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); + struct intel_iommu *iommu; + unsigned long flags; + struct irte *irte; + int rc, index; + + if (!irq_iommu) + return -1; + + raw_spin_lock_irqsave(&irq_2_ir_lock, flags); + + iommu = irq_iommu->iommu; + + index = irq_iommu->irte_index + irq_iommu->sub_handle; + irte = &iommu->ir_table->base[index]; + + set_64bit(&irte->low, irte_modified->low); + set_64bit(&irte->high, irte_modified->high); + __iommu_flush_cache(iommu, irte, sizeof(*irte)); + + rc = qi_flush_iec(iommu, index, 0); + raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); + + return rc; +} + +static struct intel_iommu *map_hpet_to_ir(u8 hpet_id) +{ + int i; + + for (i = 0; i < MAX_HPET_TBS; i++) + if (ir_hpet[i].id == hpet_id) + return ir_hpet[i].iommu; + return NULL; +} + +static struct intel_iommu *map_ioapic_to_ir(int apic) +{ + int i; + + for (i = 0; i < MAX_IO_APICS; i++) + if (ir_ioapic[i].id == apic) + return ir_ioapic[i].iommu; + return NULL; +} + +static struct intel_iommu *map_dev_to_ir(struct pci_dev *dev) +{ + struct dmar_drhd_unit *drhd; + + drhd = dmar_find_matched_drhd_unit(dev); + if (!drhd) + return NULL; + + return drhd->iommu; +} + +static int clear_entries(struct irq_2_iommu *irq_iommu) +{ + struct irte *start, *entry, *end; + struct intel_iommu *iommu; + int index; + + if (irq_iommu->sub_handle) + return 0; + + iommu = irq_iommu->iommu; + index = irq_iommu->irte_index + irq_iommu->sub_handle; + + start = iommu->ir_table->base + index; + end = start + (1 << irq_iommu->irte_mask); + + for (entry = start; entry < end; entry++) { + set_64bit(&entry->low, 0); + set_64bit(&entry->high, 0); + } + + return qi_flush_iec(iommu, index, irq_iommu->irte_mask); +} + +static int free_irte(int irq) +{ + struct irq_2_iommu *irq_iommu = irq_2_iommu(irq); + unsigned long 
flags; + int rc; + + if (!irq_iommu) + return -1; + + raw_spin_lock_irqsave(&irq_2_ir_lock, flags); + + rc = clear_entries(irq_iommu); + + irq_iommu->iommu = NULL; + irq_iommu->irte_index = 0; + irq_iommu->sub_handle = 0; + irq_iommu->irte_mask = 0; + + raw_spin_unlock_irqrestore(&irq_2_ir_lock, flags); + + return rc; +} + +/* + * source validation type + */ +#define SVT_NO_VERIFY 0x0 /* no verification is required */ +#define SVT_VERIFY_SID_SQ 0x1 /* verify using SID and SQ fields */ +#define SVT_VERIFY_BUS 0x2 /* verify bus of request-id */ + +/* + * source-id qualifier + */ +#define SQ_ALL_16 0x0 /* verify all 16 bits of request-id */ +#define SQ_13_IGNORE_1 0x1 /* verify most significant 13 bits, ignore + * the third least significant bit + */ +#define SQ_13_IGNORE_2 0x2 /* verify most significant 13 bits, ignore + * the second and third least significant bits + */ +#define SQ_13_IGNORE_3 0x3 /* verify most significant 13 bits, ignore + * the least three significant bits + */ + +/* + * set SVT, SQ and SID fields of irte to verify + * source ids of interrupt requests + */ +static void set_irte_sid(struct irte *irte, unsigned int svt, + unsigned int sq, unsigned int sid) +{ + if (disable_sourceid_checking) + svt = SVT_NO_VERIFY; + irte->svt = svt; + irte->sq = sq; + irte->sid = sid; +} + +static int set_ioapic_sid(struct irte *irte, int apic) +{ + int i; + u16 sid = 0; + + if (!irte) + return -1; + + for (i = 0; i < MAX_IO_APICS; i++) { + if (ir_ioapic[i].id == apic) { + sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn; + break; + } + } + + if (sid == 0) { + pr_warning("Failed to set source-id of IOAPIC (%d)\n", apic); + return -1; + } + + set_irte_sid(irte, 1, 0, sid); + + return 0; +} + +static int set_hpet_sid(struct irte *irte, u8 id) +{ + int i; + u16 sid = 0; + + if (!irte) + return -1; + + for (i = 0; i < MAX_HPET_TBS; i++) { + if (ir_hpet[i].id == id) { + sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn; + break; + } + } + + if (sid == 0) { + 
pr_warning("Failed to set source-id of HPET block (%d)\n", id); + return -1; + } + + /* + * Should really use SQ_ALL_16. Some platforms are broken. + * While we figure out the right quirks for these broken platforms, use + * SQ_13_IGNORE_3 for now. + */ + set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_13_IGNORE_3, sid); + + return 0; +} + +static int set_msi_sid(struct irte *irte, struct pci_dev *dev) +{ + struct pci_dev *bridge; + + if (!irte || !dev) + return -1; + + /* PCIe device or Root Complex integrated PCI device */ + if (pci_is_pcie(dev) || !dev->bus->parent) { + set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16, + (dev->bus->number << 8) | dev->devfn); + return 0; + } + + bridge = pci_find_upstream_pcie_bridge(dev); + if (bridge) { + if (pci_is_pcie(bridge))/* this is a PCIe-to-PCI/PCIX bridge */ + set_irte_sid(irte, SVT_VERIFY_BUS, SQ_ALL_16, + (bridge->bus->number << 8) | dev->bus->number); + else /* this is a legacy PCI bridge */ + set_irte_sid(irte, SVT_VERIFY_SID_SQ, SQ_ALL_16, + (bridge->bus->number << 8) | bridge->devfn); + } + + return 0; +} + +static void iommu_set_irq_remapping(struct intel_iommu *iommu, int mode) +{ + u64 addr; + u32 sts; + unsigned long flags; + + addr = virt_to_phys((void *)iommu->ir_table->base); + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + + dmar_writeq(iommu->reg + DMAR_IRTA_REG, + (addr) | IR_X2APIC_MODE(mode) | INTR_REMAP_TABLE_REG_SIZE); + + /* Set interrupt-remapping table pointer */ + iommu->gcmd |= DMA_GCMD_SIRTP; + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, (sts & DMA_GSTS_IRTPS), sts); + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); + + /* + * global invalidation of interrupt entry cache before enabling + * interrupt-remapping. 
+ */ + qi_global_iec(iommu); + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + + /* Enable interrupt-remapping */ + iommu->gcmd |= DMA_GCMD_IRE; + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, (sts & DMA_GSTS_IRES), sts); + + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); +} + + +static int intel_setup_irq_remapping(struct intel_iommu *iommu, int mode) +{ + struct ir_table *ir_table; + struct page *pages; + + ir_table = iommu->ir_table = kzalloc(sizeof(struct ir_table), + GFP_ATOMIC); + + if (!iommu->ir_table) + return -ENOMEM; + + pages = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO, + INTR_REMAP_PAGE_ORDER); + + if (!pages) { + printk(KERN_ERR "failed to allocate pages of order %d\n", + INTR_REMAP_PAGE_ORDER); + kfree(iommu->ir_table); + return -ENOMEM; + } + + ir_table->base = page_address(pages); + + iommu_set_irq_remapping(iommu, mode); + return 0; +} + +/* + * Disable Interrupt Remapping. + */ +static void iommu_disable_irq_remapping(struct intel_iommu *iommu) +{ + unsigned long flags; + u32 sts; + + if (!ecap_ir_support(iommu->ecap)) + return; + + /* + * global invalidation of interrupt entry cache before disabling + * interrupt-remapping. 
+ */ + qi_global_iec(iommu); + + raw_spin_lock_irqsave(&iommu->register_lock, flags); + + sts = dmar_readq(iommu->reg + DMAR_GSTS_REG); + if (!(sts & DMA_GSTS_IRES)) + goto end; + + iommu->gcmd &= ~DMA_GCMD_IRE; + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); + + IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, + readl, !(sts & DMA_GSTS_IRES), sts); + +end: + raw_spin_unlock_irqrestore(&iommu->register_lock, flags); +} + +static int __init dmar_x2apic_optout(void) +{ + struct acpi_table_dmar *dmar; + dmar = (struct acpi_table_dmar *)dmar_tbl; + if (!dmar || no_x2apic_optout) + return 0; + return dmar->flags & DMAR_X2APIC_OPT_OUT; +} + +static int __init intel_irq_remapping_supported(void) +{ + struct dmar_drhd_unit *drhd; + + if (disable_irq_remap) + return 0; + + if (!dmar_ir_support()) + return 0; + + for_each_drhd_unit(drhd) { + struct intel_iommu *iommu = drhd->iommu; + + if (!ecap_ir_support(iommu->ecap)) + return 0; + } + + return 1; +} + +static int __init intel_enable_irq_remapping(void) +{ + struct dmar_drhd_unit *drhd; + int setup = 0; + int eim = 0; + + if (parse_ioapics_under_ir() != 1) { + printk(KERN_INFO "Not enable interrupt remapping\n"); + return -1; + } + + if (x2apic_supported()) { + eim = !dmar_x2apic_optout(); + WARN(!eim, KERN_WARNING + "Your BIOS is broken and requested that x2apic be disabled\n" + "This will leave your machine vulnerable to irq-injection attacks\n" + "Use 'intremap=no_x2apic_optout' to override BIOS request\n"); + } + + for_each_drhd_unit(drhd) { + struct intel_iommu *iommu = drhd->iommu; + + /* + * If the queued invalidation is already initialized, + * shouldn't disable it. + */ + if (iommu->qi) + continue; + + /* + * Clear previous faults. + */ + dmar_fault(-1, iommu); + + /* + * Disable intr remapping and queued invalidation, if already + * enabled prior to OS handover. 
+ */ + iommu_disable_irq_remapping(iommu); + + dmar_disable_qi(iommu); + } + + /* + * check for the Interrupt-remapping support + */ + for_each_drhd_unit(drhd) { + struct intel_iommu *iommu = drhd->iommu; + + if (!ecap_ir_support(iommu->ecap)) + continue; + + if (eim && !ecap_eim_support(iommu->ecap)) { + printk(KERN_INFO "DRHD %Lx: EIM not supported by DRHD, " + " ecap %Lx\n", drhd->reg_base_addr, iommu->ecap); + return -1; + } + } + + /* + * Enable queued invalidation for all the DRHD's. + */ + for_each_drhd_unit(drhd) { + int ret; + struct intel_iommu *iommu = drhd->iommu; + ret = dmar_enable_qi(iommu); + + if (ret) { + printk(KERN_ERR "DRHD %Lx: failed to enable queued, " + " invalidation, ecap %Lx, ret %d\n", + drhd->reg_base_addr, iommu->ecap, ret); + return -1; + } + } + + /* + * Setup Interrupt-remapping for all the DRHD's now. + */ + for_each_drhd_unit(drhd) { + struct intel_iommu *iommu = drhd->iommu; + + if (!ecap_ir_support(iommu->ecap)) + continue; + + if (intel_setup_irq_remapping(iommu, eim)) + goto error; + + setup = 1; + } + + if (!setup) + goto error; + + irq_remapping_enabled = 1; + pr_info("Enabled IRQ remapping in %s mode\n", eim ? "x2apic" : "xapic"); + + return eim ? IRQ_REMAP_X2APIC_MODE : IRQ_REMAP_XAPIC_MODE; + +error: + /* + * handle error condition gracefully here! + */ + return -1; +} + +static void ir_parse_one_hpet_scope(struct acpi_dmar_device_scope *scope, + struct intel_iommu *iommu) +{ + struct acpi_dmar_pci_path *path; + u8 bus; + int count; + + bus = scope->bus; + path = (struct acpi_dmar_pci_path *)(scope + 1); + count = (scope->length - sizeof(struct acpi_dmar_device_scope)) + / sizeof(struct acpi_dmar_pci_path); + + while (--count > 0) { + /* + * Access PCI directly due to the PCI + * subsystem isn't initialized yet. 
+ */ + bus = read_pci_config_byte(bus, path->dev, path->fn, + PCI_SECONDARY_BUS); + path++; + } + ir_hpet[ir_hpet_num].bus = bus; + ir_hpet[ir_hpet_num].devfn = PCI_DEVFN(path->dev, path->fn); + ir_hpet[ir_hpet_num].iommu = iommu; + ir_hpet[ir_hpet_num].id = scope->enumeration_id; + ir_hpet_num++; +} + +static void ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope *scope, + struct intel_iommu *iommu) +{ + struct acpi_dmar_pci_path *path; + u8 bus; + int count; + + bus = scope->bus; + path = (struct acpi_dmar_pci_path *)(scope + 1); + count = (scope->length - sizeof(struct acpi_dmar_device_scope)) + / sizeof(struct acpi_dmar_pci_path); + + while (--count > 0) { + /* + * Access PCI directly due to the PCI + * subsystem isn't initialized yet. + */ + bus = read_pci_config_byte(bus, path->dev, path->fn, + PCI_SECONDARY_BUS); + path++; + } + + ir_ioapic[ir_ioapic_num].bus = bus; + ir_ioapic[ir_ioapic_num].devfn = PCI_DEVFN(path->dev, path->fn); + ir_ioapic[ir_ioapic_num].iommu = iommu; + ir_ioapic[ir_ioapic_num].id = scope->enumeration_id; + ir_ioapic_num++; +} + +static int ir_parse_ioapic_hpet_scope(struct acpi_dmar_header *header, + struct intel_iommu *iommu) +{ + struct acpi_dmar_hardware_unit *drhd; + struct acpi_dmar_device_scope *scope; + void *start, *end; + + drhd = (struct acpi_dmar_hardware_unit *)header; + + start = (void *)(drhd + 1); + end = ((void *)drhd) + header->length; + + while (start < end) { + scope = start; + if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_IOAPIC) { + if (ir_ioapic_num == MAX_IO_APICS) { + printk(KERN_WARNING "Exceeded Max IO APICS\n"); + return -1; + } + + printk(KERN_INFO "IOAPIC id %d under DRHD base " + " 0x%Lx IOMMU %d\n", scope->enumeration_id, + drhd->address, iommu->seq_id); + + ir_parse_one_ioapic_scope(scope, iommu); + } else if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_HPET) { + if (ir_hpet_num == MAX_HPET_TBS) { + printk(KERN_WARNING "Exceeded Max HPET blocks\n"); + return -1; + } + + printk(KERN_INFO "HPET id %d 
under DRHD base" + " 0x%Lx\n", scope->enumeration_id, + drhd->address); + + ir_parse_one_hpet_scope(scope, iommu); + } + start += scope->length; + } + + return 0; +} + +/* + * Finds the assocaition between IOAPIC's and its Interrupt-remapping + * hardware unit. + */ +int __init parse_ioapics_under_ir(void) +{ + struct dmar_drhd_unit *drhd; + int ir_supported = 0; + + for_each_drhd_unit(drhd) { + struct intel_iommu *iommu = drhd->iommu; + + if (ecap_ir_support(iommu->ecap)) { + if (ir_parse_ioapic_hpet_scope(drhd->hdr, iommu)) + return -1; + + ir_supported = 1; + } + } + + if (ir_supported && ir_ioapic_num != nr_ioapics) { + printk(KERN_WARNING + "Not all IO-APIC's listed under remapping hardware\n"); + return -1; + } + + return ir_supported; +} + +int __init ir_dev_scope_init(void) +{ + if (!irq_remapping_enabled) + return 0; + + return dmar_dev_scope_init(); +} +rootfs_initcall(ir_dev_scope_init); + +static void disable_irq_remapping(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu = NULL; + + /* + * Disable Interrupt-remapping for all the DRHD's now. + */ + for_each_iommu(iommu, drhd) { + if (!ecap_ir_support(iommu->ecap)) + continue; + + iommu_disable_irq_remapping(iommu); + } +} + +static int reenable_irq_remapping(int eim) +{ + struct dmar_drhd_unit *drhd; + int setup = 0; + struct intel_iommu *iommu = NULL; + + for_each_iommu(iommu, drhd) + if (iommu->qi) + dmar_reenable_qi(iommu); + + /* + * Setup Interrupt-remapping for all the DRHD's now. + */ + for_each_iommu(iommu, drhd) { + if (!ecap_ir_support(iommu->ecap)) + continue; + + /* Set up interrupt remapping for iommu.*/ + iommu_set_irq_remapping(iommu, eim); + setup = 1; + } + + if (!setup) + goto error; + + return 0; + +error: + /* + * handle error condition gracefully here! 
+ */ + return -1; +} + +static void prepare_irte(struct irte *irte, int vector, + unsigned int dest) +{ + memset(irte, 0, sizeof(*irte)); + + irte->present = 1; + irte->dst_mode = apic->irq_dest_mode; + /* + * Trigger mode in the IRTE will always be edge, and for IO-APIC, the + * actual level or edge trigger will be setup in the IO-APIC + * RTE. This will help simplify level triggered irq migration. + * For more details, see the comments (in io_apic.c) explainig IO-APIC + * irq migration in the presence of interrupt-remapping. + */ + irte->trigger_mode = 0; + irte->dlvry_mode = apic->irq_delivery_mode; + irte->vector = vector; + irte->dest_id = IRTE_DEST(dest); + irte->redir_hint = 1; +} + +static int intel_setup_ioapic_entry(int irq, + struct IO_APIC_route_entry *route_entry, + unsigned int destination, int vector, + struct io_apic_irq_attr *attr) +{ + int ioapic_id = mpc_ioapic_id(attr->ioapic); + struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id); + struct IR_IO_APIC_route_entry *entry; + struct irte irte; + int index; + + if (!iommu) { + pr_warn("No mapping iommu for ioapic %d\n", ioapic_id); + return -ENODEV; + } + + entry = (struct IR_IO_APIC_route_entry *)route_entry; + + index = alloc_irte(iommu, irq, 1); + if (index < 0) { + pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id); + return -ENOMEM; + } + + prepare_irte(&irte, vector, destination); + + /* Set source-id of interrupt request */ + set_ioapic_sid(&irte, ioapic_id); + + modify_irte(irq, &irte); + + apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: " + "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d " + "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X " + "Avail:%X Vector:%02X Dest:%08X " + "SID:%04X SQ:%X SVT:%X)\n", + attr->ioapic, irte.present, irte.fpd, irte.dst_mode, + irte.redir_hint, irte.trigger_mode, irte.dlvry_mode, + irte.avail, irte.vector, irte.dest_id, + irte.sid, irte.sq, irte.svt); + + memset(entry, 0, sizeof(*entry)); + + entry->index2 = (index >> 15) & 0x1; + entry->zero = 0; + 
entry->format = 1; + entry->index = (index & 0x7fff); + /* + * IO-APIC RTE will be configured with virtual vector. + * irq handler will do the explicit EOI to the io-apic. + */ + entry->vector = attr->ioapic_pin; + entry->mask = 0; /* enable IRQ */ + entry->trigger = attr->trigger; + entry->polarity = attr->polarity; + + /* Mask level triggered irqs. + * Use IRQ_DELAYED_DISABLE for edge triggered irqs. + */ + if (attr->trigger) + entry->mask = 1; + + return 0; +} + +/* + * Migrate the IO-APIC irq in the presence of intr-remapping. + * + * For both level and edge triggered, irq migration is a simple atomic + * update(of vector and cpu destination) of IRTE and flush the hardware cache. + * + * For level triggered, we eliminate the io-apic RTE modification (with the + * updated vector information), by using a virtual vector (io-apic pin number). + * Real vector that is used for interrupting cpu will be coming from + * the interrupt-remapping table entry. + * + * As the migration is a simple atomic update of IRTE, the same mechanism + * is used to migrate MSI irq's in the presence of interrupt-remapping. + */ +static int +intel_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + struct irq_cfg *cfg = data->chip_data; + unsigned int dest, irq = data->irq; + struct irte irte; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return -EINVAL; + + if (get_irte(irq, &irte)) + return -EBUSY; + + if (assign_irq_vector(irq, cfg, mask)) + return -EBUSY; + + dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); + + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + /* + * Atomically updates the IRTE with the new destination, vector + * and flushes the interrupt entry cache. + */ + modify_irte(irq, &irte); + + /* + * After this point, all the interrupts will start arriving + * at the new destination. So, time to cleanup the previous + * vector allocation. 
+ */ + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + + cpumask_copy(data->affinity, mask); + return 0; +} + +static void intel_compose_msi_msg(struct pci_dev *pdev, + unsigned int irq, unsigned int dest, + struct msi_msg *msg, u8 hpet_id) +{ + struct irq_cfg *cfg; + struct irte irte; + u16 sub_handle; + int ir_index; + + cfg = irq_get_chip_data(irq); + + ir_index = map_irq_to_irte_handle(irq, &sub_handle); + BUG_ON(ir_index == -1); + + prepare_irte(&irte, cfg->vector, dest); + + /* Set source-id of interrupt request */ + if (pdev) + set_msi_sid(&irte, pdev); + else + set_hpet_sid(&irte, hpet_id); + + modify_irte(irq, &irte); + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->data = sub_handle; + msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | + MSI_ADDR_IR_SHV | + MSI_ADDR_IR_INDEX1(ir_index) | + MSI_ADDR_IR_INDEX2(ir_index); +} + +/* + * Map the PCI dev to the corresponding remapping hardware unit + * and allocate 'nvec' consecutive interrupt-remapping table entries + * in it. + */ +static int intel_msi_alloc_irq(struct pci_dev *dev, int irq, int nvec) +{ + struct intel_iommu *iommu; + int index; + + iommu = map_dev_to_ir(dev); + if (!iommu) { + printk(KERN_ERR + "Unable to map PCI %s to iommu\n", pci_name(dev)); + return -ENOENT; + } + + index = alloc_irte(iommu, irq, nvec); + if (index < 0) { + printk(KERN_ERR + "Unable to allocate %d IRTE for PCI %s\n", nvec, + pci_name(dev)); + return -ENOSPC; + } + return index; +} + +static int intel_msi_setup_irq(struct pci_dev *pdev, unsigned int irq, + int index, int sub_handle) +{ + struct intel_iommu *iommu; + + iommu = map_dev_to_ir(pdev); + if (!iommu) + return -ENOENT; + /* + * setup the mapping between the irq and the IRTE + * base index, the sub_handle pointing to the + * appropriate interrupt remap table entry. 
+ */ + set_irte_irq(irq, iommu, index, sub_handle); + + return 0; +} + +static int intel_setup_hpet_msi(unsigned int irq, unsigned int id) +{ + struct intel_iommu *iommu = map_hpet_to_ir(id); + int index; + + if (!iommu) + return -1; + + index = alloc_irte(iommu, irq, 1); + if (index < 0) + return -1; + + return 0; +} + +struct irq_remap_ops intel_irq_remap_ops = { + .supported = intel_irq_remapping_supported, + .prepare = dmar_table_init, + .enable = intel_enable_irq_remapping, + .disable = disable_irq_remapping, + .reenable = reenable_irq_remapping, + .enable_faulting = enable_drhd_fault_handling, + .setup_ioapic_entry = intel_setup_ioapic_entry, + .set_affinity = intel_ioapic_set_affinity, + .free_irq = free_irte, + .compose_msi_msg = intel_compose_msi_msg, + .msi_alloc_irq = intel_msi_alloc_irq, + .msi_setup_irq = intel_msi_setup_irq, + .setup_hpet_msi = intel_setup_hpet_msi, +}; diff --git a/drivers/iommu/intr_remapping.c b/drivers/iommu/intr_remapping.c deleted file mode 100644 index 523a7b3a1205..000000000000 --- a/drivers/iommu/intr_remapping.c +++ /dev/null @@ -1,164 +0,0 @@ -#include -#include -#include - -#include "intr_remapping.h" - -int irq_remapping_enabled; - -int disable_irq_remap; -int disable_sourceid_checking; -int no_x2apic_optout; - -static struct irq_remap_ops *remap_ops; - -static __init int setup_nointremap(char *str) -{ - disable_irq_remap = 1; - return 0; -} -early_param("nointremap", setup_nointremap); - -static __init int setup_irqremap(char *str) -{ - if (!str) - return -EINVAL; - - while (*str) { - if (!strncmp(str, "on", 2)) - disable_irq_remap = 0; - else if (!strncmp(str, "off", 3)) - disable_irq_remap = 1; - else if (!strncmp(str, "nosid", 5)) - disable_sourceid_checking = 1; - else if (!strncmp(str, "no_x2apic_optout", 16)) - no_x2apic_optout = 1; - - str += strcspn(str, ","); - while (*str == ',') - str++; - } - - return 0; -} -early_param("intremap", setup_irqremap); - -void __init setup_irq_remapping_ops(void) -{ - remap_ops = 
&intel_irq_remap_ops; -} - -int irq_remapping_supported(void) -{ - if (disable_irq_remap) - return 0; - - if (!remap_ops || !remap_ops->supported) - return 0; - - return remap_ops->supported(); -} - -int __init irq_remapping_prepare(void) -{ - if (!remap_ops || !remap_ops->prepare) - return -ENODEV; - - return remap_ops->prepare(); -} - -int __init irq_remapping_enable(void) -{ - if (!remap_ops || !remap_ops->enable) - return -ENODEV; - - return remap_ops->enable(); -} - -void irq_remapping_disable(void) -{ - if (!remap_ops || !remap_ops->disable) - return; - - remap_ops->disable(); -} - -int irq_remapping_reenable(int mode) -{ - if (!remap_ops || !remap_ops->reenable) - return 0; - - return remap_ops->reenable(mode); -} - -int __init irq_remap_enable_fault_handling(void) -{ - if (!remap_ops || !remap_ops->enable_faulting) - return -ENODEV; - - return remap_ops->enable_faulting(); -} - -int setup_ioapic_remapped_entry(int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, int vector, - struct io_apic_irq_attr *attr) -{ - if (!remap_ops || !remap_ops->setup_ioapic_entry) - return -ENODEV; - - return remap_ops->setup_ioapic_entry(irq, entry, destination, - vector, attr); -} - -int set_remapped_irq_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - if (!remap_ops || !remap_ops->set_affinity) - return 0; - - return remap_ops->set_affinity(data, mask, force); -} - -void free_remapped_irq(int irq) -{ - if (!remap_ops || !remap_ops->free_irq) - return; - - remap_ops->free_irq(irq); -} - -void compose_remapped_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id) -{ - if (!remap_ops || !remap_ops->compose_msi_msg) - return; - - remap_ops->compose_msi_msg(pdev, irq, dest, msg, hpet_id); -} - -int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec) -{ - if (!remap_ops || !remap_ops->msi_alloc_irq) - return -ENODEV; - - return remap_ops->msi_alloc_irq(pdev, irq, nvec); 
-} - -int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq, - int index, int sub_handle) -{ - if (!remap_ops || !remap_ops->msi_setup_irq) - return -ENODEV; - - return remap_ops->msi_setup_irq(pdev, irq, index, sub_handle); -} - -int setup_hpet_msi_remapped(unsigned int irq, unsigned int id) -{ - if (!remap_ops || !remap_ops->setup_hpet_msi) - return -ENODEV; - - return remap_ops->setup_hpet_msi(irq, id); -} diff --git a/drivers/iommu/intr_remapping.h b/drivers/iommu/intr_remapping.h deleted file mode 100644 index bd5d98fec148..000000000000 --- a/drivers/iommu/intr_remapping.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (C) 2012 Advanced Micro Devices, Inc. - * Author: Joerg Roedel - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * This header file contains stuff that is shared between different interrupt - * remapping drivers but with no need to be visible outside of the IOMMU layer. 
- */ - -#ifndef __INTR_REMAPPING_H -#define __INTR_REMAPPING_H - -#ifdef CONFIG_IRQ_REMAP - -struct IO_APIC_route_entry; -struct io_apic_irq_attr; -struct irq_data; -struct cpumask; -struct pci_dev; -struct msi_msg; - -extern int disable_irq_remap; -extern int disable_sourceid_checking; -extern int no_x2apic_optout; - -struct irq_remap_ops { - /* Check whether Interrupt Remapping is supported */ - int (*supported)(void); - - /* Initializes hardware and makes it ready for remapping interrupts */ - int (*prepare)(void); - - /* Enables the remapping hardware */ - int (*enable)(void); - - /* Disables the remapping hardware */ - void (*disable)(void); - - /* Reenables the remapping hardware */ - int (*reenable)(int); - - /* Enable fault handling */ - int (*enable_faulting)(void); - - /* IO-APIC setup routine */ - int (*setup_ioapic_entry)(int irq, struct IO_APIC_route_entry *, - unsigned int, int, - struct io_apic_irq_attr *); - - /* Set the CPU affinity of a remapped interrupt */ - int (*set_affinity)(struct irq_data *data, const struct cpumask *mask, - bool force); - - /* Free an IRQ */ - int (*free_irq)(int); - - /* Create MSI msg to use for interrupt remapping */ - void (*compose_msi_msg)(struct pci_dev *, - unsigned int, unsigned int, - struct msi_msg *, u8); - - /* Allocate remapping resources for MSI */ - int (*msi_alloc_irq)(struct pci_dev *, int, int); - - /* Setup the remapped MSI irq */ - int (*msi_setup_irq)(struct pci_dev *, unsigned int, int, int); - - /* Setup interrupt remapping for an HPET MSI */ - int (*setup_hpet_msi)(unsigned int, unsigned int); -}; - -extern struct irq_remap_ops intel_irq_remap_ops; - -#endif /* CONFIG_IRQ_REMAP */ - -#endif /* __INTR_REMAPPING_H */ diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c new file mode 100644 index 000000000000..1cf350e02da8 --- /dev/null +++ b/drivers/iommu/irq_remapping.c @@ -0,0 +1,164 @@ +#include +#include +#include + +#include "irq_remapping.h" + +int irq_remapping_enabled; + 
+int disable_irq_remap; +int disable_sourceid_checking; +int no_x2apic_optout; + +static struct irq_remap_ops *remap_ops; + +static __init int setup_nointremap(char *str) +{ + disable_irq_remap = 1; + return 0; +} +early_param("nointremap", setup_nointremap); + +static __init int setup_irqremap(char *str) +{ + if (!str) + return -EINVAL; + + while (*str) { + if (!strncmp(str, "on", 2)) + disable_irq_remap = 0; + else if (!strncmp(str, "off", 3)) + disable_irq_remap = 1; + else if (!strncmp(str, "nosid", 5)) + disable_sourceid_checking = 1; + else if (!strncmp(str, "no_x2apic_optout", 16)) + no_x2apic_optout = 1; + + str += strcspn(str, ","); + while (*str == ',') + str++; + } + + return 0; +} +early_param("intremap", setup_irqremap); + +void __init setup_irq_remapping_ops(void) +{ + remap_ops = &intel_irq_remap_ops; +} + +int irq_remapping_supported(void) +{ + if (disable_irq_remap) + return 0; + + if (!remap_ops || !remap_ops->supported) + return 0; + + return remap_ops->supported(); +} + +int __init irq_remapping_prepare(void) +{ + if (!remap_ops || !remap_ops->prepare) + return -ENODEV; + + return remap_ops->prepare(); +} + +int __init irq_remapping_enable(void) +{ + if (!remap_ops || !remap_ops->enable) + return -ENODEV; + + return remap_ops->enable(); +} + +void irq_remapping_disable(void) +{ + if (!remap_ops || !remap_ops->disable) + return; + + remap_ops->disable(); +} + +int irq_remapping_reenable(int mode) +{ + if (!remap_ops || !remap_ops->reenable) + return 0; + + return remap_ops->reenable(mode); +} + +int __init irq_remap_enable_fault_handling(void) +{ + if (!remap_ops || !remap_ops->enable_faulting) + return -ENODEV; + + return remap_ops->enable_faulting(); +} + +int setup_ioapic_remapped_entry(int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, int vector, + struct io_apic_irq_attr *attr) +{ + if (!remap_ops || !remap_ops->setup_ioapic_entry) + return -ENODEV; + + return remap_ops->setup_ioapic_entry(irq, entry, destination, + 
vector, attr); +} + +int set_remapped_irq_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + if (!remap_ops || !remap_ops->set_affinity) + return 0; + + return remap_ops->set_affinity(data, mask, force); +} + +void free_remapped_irq(int irq) +{ + if (!remap_ops || !remap_ops->free_irq) + return; + + remap_ops->free_irq(irq); +} + +void compose_remapped_msi_msg(struct pci_dev *pdev, + unsigned int irq, unsigned int dest, + struct msi_msg *msg, u8 hpet_id) +{ + if (!remap_ops || !remap_ops->compose_msi_msg) + return; + + remap_ops->compose_msi_msg(pdev, irq, dest, msg, hpet_id); +} + +int msi_alloc_remapped_irq(struct pci_dev *pdev, int irq, int nvec) +{ + if (!remap_ops || !remap_ops->msi_alloc_irq) + return -ENODEV; + + return remap_ops->msi_alloc_irq(pdev, irq, nvec); +} + +int msi_setup_remapped_irq(struct pci_dev *pdev, unsigned int irq, + int index, int sub_handle) +{ + if (!remap_ops || !remap_ops->msi_setup_irq) + return -ENODEV; + + return remap_ops->msi_setup_irq(pdev, irq, index, sub_handle); +} + +int setup_hpet_msi_remapped(unsigned int irq, unsigned int id) +{ + if (!remap_ops || !remap_ops->setup_hpet_msi) + return -ENODEV; + + return remap_ops->setup_hpet_msi(irq, id); +} diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h new file mode 100644 index 000000000000..b12974cc1dfe --- /dev/null +++ b/drivers/iommu/irq_remapping.h @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2012 Advanced Micro Devices, Inc. + * Author: Joerg Roedel + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * This header file contains stuff that is shared between different interrupt + * remapping drivers but with no need to be visible outside of the IOMMU layer. + */ + +#ifndef __IRQ_REMAPPING_H +#define __IRQ_REMAPPING_H + +#ifdef CONFIG_IRQ_REMAP + +struct IO_APIC_route_entry; +struct io_apic_irq_attr; +struct irq_data; +struct cpumask; +struct pci_dev; +struct msi_msg; + +extern int disable_irq_remap; +extern int disable_sourceid_checking; +extern int no_x2apic_optout; + +struct irq_remap_ops { + /* Check whether Interrupt Remapping is supported */ + int (*supported)(void); + + /* Initializes hardware and makes it ready for remapping interrupts */ + int (*prepare)(void); + + /* Enables the remapping hardware */ + int (*enable)(void); + + /* Disables the remapping hardware */ + void (*disable)(void); + + /* Reenables the remapping hardware */ + int (*reenable)(int); + + /* Enable fault handling */ + int (*enable_faulting)(void); + + /* IO-APIC setup routine */ + int (*setup_ioapic_entry)(int irq, struct IO_APIC_route_entry *, + unsigned int, int, + struct io_apic_irq_attr *); + + /* Set the CPU affinity of a remapped interrupt */ + int (*set_affinity)(struct irq_data *data, const struct cpumask *mask, + bool force); + + /* Free an IRQ */ + int (*free_irq)(int); + + /* Create MSI msg to use for interrupt remapping */ + void (*compose_msi_msg)(struct pci_dev *, + unsigned int, unsigned int, + struct msi_msg *, u8); + + /* Allocate remapping resources for MSI */ + int (*msi_alloc_irq)(struct pci_dev *, int, int); + + /* Setup the remapped MSI irq */ + int (*msi_setup_irq)(struct pci_dev *, unsigned int, int, int); + + /* Setup interrupt remapping for an HPET MSI */ + int (*setup_hpet_msi)(unsigned int, unsigned int); +}; + +extern struct irq_remap_ops 
intel_irq_remap_ops; + +#endif /* CONFIG_IRQ_REMAP */ + +#endif /* __IRQ_REMAPPING_H */ -- cgit v1.2.3 From d2aa37411b8e65d57d2c5ae36f0222274292020d Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Thu, 12 Apr 2012 18:18:24 +0100 Subject: x86/olpc/xo1/sci: Produce wakeup events for buttons and switches Produce wakeup events for the XO-1's power button, lid switch and ebook switch, taking care to only produce events when the states have changed. Signed-off-by: Daniel Drake Cc: dilinger@queued.net Cc: pgf@laptop.org Link: http://lkml.kernel.org/r/20120412171824.D14C49D401E@zog.reactivated.net Signed-off-by: Ingo Molnar --- arch/x86/platform/olpc/olpc-xo1-sci.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c index 1d4c783d7325..4b93ff46cec3 100644 --- a/arch/x86/platform/olpc/olpc-xo1-sci.c +++ b/arch/x86/platform/olpc/olpc-xo1-sci.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -83,8 +84,12 @@ static void send_ebook_state(void) return; } + if (!!test_bit(SW_TABLET_MODE, ebook_switch_idev->sw) == state) + return; /* Nothing new to report. */ + input_report_switch(ebook_switch_idev, SW_TABLET_MODE, state); input_sync(ebook_switch_idev); + pm_wakeup_event(&ebook_switch_idev->dev, 0); } static void flip_lid_inverter(void) @@ -123,8 +128,12 @@ static void detect_lid_state(void) /* Report current lid switch state through input layer */ static void send_lid_state(void) { + if (!!test_bit(SW_LID, lid_switch_idev->sw) == !lid_open) + return; /* Nothing new to report. 
*/ + input_report_switch(lid_switch_idev, SW_LID, !lid_open); input_sync(lid_switch_idev); + pm_wakeup_event(&lid_switch_idev->dev, 0); } static ssize_t lid_wake_mode_show(struct device *dev, @@ -213,11 +222,18 @@ static irqreturn_t xo1_sci_intr(int irq, void *dev_id) dev_dbg(&pdev->dev, "sts %x gpe %x\n", sts, gpe); - if (sts & CS5536_PWRBTN_FLAG && !(sts & CS5536_WAK_FLAG)) { - input_report_key(power_button_idev, KEY_POWER, 1); - input_sync(power_button_idev); - input_report_key(power_button_idev, KEY_POWER, 0); - input_sync(power_button_idev); + if (sts & CS5536_PWRBTN_FLAG) { + if (!(sts & CS5536_WAK_FLAG)) { + /* Only report power button input when it was pressed + * during regular operation (as opposed to when it + * was used to wake the system). */ + input_report_key(power_button_idev, KEY_POWER, 1); + input_sync(power_button_idev); + input_report_key(power_button_idev, KEY_POWER, 0); + input_sync(power_button_idev); + } + /* Report the wakeup event in all cases. */ + pm_wakeup_event(&power_button_idev->dev, 0); } if (gpe & CS5536_GPIOM7_PME_FLAG) { /* EC GPIO */ -- cgit v1.2.3 From c2c21e9bb17549e8add4ff76931bcec2e2d3ad48 Mon Sep 17 00:00:00 2001 From: Daniel Drake Date: Wed, 18 Apr 2012 23:34:02 +0100 Subject: x86/olpc/xo1/sci: Report RTC wakeup events When the system is woken due to a RTC event, report the wakeup event on the relevant rtc device (if it can be found). 
Signed-off-by: Daniel Drake Cc: dilinger@queued.net Cc: pgf@laptop.org Link: http://lkml.kernel.org/r/20120418223402.D73249D401E@zog.reactivated.net Signed-off-by: Ingo Molnar --- arch/x86/platform/olpc/olpc-xo1-sci.c | 17 +++++++++++++++-- include/linux/cs5535.h | 1 + 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c index 4b93ff46cec3..04b8c73659c5 100644 --- a/arch/x86/platform/olpc/olpc-xo1-sci.c +++ b/arch/x86/platform/olpc/olpc-xo1-sci.c @@ -236,6 +236,18 @@ static irqreturn_t xo1_sci_intr(int irq, void *dev_id) pm_wakeup_event(&power_button_idev->dev, 0); } + if ((sts & (CS5536_RTC_FLAG | CS5536_WAK_FLAG)) == + (CS5536_RTC_FLAG | CS5536_WAK_FLAG)) { + /* When the system is woken by the RTC alarm, report the + * event on the rtc device. */ + struct device *rtc = bus_find_device_by_name( + &platform_bus_type, NULL, "rtc_cmos"); + if (rtc) { + pm_wakeup_event(rtc, 0); + put_device(rtc); + } + } + if (gpe & CS5536_GPIOM7_PME_FLAG) { /* EC GPIO */ cs5535_gpio_set(OLPC_GPIO_ECSCI, GPIO_NEGATIVE_EDGE_STS); schedule_work(&sci_work); @@ -326,9 +338,10 @@ static int __devinit setup_sci_interrupt(struct platform_device *pdev) outb(lo, CS5536_PIC_INT_SEL2); } - /* Enable SCI from power button, and clear pending interrupts */ + /* Enable interesting SCI events, and clear pending interrupts */ sts = inl(acpi_base + CS5536_PM1_STS); - outl((CS5536_PM_PWRBTN << 16) | 0xffff, acpi_base + CS5536_PM1_STS); + outl(((CS5536_PM_PWRBTN | CS5536_PM_RTC) << 16) | 0xffff, + acpi_base + CS5536_PM1_STS); r = request_irq(sci_irq, xo1_sci_intr, 0, DRV_NAME, pdev); if (r) diff --git a/include/linux/cs5535.h b/include/linux/cs5535.h index c077aec3a6ff..cfe83239d7f0 100644 --- a/include/linux/cs5535.h +++ b/include/linux/cs5535.h @@ -95,6 +95,7 @@ static inline int cs5535_pic_unreqz_select_high(unsigned int group, /* CS5536_PM1_STS bits */ #define CS5536_WAK_FLAG (1 << 15) 
+#define CS5536_RTC_FLAG (1 << 10) #define CS5536_PWRBTN_FLAG (1 << 8) /* CS5536_PM1_EN bits */ -- cgit v1.2.3 From 396e2c6fed4ff13b53ce0e573105531cf53b0cad Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 2 Apr 2012 15:15:55 +0100 Subject: x86: Clear HPET configuration registers on startup While Linux itself has been calling hpet_disable() for quite a while, having e.g. a secondary (kexec) kernel depend on such behavior of the primary (crashed) environment is fragile. It particularly broke until very recently when the primary environment was Xen based, as that hypervisor did not clear any of the HPET settings it may have used. Rather than blindly (and incompletely) clearing certain HPET settings in hpet_disable(), latch the config register settings during boot and restore them here. (Note on the hpet_set_mode() change: Now that we're clearing the level bit upon initialization, there's no need anymore to do so here.) Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F79D0BB020000780007C02D@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 59 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ad0de0c2714e..70bce5db1bb9 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -319,8 +319,6 @@ static void hpet_set_mode(enum clock_event_mode mode, now = hpet_readl(HPET_COUNTER); cmp = now + (unsigned int) delta; cfg = hpet_readl(HPET_Tn_CFG(timer)); - /* Make sure we use edge triggered interrupts */ - cfg &= ~HPET_TN_LEVEL; cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT; hpet_writel(cfg, HPET_Tn_CFG(timer)); @@ -787,15 +785,16 @@ static int hpet_clocksource_register(void) return 0; } +static u32 *hpet_boot_cfg; + /** * hpet_enable - Try to setup the HPET timer. Returns 1 on success. 
*/ int __init hpet_enable(void) { - unsigned long hpet_period; - unsigned int id; + u32 hpet_period, cfg, id; u64 freq; - int i; + unsigned int i, last; if (!is_hpet_capable()) return 0; @@ -847,15 +846,45 @@ int __init hpet_enable(void) id = hpet_readl(HPET_ID); hpet_print_config(); + last = (id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; + #ifdef CONFIG_HPET_EMULATE_RTC /* * The legacy routing mode needs at least two channels, tick timer * and the rtc emulation channel. */ - if (!(id & HPET_ID_NUMBER)) + if (!last) goto out_nohpet; #endif + cfg = hpet_readl(HPET_CFG); + hpet_boot_cfg = kmalloc((last + 2) * sizeof(*hpet_boot_cfg), + GFP_KERNEL); + if (hpet_boot_cfg) + *hpet_boot_cfg = cfg; + else + pr_warn("HPET initial state will not be saved\n"); + cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); + hpet_writel(cfg, HPET_CFG); + if (cfg) + pr_warn("HPET: Unrecognized bits %#x set in global cfg\n", + cfg); + + for (i = 0; i <= last; ++i) { + cfg = hpet_readl(HPET_Tn_CFG(i)); + if (hpet_boot_cfg) + hpet_boot_cfg[i + 1] = cfg; + cfg &= ~(HPET_TN_ENABLE | HPET_TN_LEVEL | HPET_TN_FSB); + hpet_writel(cfg, HPET_Tn_CFG(i)); + cfg &= ~(HPET_TN_PERIODIC | HPET_TN_PERIODIC_CAP + | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE + | HPET_TN_FSB | HPET_TN_FSB_CAP); + if (cfg) + pr_warn("HPET: Unrecognized bits %#x set in cfg#%u\n", + cfg, i); + } + hpet_print_config(); + if (hpet_clocksource_register()) goto out_nohpet; @@ -923,14 +952,28 @@ fs_initcall(hpet_late_init); void hpet_disable(void) { if (is_hpet_capable() && hpet_virt_address) { - unsigned int cfg = hpet_readl(HPET_CFG); + unsigned int cfg = hpet_readl(HPET_CFG), id, last; - if (hpet_legacy_int_enabled) { + if (hpet_boot_cfg) + cfg = *hpet_boot_cfg; + else if (hpet_legacy_int_enabled) { cfg &= ~HPET_CFG_LEGACY; hpet_legacy_int_enabled = 0; } cfg &= ~HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); + + if (!hpet_boot_cfg) + return; + + id = hpet_readl(HPET_ID); + last = ((id & HPET_ID_NUMBER) >> 
HPET_ID_NUMBER_SHIFT); + + for (id = 0; id <= last; ++id) + hpet_writel(hpet_boot_cfg[id + 1], HPET_Tn_CFG(id)); + + if (*hpet_boot_cfg & HPET_CFG_ENABLE) + hpet_writel(*hpet_boot_cfg, HPET_CFG); } } -- cgit v1.2.3 From b2d6aba9657c7e3d027dd43ac7d7c405e0079d46 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 2 Apr 2012 15:17:36 +0100 Subject: x86: Allow multiple values to be specified with "hpet=" This is particularly to be able to specify "hpet=force,verbose", as "force" ought to be a primary candidate for also wanting to use "verbose". Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F79D120020000780007C031@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 70bce5db1bb9..9cc7b4392f7c 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -94,13 +94,18 @@ static int hpet_verbose; static int __init hpet_setup(char *str) { - if (str) { + while (str) { + char *next = strchr(str, ','); + + if (next) + *next++ = 0; if (!strncmp("disable", str, 7)) boot_hpet_disable = 1; if (!strncmp("force", str, 5)) hpet_force_user = 1; if (!strncmp("verbose", str, 7)) hpet_verbose = 1; + str = next; } return 1; } -- cgit v1.2.3 From ddc5681ed33a279fdc188e98e71f0c539f08c6e6 Mon Sep 17 00:00:00 2001 From: Shai Fultheim Date: Fri, 20 Apr 2012 01:09:11 +0300 Subject: x86/cache_info: Fix setup of l2/l3 ids On some architectures (such as vSMP), it is possible to have CPUs with a different number of cores sharing the same cache. The current implementation implicitly assumes that all CPUs will have the same number of cores sharing caches, and as a result, different CPUs can end up with the same l2/l3 ids. Fix this by masking out the shared cache bits, instead of shifting the APICID. By doing so, it is guaranteed that the generated cache ids are always unique. 
Signed-off-by: Shai Fultheim [ rebased, simplified, and reworded the commit message] Signed-off-by: Ido Yariv Cc: Borislav Petkov Cc: Andreas Herrmann Cc: Mike Travis Cc: Dave Jones Link: http://lkml.kernel.org/r/1334873351-31142-1-git-send-email-ido@wizery.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index b8f3653dddbc..9a7c90d80bc4 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -615,14 +615,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) new_l2 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); - l2_id = c->apicid >> index_msb; + l2_id = c->apicid & ~((1 << index_msb) - 1); break; case 3: new_l3 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order( num_threads_sharing); - l3_id = c->apicid >> index_msb; + l3_id = c->apicid & ~((1 << index_msb) - 1); break; default: break; -- cgit v1.2.3 From 42fa4250436304d4650fa271f37671f6cee24e08 Mon Sep 17 00:00:00 2001 From: Shai Fultheim Date: Fri, 20 Apr 2012 01:12:32 +0300 Subject: x86: Conditionally update time when ack-ing pending irqs On virtual environments, apic_read could take a long time. As a result, under certain conditions the ack pending loop may exit without any queued irqs left, but after more than one second. A warning will be printed needlessly in this case. If the loop is about to exit regardless of max_loops, don't update it. 
Signed-off-by: Shai Fultheim [ rebased and reworded the commit message] Signed-off-by: Ido Yariv Acked-by: Thomas Gleixner Link: http://lkml.kernel.org/r/1334873552-31346-1-git-send-email-ido@wizery.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index edc24480469f..3beab627190e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1325,11 +1325,13 @@ void __cpuinit setup_local_APIC(void) acked); break; } - if (cpu_has_tsc) { - rdtscll(ntsc); - max_loops = (cpu_khz << 10) - (ntsc - tsc); - } else - max_loops--; + if (queued) { + if (cpu_has_tsc) { + rdtscll(ntsc); + max_loops = (cpu_khz << 10) - (ntsc - tsc); + } else + max_loops--; + } } while (queued && max_loops > 0); WARN_ON(max_loops <= 0); -- cgit v1.2.3 From 19209bbb8612004bc20a1f70ff12926f99fe2643 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Mon, 30 Apr 2012 12:26:56 +0530 Subject: x86/sched: Make mwait_usable() heed to "idle=" kernel parameters properly The checks that exist in mwait_usable() for "idle=" kernel parameters are insufficient. As a result, mwait_usable() can return 1 even if "idle=nomwait" or "idle=poll" or "idle=halt" parameters are passed. Of these cases, incorrect handling of idle=nomwait is a universal problem since mwait can get used for usual CPU idling. However the rest of the cases are problematic only during CPU Hotplug (offline) because, in the CPU offline path, the function mwait_play_dead() is called, which might result in mwait being used in the offline CPUs, if mwait_usable() happens to return 1. Fix these issues by checking for the boot time "idle=" kernel parameter properly in mwait_usable(). The first issue (usual cpu idling) is demonstrated below: Before applying the patch (dmesg snippet): [ 0.000000] Command line: [...] 
idle=nomwait [ 0.000000] Kernel command line: [...] idle=nomwait [ 0.000000] RCU dyntick-idle grace-period acceleration is enabled. [ 0.140606] using mwait in idle threads. <======= mwait being used [ 4.303986] cpuidle: using governor ladder [ 4.308232] cpuidle: using governor menu After applying the patch: [ 0.000000] Command line: [...] idle=nomwait [ 0.000000] Kernel command line: [...] idle=nomwait [ 0.000000] RCU dyntick-idle grace-period acceleration is enabled. [ 4.264100] cpuidle: using governor ladder [ 4.268342] cpuidle: using governor menu Signed-off-by: Srivatsa S. Bhat Acked-by: Deepthi Dharwar Acked-by: Thomas Gleixner Cc: venki@google.com Cc: suresh.b.siddha@intel.com Cc: Borislav Petkov Cc: lenb@kernel.org Cc: Rafael J. Wysocki Link: http://lkml.kernel.org/r/4F9E37B8.30001@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 1d92a5ab6e8b..ad57d832d96f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -594,9 +594,17 @@ int mwait_usable(const struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; + /* Use mwait if idle=mwait boot option is given */ if (boot_option_idle_override == IDLE_FORCE_MWAIT) return 1; + /* + * Any idle= boot option other than idle=mwait means that we must not + * use mwait. Eg: idle=halt or idle=poll or idle=nomwait + */ + if (boot_option_idle_override != IDLE_NO_OVERRIDE) + return 0; + if (c->cpuid_level < MWAIT_INFO) return 0; -- cgit v1.2.3 From 74d24b219bc4ebb20b75d63af2bb577bc1b10b5e Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 26 Apr 2012 15:32:55 +0800 Subject: resources: add resource_overlaps() Add resource_overlaps(), which returns true if two resources overlap at all. Use this to replace the complicated check in coalesce_windows(). 
Signed-Off-By: Wei Yang Signed-off-by: Bjorn Helgaas --- arch/x86/pci/acpi.c | 12 +----------- include/linux/ioport.h | 7 +++++++ 2 files changed, 8 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 8a17b23f8c84..fc09c2754e08 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -245,13 +245,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data) return AE_OK; } -static bool resource_contains(struct resource *res, resource_size_t point) -{ - if (res->start <= point && point <= res->end) - return true; - return false; -} - static void coalesce_windows(struct pci_root_info *info, unsigned long type) { int i, j; @@ -272,10 +265,7 @@ static void coalesce_windows(struct pci_root_info *info, unsigned long type) * our resources no longer match the ACPI _CRS, but * the kernel resource tree doesn't allow overlaps. */ - if (resource_contains(res1, res2->start) || - resource_contains(res1, res2->end) || - resource_contains(res2, res1->start) || - resource_contains(res2, res1->end)) { + if (resource_overlaps(res1, res2)) { res1->start = min(res1->start, res2->start); res1->end = max(res1->end, res2->end); dev_info(&info->bridge->dev, diff --git a/include/linux/ioport.h b/include/linux/ioport.h index e885ba23de70..589e0e75efae 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -223,5 +223,12 @@ extern int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)); +/* True if any part of r1 overlaps r2 */ +static inline bool resource_overlaps(struct resource *r1, struct resource *r2) +{ + return (r1->start <= r2->end && r1->end >= r2->start); +} + + #endif /* __ASSEMBLY__ */ #endif /* _LINUX_IOPORT_H */ -- cgit v1.2.3 From 9438ef7f4ea73d5430a330fc206f97826eb9fb16 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 7 May 2012 19:19:56 +0200 Subject: x86/apic: Fix UP boot crash Commit 31b3c9d72340 ("xen/x86: 
Implement x86_apic_ops") implemented this: ... without considering that on UP the function pointer might be NULL. Cc: Suresh Siddha Cc: Konrad Rzeszutek Wilk Link: http://lkml.kernel.org/n/tip-3pfty0ml4yp62phbkchichh0@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 8526317c5f0b..7e67c5a71061 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1012,7 +1012,8 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); init_apic_mappings(); - x86_io_apic_ops.init(); + if (x86_io_apic_ops.init) + x86_io_apic_ops.init(); kvm_guest_init(); -- cgit v1.2.3 From e826abd523913f63eb03b59746ffb16153c53dc4 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Sun, 6 May 2012 11:11:04 -0600 Subject: x86, microcode: microcode_core.c simple_strtoul cleanup Change reload_for_cpu() in kernel/microcode_core.c to call kstrtoul() instead of calling obsoleted simple_strtoul(). Signed-off-by: Shuah Khan Reviewed-by: Borislav Petkov Link: http://lkml.kernel.org/r/1336324264.2897.9.camel@lorien2 Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/microcode_core.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index c9bda6d6035c..fbdfc6917180 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -299,12 +299,11 @@ static ssize_t reload_store(struct device *dev, { unsigned long val; int cpu = dev->id; - int ret = 0; - char *end; + ssize_t ret = 0; - val = simple_strtoul(buf, &end, 0); - if (end == buf) - return -EINVAL; + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; if (val == 1) { get_online_cpus(); -- cgit v1.2.3 From ca1182387e57470460294ce1e39e2d5518809811 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 30 Mar 2012 15:37:07 -0400 Subject: xen/setup: Only print "Freeing XXX-YYY pfn range: Z pages freed" if Z > 0 Otherwise we can get these meaningless: Freeing bad80-badf4 pfn range: 0 pages freed We also can do this for the summary ones - no point of printing "Set 0 page(s) to 1-1 mapping" Acked-by: David Vrabel [v1: Extended to the summary printks] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 1ba8dff26753..7b0ab77b8479 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -114,8 +114,9 @@ static unsigned long __init xen_release_chunk(unsigned long start, len++; } } - printk(KERN_INFO "Freeing %lx-%lx pfn range: %lu pages freed\n", - start, end, len); + if (len) + printk(KERN_INFO "Freeing %lx-%lx pfn range: %lu pages freed\n", + start, end, len); return len; } @@ -162,8 +163,10 @@ static unsigned long __init xen_set_identity_and_release( } } - printk(KERN_INFO "Released %lu pages of unused memory\n", released); - printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity); + if (released) + printk(KERN_INFO "Released 
%lu pages of unused memory\n", released); + if (identity) + printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity); return released; } -- cgit v1.2.3 From 2e2fb75475c2fc74c98100f1468c8195fee49f3b Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 6 Apr 2012 10:07:11 -0400 Subject: xen/setup: Populate freed MFNs from non-RAM E820 entries and gaps to E820 RAM When the Xen hypervisor boots a PV kernel it hands it two pieces of information: nr_pages and a made up E820 entry. The nr_pages value defines the range from zero to nr_pages of PFNs which have a valid Machine Frame Number (MFN) underneath it. The E820 mirrors that (with the VGA hole): BIOS-provided physical RAM map: Xen: 0000000000000000 - 00000000000a0000 (usable) Xen: 00000000000a0000 - 0000000000100000 (reserved) Xen: 0000000000100000 - 0000000080800000 (usable) The fun comes when a PV guest that is run with a machine E820 - that can either be the initial domain or a PCI PV guest, where the E820 looks like the normal thing: BIOS-provided physical RAM map: Xen: 0000000000000000 - 000000000009e000 (usable) Xen: 000000000009ec00 - 0000000000100000 (reserved) Xen: 0000000000100000 - 0000000020000000 (usable) Xen: 0000000020000000 - 0000000020200000 (reserved) Xen: 0000000020200000 - 0000000040000000 (usable) Xen: 0000000040000000 - 0000000040200000 (reserved) Xen: 0000000040200000 - 00000000bad80000 (usable) Xen: 00000000bad80000 - 00000000badc9000 (ACPI NVS) .. With that overlaying the nr_pages directly on the E820 does not work as there are gaps and non-RAM regions that won't be used by the memory allocator. 
The 'xen_release_chunk' helps with that by punching holes in the P2M (PFN to MFN lookup tree) for those regions and tells us that: Freeing 20000-20200 pfn range: 512 pages freed Freeing 40000-40200 pfn range: 512 pages freed Freeing bad80-badf4 pfn range: 116 pages freed Freeing badf6-bae7f pfn range: 137 pages freed Freeing bb000-100000 pfn range: 282624 pages freed Released 283999 pages of unused memory Those 283999 pages are subtracted from the nr_pages and are returned to the hypervisor. The end result is that the initial domain boots with 1GB less memory as the nr_pages has been subtracted by the amount of pages residing within the PCI hole. It can balloon up to that if desired using 'xl mem-set 0 8092', but the balloon driver is not always compiled in for the initial domain. This patch implements the populate hypercall (XENMEM_populate_physmap) which increases the domain with the same amount of pages that were released. The other solution (that did not work) was to transplant the MFN in the P2M tree - the ones that were going to be freed were put in the E820_RAM regions past the nr_pages. But the modifications to the M2P array (the other side of creating PTEs) were not carried away. As the hypervisor is the only one capable of modifying that and the only two hypercalls that would do this are: the update_va_mapping (which won't work, as during initial bootup only PFNs up to nr_pages are mapped in the guest) or via the populate hypercall. The end result is that the kernel can now boot with the nr_pages without having to subtract the 283999 pages. 
On a 8GB machine, with various dom0_mem= parameters this is what we get: no dom0_mem -Memory: 6485264k/9435136k available (5817k kernel code, 1136060k absent, 1813812k reserved, 2899k data, 696k init) +Memory: 7619036k/9435136k available (5817k kernel code, 1136060k absent, 680040k reserved, 2899k data, 696k init) dom0_mem=3G -Memory: 2616536k/9435136k available (5817k kernel code, 1136060k absent, 5682540k reserved, 2899k data, 696k init) +Memory: 2703776k/9435136k available (5817k kernel code, 1136060k absent, 5595300k reserved, 2899k data, 696k init) dom0_mem=max:3G -Memory: 2696732k/4281724k available (5817k kernel code, 1136060k absent, 448932k reserved, 2899k data, 696k init) +Memory: 2702204k/4281724k available (5817k kernel code, 1136060k absent, 443460k reserved, 2899k data, 696k init) And the 'xm list' or 'xl list' now reflect what the dom0_mem= argument is. Acked-by: David Vrabel [v2: Use populate hypercall] [v3: Remove debug printks] [v4: Simplify code] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 112 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 7b0ab77b8479..710af36e6dfb 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -26,7 +26,6 @@ #include #include #include - #include "xen-ops.h" #include "vdso.h" @@ -120,7 +119,105 @@ static unsigned long __init xen_release_chunk(unsigned long start, return len; } +static unsigned long __init xen_populate_physmap(unsigned long start, + unsigned long end) +{ + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + unsigned long len = 0; + int ret; + + for (pfn = start; pfn < end; pfn++) { + unsigned long frame; + + /* Make sure pfn does not exists to start with */ + if (pfn_to_mfn(pfn) != INVALID_P2M_ENTRY) + continue; + frame = pfn; + 
set_xen_guest_handle(reservation.extent_start, &frame); + reservation.nr_extents = 1; + + ret = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); + WARN(ret != 1, "Failed to populate pfn %lx err=%d\n", pfn, ret); + if (ret == 1) { + if (!early_set_phys_to_machine(pfn, frame)) { + set_xen_guest_handle(reservation.extent_start, &frame); + reservation.nr_extents = 1; + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &reservation); + break; + } + len++; + } else + break; + } + if (len) + printk(KERN_INFO "Populated %lx-%lx pfn range: %lu pages added\n", + start, end, len); + return len; +} +static unsigned long __init xen_populate_chunk( + const struct e820entry *list, size_t map_size, + unsigned long max_pfn, unsigned long *last_pfn, + unsigned long credits_left) +{ + const struct e820entry *entry; + unsigned int i; + unsigned long done = 0; + unsigned long dest_pfn; + + for (i = 0, entry = list; i < map_size; i++, entry++) { + unsigned long credits = credits_left; + unsigned long s_pfn; + unsigned long e_pfn; + unsigned long pfns; + long capacity; + + if (credits <= 0) + break; + + if (entry->type != E820_RAM) + continue; + + e_pfn = PFN_UP(entry->addr + entry->size); + + /* We only care about E820 after the xen_start_info->nr_pages */ + if (e_pfn <= max_pfn) + continue; + + s_pfn = PFN_DOWN(entry->addr); + /* If the E820 falls within the nr_pages, we want to start + * at the nr_pages PFN. + * If that would mean going past the E820 entry, skip it + */ + if (s_pfn <= max_pfn) { + capacity = e_pfn - max_pfn; + dest_pfn = max_pfn; + } else { + /* last_pfn MUST be within E820_RAM regions */ + if (*last_pfn && e_pfn >= *last_pfn) + s_pfn = *last_pfn; + capacity = e_pfn - s_pfn; + dest_pfn = s_pfn; + } + /* If we had filled this E820_RAM entry, go to the next one. 
*/ + if (capacity <= 0) + continue; + + if (credits > capacity) + credits = capacity; + + pfns = xen_populate_physmap(dest_pfn, dest_pfn + credits); + done += pfns; + credits_left -= pfns; + *last_pfn = (dest_pfn + pfns); + } + return done; +} static unsigned long __init xen_set_identity_and_release( const struct e820entry *list, size_t map_size, unsigned long nr_pages) { @@ -143,7 +240,6 @@ static unsigned long __init xen_set_identity_and_release( */ for (i = 0, entry = list; i < map_size; i++, entry++) { phys_addr_t end = entry->addr + entry->size; - if (entry->type == E820_RAM || i == map_size - 1) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); @@ -220,7 +316,9 @@ char * __init xen_memory_setup(void) int rc; struct xen_memory_map memmap; unsigned long max_pages; + unsigned long last_pfn = 0; unsigned long extra_pages = 0; + unsigned long populated; int i; int op; @@ -260,8 +358,19 @@ char * __init xen_memory_setup(void) */ xen_released_pages = xen_set_identity_and_release( map, memmap.nr_entries, max_pfn); - extra_pages += xen_released_pages; + /* + * Populate back the non-RAM pages and E820 gaps that had been + * released. */ + populated = xen_populate_chunk(map, memmap.nr_entries, + max_pfn, &last_pfn, xen_released_pages); + + extra_pages += (xen_released_pages - populated); + + if (last_pfn > max_pfn) { + max_pfn = min(MAX_DOMAIN_PAGES, last_pfn); + mem_end = PFN_PHYS(max_pfn); + } /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO * factor the base size. 
On non-highmem systems, the base @@ -275,7 +384,6 @@ char * __init xen_memory_setup(void) */ extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), extra_pages); - i = 0; while (i < memmap.nr_entries) { u64 addr = map[i].addr; -- cgit v1.2.3 From 96dc08b35c4af8cb5810450602590706f2593a5f Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 6 Apr 2012 16:10:20 -0400 Subject: xen/setup: Combine the two hypercall functions - since they are quite similar. They use the same set of arguments, so it is just the matter of using the proper hypercall. Acked-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 81 +++++++++++++++++++--------------------------------- 1 file changed, 30 insertions(+), 51 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 710af36e6dfb..30ac05a8d28f 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -83,8 +83,8 @@ static void __init xen_add_extra_mem(u64 start, u64 size) __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } -static unsigned long __init xen_release_chunk(unsigned long start, - unsigned long end) +static unsigned long __init xen_do_chunk(unsigned long start, + unsigned long end, bool release) { struct xen_memory_reservation reservation = { .address_bits = 0, @@ -95,60 +95,36 @@ static unsigned long __init xen_release_chunk(unsigned long start, unsigned long pfn; int ret; - for(pfn = start; pfn < end; pfn++) { - unsigned long mfn = pfn_to_mfn(pfn); - - /* Make sure pfn exists to start with */ - if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) - continue; - - set_xen_guest_handle(reservation.extent_start, &mfn); - reservation.nr_extents = 1; - - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, - &reservation); - WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret); - if (ret == 1) { - __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); - len++; - } - } - if (len) - printk(KERN_INFO "Freeing %lx-%lx pfn range: 
%lu pages freed\n", - start, end, len); - - return len; -} -static unsigned long __init xen_populate_physmap(unsigned long start, - unsigned long end) -{ - struct xen_memory_reservation reservation = { - .address_bits = 0, - .extent_order = 0, - .domid = DOMID_SELF - }; - unsigned long len = 0; - int ret; - for (pfn = start; pfn < end; pfn++) { unsigned long frame; + unsigned long mfn = pfn_to_mfn(pfn); - /* Make sure pfn does not exists to start with */ - if (pfn_to_mfn(pfn) != INVALID_P2M_ENTRY) - continue; - - frame = pfn; + if (release) { + /* Make sure pfn exists to start with */ + if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) + continue; + frame = mfn; + } else { + if (mfn != INVALID_P2M_ENTRY) + continue; + frame = pfn; + } set_xen_guest_handle(reservation.extent_start, &frame); reservation.nr_extents = 1; - ret = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); - WARN(ret != 1, "Failed to populate pfn %lx err=%d\n", pfn, ret); + ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap, + &reservation); + WARN(ret != 1, "Failed to %s pfn %lx err=%d\n", + release ? "release" : "populate", pfn, ret); + if (ret == 1) { - if (!early_set_phys_to_machine(pfn, frame)) { + if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) { + if (release) + break; set_xen_guest_handle(reservation.extent_start, &frame); reservation.nr_extents = 1; ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, - &reservation); + &reservation); break; } len++; @@ -156,8 +132,11 @@ static unsigned long __init xen_populate_physmap(unsigned long start, break; } if (len) - printk(KERN_INFO "Populated %lx-%lx pfn range: %lu pages added\n", - start, end, len); + printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n", + release ? "Freeing" : "Populating", + start, end, len, + release ? 
"freed" : "added"); + return len; } static unsigned long __init xen_populate_chunk( @@ -211,7 +190,7 @@ static unsigned long __init xen_populate_chunk( if (credits > capacity) credits = capacity; - pfns = xen_populate_physmap(dest_pfn, dest_pfn + credits); + pfns = xen_do_chunk(dest_pfn, dest_pfn + credits, false); done += pfns; credits_left -= pfns; *last_pfn = (dest_pfn + pfns); @@ -249,8 +228,8 @@ static unsigned long __init xen_set_identity_and_release( if (start_pfn < end_pfn) { if (start_pfn < nr_pages) - released += xen_release_chunk( - start_pfn, min(end_pfn, nr_pages)); + released += xen_do_chunk( + start_pfn, min(end_pfn, nr_pages), true); identity += set_phys_range_identity( start_pfn, end_pfn); -- cgit v1.2.3 From 83d51ab473dddde7df858015070ed22b84ebe9a9 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 3 May 2012 16:15:42 +0100 Subject: xen/setup: update VA mapping when releasing memory during setup In xen_memory_setup(), if a page that is being released has a VA mapping this must also be updated. Otherwise, the page will be not released completely -- it will still be referenced in Xen and won't be freed util the mapping is removed and this prevents it from being reallocated at a different PFN. This was already being done for the ISA memory region in xen_ident_map_ISA() but on many systems this was omitting a few pages as many systems marked a few pages below the ISA memory region as reserved in the e820 map. 
This fixes errors such as: (XEN) page_alloc.c:1148:d0 Over-allocation for domain 0: 2097153 > 2097152 (XEN) memory.c:133:d0 Could not allocate order=0 extent: id=0 memflags=0 (0 of 17) Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 1 - arch/x86/xen/mmu.c | 23 ----------------------- arch/x86/xen/setup.c | 41 ++++++++++++++++++++++++++++++++++------- arch/x86/xen/xen-ops.h | 1 - 4 files changed, 34 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index fe06bf4ef0e3..ac90e5629508 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1308,7 +1308,6 @@ asmlinkage void __init xen_start_kernel(void) xen_raw_console_write("mapping kernel into physical memory\n"); pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); - xen_ident_map_ISA(); /* Allocate and initialize top and mid mfn levels for p2m structure */ xen_build_mfn_list_list(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 91dc2871e336..c9a351925a0c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1929,29 +1929,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif } -void __init xen_ident_map_ISA(void) -{ - unsigned long pa; - - /* - * If we're dom0, then linear map the ISA machine addresses into - * the kernel's address space. 
- */ - if (!xen_initial_domain()) - return; - - xen_raw_printk("Xen: setup ISA identity maps\n"); - - for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) { - pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO); - - if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0)) - BUG(); - } - - xen_flush_tlb(); -} - static void __init xen_post_allocator_init(void) { pv_mmu_ops.set_pte = xen_set_pte; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 30ac05a8d28f..3ebba0753d38 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -139,6 +139,13 @@ static unsigned long __init xen_do_chunk(unsigned long start, return len; } + +static unsigned long __init xen_release_chunk(unsigned long start, + unsigned long end) +{ + return xen_do_chunk(start, end, true); +} + static unsigned long __init xen_populate_chunk( const struct e820entry *list, size_t map_size, unsigned long max_pfn, unsigned long *last_pfn, @@ -197,6 +204,29 @@ static unsigned long __init xen_populate_chunk( } return done; } + +static void __init xen_set_identity_and_release_chunk( + unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, + unsigned long *released, unsigned long *identity) +{ + unsigned long pfn; + + /* + * If the PFNs are currently mapped, the VA mapping also needs + * to be updated to be 1:1. 
+ */ + for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) + (void)HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + mfn_pte(pfn, PAGE_KERNEL_IO), 0); + + if (start_pfn < nr_pages) + *released += xen_release_chunk( + start_pfn, min(end_pfn, nr_pages)); + + *identity += set_phys_range_identity(start_pfn, end_pfn); +} + static unsigned long __init xen_set_identity_and_release( const struct e820entry *list, size_t map_size, unsigned long nr_pages) { @@ -226,14 +256,11 @@ static unsigned long __init xen_set_identity_and_release( if (entry->type == E820_RAM) end_pfn = PFN_UP(entry->addr); - if (start_pfn < end_pfn) { - if (start_pfn < nr_pages) - released += xen_do_chunk( - start_pfn, min(end_pfn, nr_pages), true); + if (start_pfn < end_pfn) + xen_set_identity_and_release_chunk( + start_pfn, end_pfn, nr_pages, + &released, &identity); - identity += set_phys_range_identity( - start_pfn, end_pfn); - } start = end; } } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index b095739ccd4c..506fa08d934a 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -28,7 +28,6 @@ void xen_setup_shared_info(void); void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); -void xen_ident_map_ISA(void); void xen_reserve_top(void); extern unsigned long xen_max_p2m_pfn; -- cgit v1.2.3 From f447d56d36af18c5104ff29dcb1327c0c0ac3634 Mon Sep 17 00:00:00 2001 From: Ben Guthro Date: Sat, 21 Apr 2012 00:11:04 +0800 Subject: xen: implement apic ipi interface Map native ipi vector to xen vector. Implement apic ipi interface with xen_send_IPI_one. 
Tested-by: Steven Noonan Signed-off-by: Ben Guthro Signed-off-by: Lin Ming Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 9 ++++++ arch/x86/xen/smp.c | 81 +++++++++++++++++++++++++++++++++++++++++++++--- arch/x86/xen/smp.h | 12 +++++++ 3 files changed, 98 insertions(+), 4 deletions(-) create mode 100644 arch/x86/xen/smp.h (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 4f51bebac02c..1ed61c2bf633 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -74,6 +74,7 @@ #include "xen-ops.h" #include "mmu.h" +#include "smp.h" #include "multicalls.h" EXPORT_SYMBOL_GPL(hypercall_page); @@ -849,6 +850,14 @@ static void set_xen_basic_apic_ops(void) apic->icr_write = xen_apic_icr_write; apic->wait_icr_idle = xen_apic_wait_icr_idle; apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle; + +#ifdef CONFIG_SMP + apic->send_IPI_allbutself = xen_send_IPI_allbutself; + apic->send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself; + apic->send_IPI_mask = xen_send_IPI_mask; + apic->send_IPI_all = xen_send_IPI_all; + apic->send_IPI_self = xen_send_IPI_self; +#endif } #endif diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 5fac6919b957..2dc6628c1520 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -465,8 +465,8 @@ static void xen_smp_send_reschedule(int cpu) xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); } -static void xen_send_IPI_mask(const struct cpumask *mask, - enum ipi_vector vector) +static void __xen_send_IPI_mask(const struct cpumask *mask, + int vector) { unsigned cpu; @@ -478,7 +478,7 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask) { int cpu; - xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); + __xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); /* Make sure other vcpus get a chance to run if they need to. 
*/ for_each_cpu(cpu, mask) { @@ -491,10 +491,83 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask) static void xen_smp_send_call_function_single_ipi(int cpu) { - xen_send_IPI_mask(cpumask_of(cpu), + __xen_send_IPI_mask(cpumask_of(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); } +static inline int xen_map_vector(int vector) +{ + int xen_vector; + + switch (vector) { + case RESCHEDULE_VECTOR: + xen_vector = XEN_RESCHEDULE_VECTOR; + break; + case CALL_FUNCTION_VECTOR: + xen_vector = XEN_CALL_FUNCTION_VECTOR; + break; + case CALL_FUNCTION_SINGLE_VECTOR: + xen_vector = XEN_CALL_FUNCTION_SINGLE_VECTOR; + break; + default: + xen_vector = -1; + printk(KERN_ERR "xen: vector 0x%x is not implemented\n", + vector); + } + + return xen_vector; +} + +void xen_send_IPI_mask(const struct cpumask *mask, + int vector) +{ + int xen_vector = xen_map_vector(vector); + + if (xen_vector >= 0) + __xen_send_IPI_mask(mask, xen_vector); +} + +void xen_send_IPI_all(int vector) +{ + int xen_vector = xen_map_vector(vector); + + if (xen_vector >= 0) + __xen_send_IPI_mask(cpu_online_mask, xen_vector); +} + +void xen_send_IPI_self(int vector) +{ + int xen_vector = xen_map_vector(vector); + + if (xen_vector >= 0) + xen_send_IPI_one(smp_processor_id(), xen_vector); +} + +void xen_send_IPI_mask_allbutself(const struct cpumask *mask, + int vector) +{ + unsigned cpu; + unsigned int this_cpu = smp_processor_id(); + + if (!(num_online_cpus() > 1)) + return; + + for_each_cpu_and(cpu, mask, cpu_online_mask) { + if (this_cpu == cpu) + continue; + + xen_smp_send_call_function_single_ipi(cpu); + } +} + +void xen_send_IPI_allbutself(int vector) +{ + int xen_vector = xen_map_vector(vector); + + if (xen_vector >= 0) + xen_send_IPI_mask_allbutself(cpu_online_mask, xen_vector); +} + static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) { irq_enter(); diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h new file mode 100644 index 000000000000..8981a76d081a --- /dev/null +++ 
b/arch/x86/xen/smp.h @@ -0,0 +1,12 @@ +#ifndef _XEN_SMP_H + +extern void xen_send_IPI_mask(const struct cpumask *mask, + int vector); +extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask, + int vector); +extern void xen_send_IPI_allbutself(int vector); +extern void physflat_send_IPI_allbutself(int vector); +extern void xen_send_IPI_all(int vector); +extern void xen_send_IPI_self(int vector); + +#endif -- cgit v1.2.3 From 1ff2b0c303698e486f1e0886b4d9876200ef8ca5 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Sat, 21 Apr 2012 00:11:05 +0800 Subject: xen: implement IRQ_WORK_VECTOR handler Signed-off-by: Lin Ming Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/events.h | 1 + arch/x86/xen/smp.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index 1df35417c412..cc146d51449e 100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -6,6 +6,7 @@ enum ipi_vector { XEN_CALL_FUNCTION_VECTOR, XEN_CALL_FUNCTION_SINGLE_VECTOR, XEN_SPIN_UNLOCK_VECTOR, + XEN_IRQ_WORK_VECTOR, XEN_NR_IPIS, }; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 2dc6628c1520..3ec3f8eb19fc 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -41,10 +42,12 @@ cpumask_var_t xen_cpu_initialized_map; static DEFINE_PER_CPU(int, xen_resched_irq); static DEFINE_PER_CPU(int, xen_callfunc_irq); static DEFINE_PER_CPU(int, xen_callfuncsingle_irq); +static DEFINE_PER_CPU(int, xen_irq_work); static DEFINE_PER_CPU(int, xen_debug_irq) = -1; static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); +static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id); /* * Reschedule call back. 
@@ -143,6 +146,17 @@ static int xen_smp_intr_init(unsigned int cpu) goto fail; per_cpu(xen_callfuncsingle_irq, cpu) = rc; + callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu); + rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR, + cpu, + xen_irq_work_interrupt, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + callfunc_name, + NULL); + if (rc < 0) + goto fail; + per_cpu(xen_irq_work, cpu) = rc; + return 0; fail: @@ -155,6 +169,8 @@ static int xen_smp_intr_init(unsigned int cpu) if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0) unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); + if (per_cpu(xen_irq_work, cpu) >= 0) + unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL); return rc; } @@ -509,6 +525,9 @@ static inline int xen_map_vector(int vector) case CALL_FUNCTION_SINGLE_VECTOR: xen_vector = XEN_CALL_FUNCTION_SINGLE_VECTOR; break; + case IRQ_WORK_VECTOR: + xen_vector = XEN_IRQ_WORK_VECTOR; + break; default: xen_vector = -1; printk(KERN_ERR "xen: vector 0x%x is not implemented\n", @@ -588,6 +607,16 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id) +{ + irq_enter(); + irq_work_run(); + inc_irq_stat(apic_irq_work_irqs); + irq_exit(); + + return IRQ_HANDLED; +} + static const struct smp_ops xen_smp_ops __initconst = { .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, .smp_prepare_cpus = xen_smp_prepare_cpus, @@ -634,6 +663,7 @@ static void xen_hvm_cpu_die(unsigned int cpu) unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL); native_cpu_die(cpu); } -- cgit v1.2.3 From 211063dc159695bd6072c5393e9bc729481c6ede Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 8 Dec 2011 17:32:23 +0800 Subject: xen/acpi/sleep: Enable ACPI sleep via 
the __acpi_os_prepare_sleep Provide the registration callback to call in the Xen's ACPI sleep functionality. This means that during S3/S5 we make a hypercall XENPF_enter_acpi_sleep with the proper PM1A/PM1B registers. Based of Ke Yu's initial idea. [ From http://xenbits.xensource.com/linux-2.6.18-xen.hg change c68699484a65 ] [v1: Added Copyright and license] [v2: Added check if PM1A/B the 16-bits MSB contain something. The spec only uses 16-bits but might have more in future] Signed-off-by: Liang Tang Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 3 +++ drivers/xen/Makefile | 2 +- drivers/xen/acpi.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++ include/xen/acpi.h | 58 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 drivers/xen/acpi.c create mode 100644 include/xen/acpi.h (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1ed61c2bf633..eca90e5be1e7 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -1373,6 +1374,8 @@ asmlinkage void __init xen_start_kernel(void) /* Make sure ACS will be enabled */ pci_request_acs(); + + xen_acpi_sleep_register(); } diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 9adc5be57b13..fc3488631136 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -17,7 +17,7 @@ obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o obj-$(CONFIG_XEN_PVHVM) += platform-pci.o obj-$(CONFIG_XEN_TMEM) += tmem.o obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o -obj-$(CONFIG_XEN_DOM0) += pci.o +obj-$(CONFIG_XEN_DOM0) += pci.o acpi.o obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback/ obj-$(CONFIG_XEN_PRIVCMD) += xen-privcmd.o obj-$(CONFIG_XEN_ACPI_PROCESSOR) += xen-acpi-processor.o diff --git a/drivers/xen/acpi.c b/drivers/xen/acpi.c new file mode 100644 index 000000000000..119d42a2bf57 --- /dev/null +++ 
b/drivers/xen/acpi.c @@ -0,0 +1,62 @@ +/****************************************************************************** + * acpi.c + * acpi file for domain 0 kernel + * + * Copyright (c) 2011 Konrad Rzeszutek Wilk + * Copyright (c) 2011 Yu Ke ke.yu@intel.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include + +int xen_acpi_notify_hypervisor_state(u8 sleep_state, + u32 pm1a_cnt, u32 pm1b_cnt) +{ + struct xen_platform_op op = { + .cmd = XENPF_enter_acpi_sleep, + .interface_version = XENPF_INTERFACE_VERSION, + .u = { + .enter_acpi_sleep = { + .pm1a_cnt_val = (u16)pm1a_cnt, + .pm1b_cnt_val = (u16)pm1b_cnt, + .sleep_state = sleep_state, + }, + }, + }; + + if ((pm1a_cnt & 0xffff0000) || (pm1b_cnt & 0xffff0000)) { + WARN(1, "Using more than 16bits of PM1A/B 0x%x/0x%x!" + "Email xen-devel@lists.xensource.com Thank you.\n", \ + pm1a_cnt, pm1b_cnt); + return -1; + } + + HYPERVISOR_dom0_op(&op); + return 1; +} diff --git a/include/xen/acpi.h b/include/xen/acpi.h new file mode 100644 index 000000000000..48a9c0171b65 --- /dev/null +++ b/include/xen/acpi.h @@ -0,0 +1,58 @@ +/****************************************************************************** + * acpi.h + * acpi file for domain 0 kernel + * + * Copyright (c) 2011 Konrad Rzeszutek Wilk + * Copyright (c) 2011 Yu Ke + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef _XEN_ACPI_H +#define _XEN_ACPI_H + +#include + +#ifdef CONFIG_XEN_DOM0 +#include +#include +#include + +int xen_acpi_notify_hypervisor_state(u8 sleep_state, + u32 pm1a_cnt, u32 pm1b_cnd); + +static inline void xen_acpi_sleep_register(void) +{ + if (xen_initial_domain()) + acpi_os_set_prepare_sleep( + &xen_acpi_notify_hypervisor_state); +} +#else +static inline void xen_acpi_sleep_register(void) +{ +} +#endif + +#endif /* _XEN_ACPI_H */ -- cgit v1.2.3 From 399988eea194a8453e283fdd2da968d1fd39a7cf Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 8 May 2012 00:08:52 -0700 Subject: irq_remap: Fix compiler warning with CONFIG_IRQ_REMAP=y MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the below compiler warning: arch/x86/include/asm/irq_remapping.h:72:19: warning: ‘struct IO_APIC_route_entry’ declared inside parameter list [enabled by default] Signed-off-by: Suresh Siddha Cc: joro@8bytes.org Cc: iommu@lists.linux-foundation.org Cc: Joerg Roedel Link: http://lkml.kernel.org/r/1336460934-23592-1-git-send-email-suresh.b.siddha@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq_remapping.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index dcb0c7231028..5fb9bbbd2f14 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -22,11 +22,9 @@ #ifndef 
__X86_IRQ_REMAPPING_H #define __X86_IRQ_REMAPPING_H -#ifdef CONFIG_IRQ_REMAP +#include -struct IO_APIC_route_entry; -struct io_apic_irq_attr; -struct pci_dev; +#ifdef CONFIG_IRQ_REMAP extern int irq_remapping_enabled; -- cgit v1.2.3 From d1ecad6eee8629c6b425580aad76cf99b85956e9 Mon Sep 17 00:00:00 2001 From: Márton Németh Date: Tue, 8 May 2012 00:24:20 -0700 Subject: x86/apic: Only compile local function if used with !CONFIG_GENERIC_PENDING_IRQ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The local function io_apic_level_ack_pending() is only called from io_apic_level_ack_pending(). The later function is only compiled if CONFIG_GENERIC_PENDING_IRQ is defined. Move the io_apic_level_ack_pending() to the existing #ifdef CONFIG_GENERIC_PENDING_IRQ code block. This will remove the following warning message during compiling without CONFIG_GENERIC_PENDING_IRQ defined: * arch/x86/kernel/apic/io_apic.c:382: warning: ‘io_apic_level_ack_pending’ defined but not used Signed-off-by: Márton Németh Signed-off-by: Suresh Siddha Cc: Yinghai Lu Cc: Naga Chumbalkar Link: http://lkml.kernel.org/r/1336461860.2296.3.camel@sbsiddha-mobl2 Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 46 +++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 973539c128a4..e245365670a4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -346,29 +346,6 @@ void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val writel(value, &io_apic->data); } -static bool io_apic_level_ack_pending(struct irq_cfg *cfg) -{ - struct irq_pin_list *entry; - unsigned long flags; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - for_each_irq_pin(entry, cfg->irq_2_pin) { - unsigned int reg; - int pin; - - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin*2); 
- /* Is the remote IRR bit set? */ - if (reg & IO_APIC_REDIR_REMOTE_IRR) { - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return true; - } - } - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - - return false; -} - union entry_union { struct { u32 w1, w2; }; struct IO_APIC_route_entry entry; @@ -2519,6 +2496,29 @@ static void ack_apic_edge(struct irq_data *data) atomic_t irq_mis_count; #ifdef CONFIG_GENERIC_PENDING_IRQ +static bool io_apic_level_ack_pending(struct irq_cfg *cfg) +{ + struct irq_pin_list *entry; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, cfg->irq_2_pin) { + unsigned int reg; + int pin; + + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + /* Is the remote IRR bit set? */ + if (reg & IO_APIC_REDIR_REMOTE_IRR) { + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return true; + } + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return false; +} + static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) { /* If we are moving the irq we need to mask it */ -- cgit v1.2.3 From 85f7f656274fa0ba109dd8774db3887d42de5c6b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 May 2012 17:59:49 +0000 Subject: x86: Use kick_all_cpus_sync() Use kick_all_cpus_sync() and remove cpu_idle_wait(). 
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120507175652.190382227@linutronix.de Cc: x86@kernel.org --- arch/x86/Kconfig | 3 --- arch/x86/include/asm/processor.h | 2 -- arch/x86/kernel/apm_32.c | 2 +- arch/x86/kernel/process.c | 20 -------------------- 4 files changed, 1 insertion(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 046bf4bd2510..98876f55a2e0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -161,9 +161,6 @@ config RWSEM_GENERIC_SPINLOCK config RWSEM_XCHGADD_ALGORITHM def_bool X86_XADD -config ARCH_HAS_CPU_IDLE_WAIT - def_bool y - config GENERIC_CALIBRATE_DELAY def_bool y diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 4fa7dcceb6c0..ccbb1ea99ccb 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -974,8 +974,6 @@ extern bool cpu_has_amd_erratum(const int *); #define cpu_has_amd_erratum(x) (false) #endif /* CONFIG_CPU_SUP_AMD */ -void cpu_idle_wait(void); - extern unsigned long arch_align_stack(unsigned long sp); extern void free_init_pages(char *what, unsigned long begin, unsigned long end); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 459e78cbf61e..07b0c0db466c 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -2401,7 +2401,7 @@ static void __exit apm_exit(void) * (pm_idle), Wait for all processors to update cached/local * copies of pm_idle before proceeding. 
*/ - cpu_idle_wait(); + kick_all_cpus_sync(); } if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0) && (apm_info.connection_version > 0x0100)) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 8aa532fa015d..8215458f6af5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -525,26 +525,6 @@ void stop_this_cpu(void *dummy) } } -static void do_nothing(void *unused) -{ -} - -/* - * cpu_idle_wait - Used to ensure that all the CPUs discard old value of - * pm_idle and update to new pm_idle value. Required while changing pm_idle - * handler on SMP systems. - * - * Caller must have changed pm_idle to the new value before the call. Old - * pm_idle value will not be used by any CPU after the return of this function. - */ -void cpu_idle_wait(void) -{ - smp_mb(); - /* kick all the CPUs so that they exit out of pm_idle */ - smp_call_function(do_nothing, NULL, 1); -} -EXPORT_SYMBOL_GPL(cpu_idle_wait); - /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { -- cgit v1.2.3 From 6c0a9fa62feb7e9fdefa9720bcc03040c9b0b311 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 5 May 2012 15:05:40 +0000 Subject: fork: Remove the weak insanity We error out when compiling with gcc4.1.[01] as it miscompiles __weak. The workaround with magic defines is not longer necessary. Make it __weak again. 
Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20120505150141.306358267@linutronix.de --- arch/sh/include/asm/thread_info.h | 1 - arch/x86/include/asm/thread_info.h | 1 - kernel/fork.c | 8 +------- 3 files changed, 1 insertion(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h index 20ee40af16e9..09963d4018cb 100644 --- a/arch/sh/include/asm/thread_info.h +++ b/arch/sh/include/asm/thread_info.h @@ -98,7 +98,6 @@ static inline struct thread_info *current_thread_info(void) extern struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node); extern void free_thread_info(struct thread_info *ti); extern void arch_task_cache_init(void); -#define arch_task_cache_init arch_task_cache_init extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); extern void init_thread_xstate(void); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index ad6df8ccd715..8692a166dd4e 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -284,6 +284,5 @@ static inline bool is_ia32_task(void) extern void arch_task_cache_init(void); extern void free_thread_info(struct thread_info *ti); extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); -#define arch_task_cache_init arch_task_cache_init #endif #endif /* _ASM_X86_THREAD_INFO_H */ diff --git a/kernel/fork.c b/kernel/fork.c index b9372a0bff18..a79b36e2e912 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -203,13 +203,7 @@ void __put_task_struct(struct task_struct *tsk) } EXPORT_SYMBOL_GPL(__put_task_struct); -/* - * macro override instead of weak attribute alias, to workaround - * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. 
- */ -#ifndef arch_task_cache_init -#define arch_task_cache_init() -#endif +void __init __weak arch_task_cache_init(void) { } void __init fork_init(unsigned long mempages) { -- cgit v1.2.3 From 38e7c572ce7310def003d8bb7c34260f5d8118cb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 5 May 2012 15:05:42 +0000 Subject: x86: Use common threadinfo allocator The only difference is the free_thread_info function, which frees xstate. Use the new arch_release_task_struct() function instead and switch over to the core allocator. Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/20120505150141.559556763@linutronix.de Cc: x86@kernel.org --- arch/x86/include/asm/boot.h | 2 +- arch/x86/include/asm/page_32_types.h | 4 ++-- arch/x86/include/asm/page_64_types.h | 4 ++-- arch/x86/include/asm/thread_info.h | 20 +------------------- arch/x86/kernel/irq_32.c | 8 ++++---- arch/x86/kernel/process.c | 5 ++--- 6 files changed, 12 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 5e1a2eef3e7c..b13fe63bdc59 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -19,7 +19,7 @@ #ifdef CONFIG_X86_64 #define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT #else -#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_ORDER) +#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_SIZE_ORDER) #endif #define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2) diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index ade619ff9e2a..ef17af013475 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -15,8 +15,8 @@ */ #define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) -#define THREAD_ORDER 1 -#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) +#define THREAD_SIZE_ORDER 1 +#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) #define STACKFAULT_STACK 0 #define DOUBLEFAULT_STACK 1 diff --git a/arch/x86/include/asm/page_64_types.h 
b/arch/x86/include/asm/page_64_types.h index 7639dbf5d223..320f7bb95f76 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -1,8 +1,8 @@ #ifndef _ASM_X86_PAGE_64_DEFS_H #define _ASM_X86_PAGE_64_DEFS_H -#define THREAD_ORDER 1 -#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) +#define THREAD_SIZE_ORDER 1 +#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) #define CURRENT_MASK (~(THREAD_SIZE - 1)) #define EXCEPTION_STACK_ORDER 0 diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 8692a166dd4e..73cfe0d309c9 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -155,24 +155,6 @@ struct thread_info { #define PREEMPT_ACTIVE 0x10000000 -/* thread information allocation */ -#ifdef CONFIG_DEBUG_STACK_USAGE -#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) -#else -#define THREAD_FLAGS (GFP_KERNEL | __GFP_NOTRACK) -#endif - -#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR - -#define alloc_thread_info_node(tsk, node) \ -({ \ - struct page *page = alloc_pages_node(node, THREAD_FLAGS, \ - THREAD_ORDER); \ - struct thread_info *ret = page ? 
page_address(page) : NULL; \ - \ - ret; \ -}) - #ifdef CONFIG_X86_32 #define STACK_WARN (THREAD_SIZE/8) @@ -282,7 +264,7 @@ static inline bool is_ia32_task(void) #ifndef __ASSEMBLY__ extern void arch_task_cache_init(void); -extern void free_thread_info(struct thread_info *ti); extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); +extern void arch_release_task_struct(struct task_struct *tsk); #endif #endif /* _ASM_X86_THREAD_INFO_H */ diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 58b7f27cb3e9..344faf8d0d62 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -127,8 +127,8 @@ void __cpuinit irq_ctx_init(int cpu) return; irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), - THREAD_FLAGS, - THREAD_ORDER)); + THREADINFO_GFP, + THREAD_SIZE_ORDER)); memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); irqctx->tinfo.cpu = cpu; irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; @@ -137,8 +137,8 @@ void __cpuinit irq_ctx_init(int cpu) per_cpu(hardirq_ctx, cpu) = irqctx; irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), - THREAD_FLAGS, - THREAD_ORDER)); + THREADINFO_GFP, + THREAD_SIZE_ORDER)); memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); irqctx->tinfo.cpu = cpu; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 8215458f6af5..e8173154800d 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -76,10 +76,9 @@ void free_thread_xstate(struct task_struct *tsk) fpu_free(&tsk->thread.fpu); } -void free_thread_info(struct thread_info *ti) +void arch_release_task_struct(struct task_struct *tsk) { - free_thread_xstate(ti->task); - free_pages((unsigned long)ti, THREAD_ORDER); + free_thread_xstate(tsk); } void arch_task_cache_init(void) -- cgit v1.2.3 From fba60c620a6a9ec11140c179e5d0fe0bc3c3ea29 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 8 May 2012 15:21:18 +0100 Subject: x86-64: Eliminate dead ia32 
syscall handlers None of the three routines being removed here was actually hooked up anywhere, so they all represented dead code. Signed-off-by: Jan Beulich Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/4FA947FE020000780008247F@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/ia32/sys_ia32.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index aec2202a596c..edca9c0a79cc 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -287,11 +287,6 @@ asmlinkage long sys32_sigaction(int sig, struct old_sigaction32 __user *act, return ret; } -asmlinkage long sys32_alarm(unsigned int seconds) -{ - return alarm_setitimer(seconds); -} - asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, int options) { @@ -300,11 +295,6 @@ asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, /* 32-bit timeval and related flotsam. */ -asmlinkage long sys32_sysfs(int option, u32 arg1, u32 arg2) -{ - return sys_sysfs(option, arg1, arg2); -} - asmlinkage long sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *interval) { @@ -375,19 +365,6 @@ asmlinkage long sys32_pwrite(unsigned int fd, const char __user *ubuf, } -asmlinkage long sys32_personality(unsigned long personality) -{ - int ret; - - if (personality(current->personality) == PER_LINUX32 && - personality == PER_LINUX) - personality = PER_LINUX32; - ret = sys_personality(personality); - if (ret == PER_LINUX32) - ret = PER_LINUX; - return ret; -} - asmlinkage long sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count) { -- cgit v1.2.3 From 433de739bbc22a5b2c87602116566ce27e3b4cab Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Tue, 8 May 2012 21:22:24 +0300 Subject: x86, realmode: 16-bit real-mode code support for relocs tool A new option is added to the relocs tool called '--realmode'. This option causes the generation of 16-bit segment relocations and 32-bit linear relocations for the real-mode code. When the real-mode code is moved to the low-memory during kernel initialization, these relocation entries can be used to relocate the code properly. In the assembly code 16-bit segment relocations must be relative to the 'real_mode_seg' absolute symbol. Linear relocations must be relative to a symbol prefixed with 'pa_'. 16-bit segment relocation is used to load cs:ip in 16-bit code. Linear relocations are used in the 32-bit code for relocatable data references. They are declared in the linker script of the real-mode code. The relocs tool is moved to scripts/x86-relocs.c so it will be compiled before building the arch/x86 tree. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1336501366-28617-2-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: Jarkko Sakkinen Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/compressed/Makefile | 11 +- arch/x86/boot/compressed/relocs.c | 678 -------------------------------- scripts/.gitignore | 1 + scripts/Makefile | 3 + scripts/x86-relocs.c | 797 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 806 insertions(+), 684 deletions(-) delete mode 100644 arch/x86/boot/compressed/relocs.c create mode 100644 scripts/x86-relocs.c (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index fd55a2ff3ad8..0435e8a2d20e 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -40,13 +40,12 @@ OBJCOPYFLAGS_vmlinux.bin := -R .comment -S $(obj)/vmlinux.bin: vmlinux FORCE $(call if_changed,objcopy) +targets += vmlinux.bin.all vmlinux.relocs -targets += vmlinux.bin.all vmlinux.relocs relocs -hostprogs-$(CONFIG_X86_NEED_RELOCS) += relocs - -quiet_cmd_relocs = RELOCS $@ - cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< -$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE +CMD_RELOCS = scripts/x86-relocs +quiet_cmd_relocs = RELOCS $@ + cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $< +$(obj)/vmlinux.relocs: vmlinux FORCE $(call if_changed,relocs) vmlinux.bin.all-y := $(obj)/vmlinux.bin diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c deleted file mode 100644 index fb7117a4ade1..000000000000 --- a/arch/x86/boot/compressed/relocs.c +++ /dev/null @@ -1,678 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#define USE_BSD -#include -#include -#include - -static void die(char *fmt, ...); - -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) -static Elf32_Ehdr ehdr; -static unsigned long reloc_count, reloc_idx; -static unsigned long *relocs; - -struct section { - Elf32_Shdr shdr; - struct section *link; - Elf32_Sym *symtab; - Elf32_Rel *reltab; - char *strtab; -}; -static struct section *secs; - -/* - * Following symbols have 
been audited. There values are constant and do - * not change if bzImage is loaded at a different physical address than - * the address for which it has been compiled. Don't warn user about - * absolute relocations present w.r.t these symbols. - */ -static const char abs_sym_regex[] = - "^(xen_irq_disable_direct_reloc$|" - "xen_save_fl_direct_reloc$|" - "VDSO|" - "__crc_)"; -static regex_t abs_sym_regex_c; -static int is_abs_reloc(const char *sym_name) -{ - return !regexec(&abs_sym_regex_c, sym_name, 0, NULL, 0); -} - -/* - * These symbols are known to be relative, even if the linker marks them - * as absolute (typically defined outside any section in the linker script.) - */ -static const char rel_sym_regex[] = - "^_end$"; -static regex_t rel_sym_regex_c; -static int is_rel_reloc(const char *sym_name) -{ - return !regexec(&rel_sym_regex_c, sym_name, 0, NULL, 0); -} - -static void regex_init(void) -{ - char errbuf[128]; - int err; - - err = regcomp(&abs_sym_regex_c, abs_sym_regex, - REG_EXTENDED|REG_NOSUB); - if (err) { - regerror(err, &abs_sym_regex_c, errbuf, sizeof errbuf); - die("%s", errbuf); - } - - err = regcomp(&rel_sym_regex_c, rel_sym_regex, - REG_EXTENDED|REG_NOSUB); - if (err) { - regerror(err, &rel_sym_regex_c, errbuf, sizeof errbuf); - die("%s", errbuf); - } -} - -static void die(char *fmt, ...) 
-{ - va_list ap; - va_start(ap, fmt); - vfprintf(stderr, fmt, ap); - va_end(ap); - exit(1); -} - -static const char *sym_type(unsigned type) -{ - static const char *type_name[] = { -#define SYM_TYPE(X) [X] = #X - SYM_TYPE(STT_NOTYPE), - SYM_TYPE(STT_OBJECT), - SYM_TYPE(STT_FUNC), - SYM_TYPE(STT_SECTION), - SYM_TYPE(STT_FILE), - SYM_TYPE(STT_COMMON), - SYM_TYPE(STT_TLS), -#undef SYM_TYPE - }; - const char *name = "unknown sym type name"; - if (type < ARRAY_SIZE(type_name)) { - name = type_name[type]; - } - return name; -} - -static const char *sym_bind(unsigned bind) -{ - static const char *bind_name[] = { -#define SYM_BIND(X) [X] = #X - SYM_BIND(STB_LOCAL), - SYM_BIND(STB_GLOBAL), - SYM_BIND(STB_WEAK), -#undef SYM_BIND - }; - const char *name = "unknown sym bind name"; - if (bind < ARRAY_SIZE(bind_name)) { - name = bind_name[bind]; - } - return name; -} - -static const char *sym_visibility(unsigned visibility) -{ - static const char *visibility_name[] = { -#define SYM_VISIBILITY(X) [X] = #X - SYM_VISIBILITY(STV_DEFAULT), - SYM_VISIBILITY(STV_INTERNAL), - SYM_VISIBILITY(STV_HIDDEN), - SYM_VISIBILITY(STV_PROTECTED), -#undef SYM_VISIBILITY - }; - const char *name = "unknown sym visibility name"; - if (visibility < ARRAY_SIZE(visibility_name)) { - name = visibility_name[visibility]; - } - return name; -} - -static const char *rel_type(unsigned type) -{ - static const char *type_name[] = { -#define REL_TYPE(X) [X] = #X - REL_TYPE(R_386_NONE), - REL_TYPE(R_386_32), - REL_TYPE(R_386_PC32), - REL_TYPE(R_386_GOT32), - REL_TYPE(R_386_PLT32), - REL_TYPE(R_386_COPY), - REL_TYPE(R_386_GLOB_DAT), - REL_TYPE(R_386_JMP_SLOT), - REL_TYPE(R_386_RELATIVE), - REL_TYPE(R_386_GOTOFF), - REL_TYPE(R_386_GOTPC), -#undef REL_TYPE - }; - const char *name = "unknown type rel type name"; - if (type < ARRAY_SIZE(type_name) && type_name[type]) { - name = type_name[type]; - } - return name; -} - -static const char *sec_name(unsigned shndx) -{ - const char *sec_strtab; - const char *name; - 
sec_strtab = secs[ehdr.e_shstrndx].strtab; - name = ""; - if (shndx < ehdr.e_shnum) { - name = sec_strtab + secs[shndx].shdr.sh_name; - } - else if (shndx == SHN_ABS) { - name = "ABSOLUTE"; - } - else if (shndx == SHN_COMMON) { - name = "COMMON"; - } - return name; -} - -static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym) -{ - const char *name; - name = ""; - if (sym->st_name) { - name = sym_strtab + sym->st_name; - } - else { - name = sec_name(secs[sym->st_shndx].shdr.sh_name); - } - return name; -} - - - -#if BYTE_ORDER == LITTLE_ENDIAN -#define le16_to_cpu(val) (val) -#define le32_to_cpu(val) (val) -#endif -#if BYTE_ORDER == BIG_ENDIAN -#define le16_to_cpu(val) bswap_16(val) -#define le32_to_cpu(val) bswap_32(val) -#endif - -static uint16_t elf16_to_cpu(uint16_t val) -{ - return le16_to_cpu(val); -} - -static uint32_t elf32_to_cpu(uint32_t val) -{ - return le32_to_cpu(val); -} - -static void read_ehdr(FILE *fp) -{ - if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) { - die("Cannot read ELF header: %s\n", - strerror(errno)); - } - if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) { - die("No ELF magic\n"); - } - if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) { - die("Not a 32 bit executable\n"); - } - if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) { - die("Not a LSB ELF executable\n"); - } - if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) { - die("Unknown ELF version\n"); - } - /* Convert the fields to native endian */ - ehdr.e_type = elf16_to_cpu(ehdr.e_type); - ehdr.e_machine = elf16_to_cpu(ehdr.e_machine); - ehdr.e_version = elf32_to_cpu(ehdr.e_version); - ehdr.e_entry = elf32_to_cpu(ehdr.e_entry); - ehdr.e_phoff = elf32_to_cpu(ehdr.e_phoff); - ehdr.e_shoff = elf32_to_cpu(ehdr.e_shoff); - ehdr.e_flags = elf32_to_cpu(ehdr.e_flags); - ehdr.e_ehsize = elf16_to_cpu(ehdr.e_ehsize); - ehdr.e_phentsize = elf16_to_cpu(ehdr.e_phentsize); - ehdr.e_phnum = elf16_to_cpu(ehdr.e_phnum); - ehdr.e_shentsize = elf16_to_cpu(ehdr.e_shentsize); - ehdr.e_shnum = 
elf16_to_cpu(ehdr.e_shnum); - ehdr.e_shstrndx = elf16_to_cpu(ehdr.e_shstrndx); - - if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) { - die("Unsupported ELF header type\n"); - } - if (ehdr.e_machine != EM_386) { - die("Not for x86\n"); - } - if (ehdr.e_version != EV_CURRENT) { - die("Unknown ELF version\n"); - } - if (ehdr.e_ehsize != sizeof(Elf32_Ehdr)) { - die("Bad Elf header size\n"); - } - if (ehdr.e_phentsize != sizeof(Elf32_Phdr)) { - die("Bad program header entry\n"); - } - if (ehdr.e_shentsize != sizeof(Elf32_Shdr)) { - die("Bad section header entry\n"); - } - if (ehdr.e_shstrndx >= ehdr.e_shnum) { - die("String table index out of bounds\n"); - } -} - -static void read_shdrs(FILE *fp) -{ - int i; - Elf32_Shdr shdr; - - secs = calloc(ehdr.e_shnum, sizeof(struct section)); - if (!secs) { - die("Unable to allocate %d section headers\n", - ehdr.e_shnum); - } - if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) { - die("Seek to %d failed: %s\n", - ehdr.e_shoff, strerror(errno)); - } - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - if (fread(&shdr, sizeof shdr, 1, fp) != 1) - die("Cannot read ELF section headers %d/%d: %s\n", - i, ehdr.e_shnum, strerror(errno)); - sec->shdr.sh_name = elf32_to_cpu(shdr.sh_name); - sec->shdr.sh_type = elf32_to_cpu(shdr.sh_type); - sec->shdr.sh_flags = elf32_to_cpu(shdr.sh_flags); - sec->shdr.sh_addr = elf32_to_cpu(shdr.sh_addr); - sec->shdr.sh_offset = elf32_to_cpu(shdr.sh_offset); - sec->shdr.sh_size = elf32_to_cpu(shdr.sh_size); - sec->shdr.sh_link = elf32_to_cpu(shdr.sh_link); - sec->shdr.sh_info = elf32_to_cpu(shdr.sh_info); - sec->shdr.sh_addralign = elf32_to_cpu(shdr.sh_addralign); - sec->shdr.sh_entsize = elf32_to_cpu(shdr.sh_entsize); - if (sec->shdr.sh_link < ehdr.e_shnum) - sec->link = &secs[sec->shdr.sh_link]; - } - -} - -static void read_strtabs(FILE *fp) -{ - int i; - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - if (sec->shdr.sh_type != SHT_STRTAB) { - continue; - } 
- sec->strtab = malloc(sec->shdr.sh_size); - if (!sec->strtab) { - die("malloc of %d bytes for strtab failed\n", - sec->shdr.sh_size); - } - if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { - die("Seek to %d failed: %s\n", - sec->shdr.sh_offset, strerror(errno)); - } - if (fread(sec->strtab, 1, sec->shdr.sh_size, fp) - != sec->shdr.sh_size) { - die("Cannot read symbol table: %s\n", - strerror(errno)); - } - } -} - -static void read_symtabs(FILE *fp) -{ - int i,j; - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - if (sec->shdr.sh_type != SHT_SYMTAB) { - continue; - } - sec->symtab = malloc(sec->shdr.sh_size); - if (!sec->symtab) { - die("malloc of %d bytes for symtab failed\n", - sec->shdr.sh_size); - } - if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { - die("Seek to %d failed: %s\n", - sec->shdr.sh_offset, strerror(errno)); - } - if (fread(sec->symtab, 1, sec->shdr.sh_size, fp) - != sec->shdr.sh_size) { - die("Cannot read symbol table: %s\n", - strerror(errno)); - } - for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) { - Elf32_Sym *sym = &sec->symtab[j]; - sym->st_name = elf32_to_cpu(sym->st_name); - sym->st_value = elf32_to_cpu(sym->st_value); - sym->st_size = elf32_to_cpu(sym->st_size); - sym->st_shndx = elf16_to_cpu(sym->st_shndx); - } - } -} - - -static void read_relocs(FILE *fp) -{ - int i,j; - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - if (sec->shdr.sh_type != SHT_REL) { - continue; - } - sec->reltab = malloc(sec->shdr.sh_size); - if (!sec->reltab) { - die("malloc of %d bytes for relocs failed\n", - sec->shdr.sh_size); - } - if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { - die("Seek to %d failed: %s\n", - sec->shdr.sh_offset, strerror(errno)); - } - if (fread(sec->reltab, 1, sec->shdr.sh_size, fp) - != sec->shdr.sh_size) { - die("Cannot read symbol table: %s\n", - strerror(errno)); - } - for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { - Elf32_Rel *rel = &sec->reltab[j]; - 
rel->r_offset = elf32_to_cpu(rel->r_offset); - rel->r_info = elf32_to_cpu(rel->r_info); - } - } -} - - -static void print_absolute_symbols(void) -{ - int i; - printf("Absolute symbols\n"); - printf(" Num: Value Size Type Bind Visibility Name\n"); - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - char *sym_strtab; - int j; - - if (sec->shdr.sh_type != SHT_SYMTAB) { - continue; - } - sym_strtab = sec->link->strtab; - for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) { - Elf32_Sym *sym; - const char *name; - sym = &sec->symtab[j]; - name = sym_name(sym_strtab, sym); - if (sym->st_shndx != SHN_ABS) { - continue; - } - printf("%5d %08x %5d %10s %10s %12s %s\n", - j, sym->st_value, sym->st_size, - sym_type(ELF32_ST_TYPE(sym->st_info)), - sym_bind(ELF32_ST_BIND(sym->st_info)), - sym_visibility(ELF32_ST_VISIBILITY(sym->st_other)), - name); - } - } - printf("\n"); -} - -static void print_absolute_relocs(void) -{ - int i, printed = 0; - - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - struct section *sec_applies, *sec_symtab; - char *sym_strtab; - Elf32_Sym *sh_symtab; - int j; - if (sec->shdr.sh_type != SHT_REL) { - continue; - } - sec_symtab = sec->link; - sec_applies = &secs[sec->shdr.sh_info]; - if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) { - continue; - } - sh_symtab = sec_symtab->symtab; - sym_strtab = sec_symtab->link->strtab; - for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { - Elf32_Rel *rel; - Elf32_Sym *sym; - const char *name; - rel = &sec->reltab[j]; - sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; - name = sym_name(sym_strtab, sym); - if (sym->st_shndx != SHN_ABS) { - continue; - } - - /* Absolute symbols are not relocated if bzImage is - * loaded at a non-compiled address. Display a warning - * to user at compile time about the absolute - * relocations present. 
- * - * User need to audit the code to make sure - * some symbols which should have been section - * relative have not become absolute because of some - * linker optimization or wrong programming usage. - * - * Before warning check if this absolute symbol - * relocation is harmless. - */ - if (is_abs_reloc(name) || is_rel_reloc(name)) - continue; - - if (!printed) { - printf("WARNING: Absolute relocations" - " present\n"); - printf("Offset Info Type Sym.Value " - "Sym.Name\n"); - printed = 1; - } - - printf("%08x %08x %10s %08x %s\n", - rel->r_offset, - rel->r_info, - rel_type(ELF32_R_TYPE(rel->r_info)), - sym->st_value, - name); - } - } - - if (printed) - printf("\n"); -} - -static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym)) -{ - int i; - /* Walk through the relocations */ - for (i = 0; i < ehdr.e_shnum; i++) { - char *sym_strtab; - Elf32_Sym *sh_symtab; - struct section *sec_applies, *sec_symtab; - int j; - struct section *sec = &secs[i]; - - if (sec->shdr.sh_type != SHT_REL) { - continue; - } - sec_symtab = sec->link; - sec_applies = &secs[sec->shdr.sh_info]; - if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) { - continue; - } - sh_symtab = sec_symtab->symtab; - sym_strtab = sec_symtab->link->strtab; - for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { - Elf32_Rel *rel; - Elf32_Sym *sym; - unsigned r_type; - rel = &sec->reltab[j]; - sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; - r_type = ELF32_R_TYPE(rel->r_info); - /* Don't visit relocations to absolute symbols */ - if (sym->st_shndx == SHN_ABS && - !is_rel_reloc(sym_name(sym_strtab, sym))) { - continue; - } - switch (r_type) { - case R_386_NONE: - case R_386_PC32: - /* - * NONE can be ignored and and PC relative - * relocations don't need to be adjusted. 
- */ - break; - case R_386_32: - /* Visit relocations that need to be adjusted */ - visit(rel, sym); - break; - default: - die("Unsupported relocation type: %s (%d)\n", - rel_type(r_type), r_type); - break; - } - } - } -} - -static void count_reloc(Elf32_Rel *rel, Elf32_Sym *sym) -{ - reloc_count += 1; -} - -static void collect_reloc(Elf32_Rel *rel, Elf32_Sym *sym) -{ - /* Remember the address that needs to be adjusted. */ - relocs[reloc_idx++] = rel->r_offset; -} - -static int cmp_relocs(const void *va, const void *vb) -{ - const unsigned long *a, *b; - a = va; b = vb; - return (*a == *b)? 0 : (*a > *b)? 1 : -1; -} - -static void emit_relocs(int as_text) -{ - int i; - /* Count how many relocations I have and allocate space for them. */ - reloc_count = 0; - walk_relocs(count_reloc); - relocs = malloc(reloc_count * sizeof(relocs[0])); - if (!relocs) { - die("malloc of %d entries for relocs failed\n", - reloc_count); - } - /* Collect up the relocations */ - reloc_idx = 0; - walk_relocs(collect_reloc); - - /* Order the relocations for more efficient processing */ - qsort(relocs, reloc_count, sizeof(relocs[0]), cmp_relocs); - - /* Print the relocations */ - if (as_text) { - /* Print the relocations in a form suitable that - * gas will like. 
- */ - printf(".section \".data.reloc\",\"a\"\n"); - printf(".balign 4\n"); - for (i = 0; i < reloc_count; i++) { - printf("\t .long 0x%08lx\n", relocs[i]); - } - printf("\n"); - } - else { - unsigned char buf[4]; - /* Print a stop */ - fwrite("\0\0\0\0", 4, 1, stdout); - /* Now print each relocation */ - for (i = 0; i < reloc_count; i++) { - put_unaligned_le32(relocs[i], buf); - fwrite(buf, 4, 1, stdout); - } - } -} - -static void usage(void) -{ - die("relocs [--abs-syms |--abs-relocs | --text] vmlinux\n"); -} - -int main(int argc, char **argv) -{ - int show_absolute_syms, show_absolute_relocs; - int as_text; - const char *fname; - FILE *fp; - int i; - - regex_init(); - - show_absolute_syms = 0; - show_absolute_relocs = 0; - as_text = 0; - fname = NULL; - for (i = 1; i < argc; i++) { - char *arg = argv[i]; - if (*arg == '-') { - if (strcmp(argv[1], "--abs-syms") == 0) { - show_absolute_syms = 1; - continue; - } - - if (strcmp(argv[1], "--abs-relocs") == 0) { - show_absolute_relocs = 1; - continue; - } - else if (strcmp(argv[1], "--text") == 0) { - as_text = 1; - continue; - } - } - else if (!fname) { - fname = arg; - continue; - } - usage(); - } - if (!fname) { - usage(); - } - fp = fopen(fname, "r"); - if (!fp) { - die("Cannot open %s: %s\n", - fname, strerror(errno)); - } - read_ehdr(fp); - read_shdrs(fp); - read_strtabs(fp); - read_symtabs(fp); - read_relocs(fp); - if (show_absolute_syms) { - print_absolute_symbols(); - return 0; - } - if (show_absolute_relocs) { - print_absolute_relocs(); - return 0; - } - emit_relocs(as_text); - return 0; -} diff --git a/scripts/.gitignore b/scripts/.gitignore index 105b21f08185..68c0f32fdc9b 100644 --- a/scripts/.gitignore +++ b/scripts/.gitignore @@ -9,3 +9,4 @@ unifdef ihex2fw recordmcount docproc +x86-relocs diff --git a/scripts/Makefile b/scripts/Makefile index df7678febf27..a241359d2c82 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -8,11 +8,14 @@ # conmakehash: Create arrays for initializing the kernel console 
tables # docproc: Used in Documentation/DocBook +HOST_EXTRACFLAGS += -I$(srctree)/tools/include + hostprogs-$(CONFIG_KALLSYMS) += kallsyms hostprogs-$(CONFIG_LOGO) += pnmtologo hostprogs-$(CONFIG_VT) += conmakehash hostprogs-$(CONFIG_IKCONFIG) += bin2c hostprogs-$(BUILD_C_RECORDMCOUNT) += recordmcount +hostprogs-$(CONFIG_X86) += x86-relocs always := $(hostprogs-y) $(hostprogs-m) diff --git a/scripts/x86-relocs.c b/scripts/x86-relocs.c new file mode 100644 index 000000000000..02914706e5b9 --- /dev/null +++ b/scripts/x86-relocs.c @@ -0,0 +1,797 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define USE_BSD +#include +#include +#include + +static void die(char *fmt, ...); + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +static Elf32_Ehdr ehdr; +static unsigned long reloc_count, reloc_idx; +static unsigned long *relocs; +static unsigned long reloc16_count, reloc16_idx; +static unsigned long *relocs16; + +struct section { + Elf32_Shdr shdr; + struct section *link; + Elf32_Sym *symtab; + Elf32_Rel *reltab; + char *strtab; +}; +static struct section *secs; + +enum symtype { + S_ABS, + S_REL, + S_SEG, + S_LIN, + S_NSYMTYPES +}; + +static const char * const sym_regex_kernel[S_NSYMTYPES] = { +/* + * Following symbols have been audited. There values are constant and do + * not change if bzImage is loaded at a different physical address than + * the address for which it has been compiled. Don't warn user about + * absolute relocations present w.r.t these symbols. + */ + [S_ABS] = + "^(xen_irq_disable_direct_reloc$|" + "xen_save_fl_direct_reloc$|" + "VDSO|" + "__crc_)", + +/* + * These symbols are known to be relative, even if the linker marks them + * as absolute (typically defined outside any section in the linker script.) + */ + [S_REL] = + "^_end$", +}; + + +static const char * const sym_regex_realmode[S_NSYMTYPES] = { +/* + * These are 16-bit segment symbols when compiling 16-bit code. 
+ */ + [S_SEG] = + "^real_mode_seg$", + +/* + * These are offsets belonging to segments, as opposed to linear addresses, + * when compiling 16-bit code. + */ + [S_LIN] = + "^pa_", +}; + +static const char * const *sym_regex; + +static regex_t sym_regex_c[S_NSYMTYPES]; +static int is_reloc(enum symtype type, const char *sym_name) +{ + return sym_regex[type] && + !regexec(&sym_regex_c[type], sym_name, 0, NULL, 0); +} + +static void regex_init(int use_real_mode) +{ + char errbuf[128]; + int err; + int i; + + if (use_real_mode) + sym_regex = sym_regex_realmode; + else + sym_regex = sym_regex_kernel; + + for (i = 0; i < S_NSYMTYPES; i++) { + if (!sym_regex[i]) + continue; + + err = regcomp(&sym_regex_c[i], sym_regex[i], + REG_EXTENDED|REG_NOSUB); + + if (err) { + regerror(err, &sym_regex_c[i], errbuf, sizeof errbuf); + die("%s", errbuf); + } + } +} + +static void die(char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + exit(1); +} + +static const char *sym_type(unsigned type) +{ + static const char *type_name[] = { +#define SYM_TYPE(X) [X] = #X + SYM_TYPE(STT_NOTYPE), + SYM_TYPE(STT_OBJECT), + SYM_TYPE(STT_FUNC), + SYM_TYPE(STT_SECTION), + SYM_TYPE(STT_FILE), + SYM_TYPE(STT_COMMON), + SYM_TYPE(STT_TLS), +#undef SYM_TYPE + }; + const char *name = "unknown sym type name"; + if (type < ARRAY_SIZE(type_name)) { + name = type_name[type]; + } + return name; +} + +static const char *sym_bind(unsigned bind) +{ + static const char *bind_name[] = { +#define SYM_BIND(X) [X] = #X + SYM_BIND(STB_LOCAL), + SYM_BIND(STB_GLOBAL), + SYM_BIND(STB_WEAK), +#undef SYM_BIND + }; + const char *name = "unknown sym bind name"; + if (bind < ARRAY_SIZE(bind_name)) { + name = bind_name[bind]; + } + return name; +} + +static const char *sym_visibility(unsigned visibility) +{ + static const char *visibility_name[] = { +#define SYM_VISIBILITY(X) [X] = #X + SYM_VISIBILITY(STV_DEFAULT), + SYM_VISIBILITY(STV_INTERNAL), + SYM_VISIBILITY(STV_HIDDEN), + 
SYM_VISIBILITY(STV_PROTECTED), +#undef SYM_VISIBILITY + }; + const char *name = "unknown sym visibility name"; + if (visibility < ARRAY_SIZE(visibility_name)) { + name = visibility_name[visibility]; + } + return name; +} + +static const char *rel_type(unsigned type) +{ + static const char *type_name[] = { +#define REL_TYPE(X) [X] = #X + REL_TYPE(R_386_NONE), + REL_TYPE(R_386_32), + REL_TYPE(R_386_PC32), + REL_TYPE(R_386_GOT32), + REL_TYPE(R_386_PLT32), + REL_TYPE(R_386_COPY), + REL_TYPE(R_386_GLOB_DAT), + REL_TYPE(R_386_JMP_SLOT), + REL_TYPE(R_386_RELATIVE), + REL_TYPE(R_386_GOTOFF), + REL_TYPE(R_386_GOTPC), + REL_TYPE(R_386_8), + REL_TYPE(R_386_PC8), + REL_TYPE(R_386_16), + REL_TYPE(R_386_PC16), +#undef REL_TYPE + }; + const char *name = "unknown type rel type name"; + if (type < ARRAY_SIZE(type_name) && type_name[type]) { + name = type_name[type]; + } + return name; +} + +static const char *sec_name(unsigned shndx) +{ + const char *sec_strtab; + const char *name; + sec_strtab = secs[ehdr.e_shstrndx].strtab; + name = ""; + if (shndx < ehdr.e_shnum) { + name = sec_strtab + secs[shndx].shdr.sh_name; + } + else if (shndx == SHN_ABS) { + name = "ABSOLUTE"; + } + else if (shndx == SHN_COMMON) { + name = "COMMON"; + } + return name; +} + +static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym) +{ + const char *name; + name = ""; + if (sym->st_name) { + name = sym_strtab + sym->st_name; + } + else { + name = sec_name(sym->st_shndx); + } + return name; +} + + + +#if BYTE_ORDER == LITTLE_ENDIAN +#define le16_to_cpu(val) (val) +#define le32_to_cpu(val) (val) +#endif +#if BYTE_ORDER == BIG_ENDIAN +#define le16_to_cpu(val) bswap_16(val) +#define le32_to_cpu(val) bswap_32(val) +#endif + +static uint16_t elf16_to_cpu(uint16_t val) +{ + return le16_to_cpu(val); +} + +static uint32_t elf32_to_cpu(uint32_t val) +{ + return le32_to_cpu(val); +} + +static void read_ehdr(FILE *fp) +{ + if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) { + die("Cannot read ELF header: %s\n", + 
strerror(errno)); + } + if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) { + die("No ELF magic\n"); + } + if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) { + die("Not a 32 bit executable\n"); + } + if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) { + die("Not a LSB ELF executable\n"); + } + if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) { + die("Unknown ELF version\n"); + } + /* Convert the fields to native endian */ + ehdr.e_type = elf16_to_cpu(ehdr.e_type); + ehdr.e_machine = elf16_to_cpu(ehdr.e_machine); + ehdr.e_version = elf32_to_cpu(ehdr.e_version); + ehdr.e_entry = elf32_to_cpu(ehdr.e_entry); + ehdr.e_phoff = elf32_to_cpu(ehdr.e_phoff); + ehdr.e_shoff = elf32_to_cpu(ehdr.e_shoff); + ehdr.e_flags = elf32_to_cpu(ehdr.e_flags); + ehdr.e_ehsize = elf16_to_cpu(ehdr.e_ehsize); + ehdr.e_phentsize = elf16_to_cpu(ehdr.e_phentsize); + ehdr.e_phnum = elf16_to_cpu(ehdr.e_phnum); + ehdr.e_shentsize = elf16_to_cpu(ehdr.e_shentsize); + ehdr.e_shnum = elf16_to_cpu(ehdr.e_shnum); + ehdr.e_shstrndx = elf16_to_cpu(ehdr.e_shstrndx); + + if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) { + die("Unsupported ELF header type\n"); + } + if (ehdr.e_machine != EM_386) { + die("Not for x86\n"); + } + if (ehdr.e_version != EV_CURRENT) { + die("Unknown ELF version\n"); + } + if (ehdr.e_ehsize != sizeof(Elf32_Ehdr)) { + die("Bad Elf header size\n"); + } + if (ehdr.e_phentsize != sizeof(Elf32_Phdr)) { + die("Bad program header entry\n"); + } + if (ehdr.e_shentsize != sizeof(Elf32_Shdr)) { + die("Bad section header entry\n"); + } + if (ehdr.e_shstrndx >= ehdr.e_shnum) { + die("String table index out of bounds\n"); + } +} + +static void read_shdrs(FILE *fp) +{ + int i; + Elf32_Shdr shdr; + + secs = calloc(ehdr.e_shnum, sizeof(struct section)); + if (!secs) { + die("Unable to allocate %d section headers\n", + ehdr.e_shnum); + } + if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) { + die("Seek to %d failed: %s\n", + ehdr.e_shoff, strerror(errno)); + } + for (i = 0; i < ehdr.e_shnum; i++) { + struct section 
*sec = &secs[i]; + if (fread(&shdr, sizeof shdr, 1, fp) != 1) + die("Cannot read ELF section headers %d/%d: %s\n", + i, ehdr.e_shnum, strerror(errno)); + sec->shdr.sh_name = elf32_to_cpu(shdr.sh_name); + sec->shdr.sh_type = elf32_to_cpu(shdr.sh_type); + sec->shdr.sh_flags = elf32_to_cpu(shdr.sh_flags); + sec->shdr.sh_addr = elf32_to_cpu(shdr.sh_addr); + sec->shdr.sh_offset = elf32_to_cpu(shdr.sh_offset); + sec->shdr.sh_size = elf32_to_cpu(shdr.sh_size); + sec->shdr.sh_link = elf32_to_cpu(shdr.sh_link); + sec->shdr.sh_info = elf32_to_cpu(shdr.sh_info); + sec->shdr.sh_addralign = elf32_to_cpu(shdr.sh_addralign); + sec->shdr.sh_entsize = elf32_to_cpu(shdr.sh_entsize); + if (sec->shdr.sh_link < ehdr.e_shnum) + sec->link = &secs[sec->shdr.sh_link]; + } + +} + +static void read_strtabs(FILE *fp) +{ + int i; + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + if (sec->shdr.sh_type != SHT_STRTAB) { + continue; + } + sec->strtab = malloc(sec->shdr.sh_size); + if (!sec->strtab) { + die("malloc of %d bytes for strtab failed\n", + sec->shdr.sh_size); + } + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { + die("Seek to %d failed: %s\n", + sec->shdr.sh_offset, strerror(errno)); + } + if (fread(sec->strtab, 1, sec->shdr.sh_size, fp) + != sec->shdr.sh_size) { + die("Cannot read symbol table: %s\n", + strerror(errno)); + } + } +} + +static void read_symtabs(FILE *fp) +{ + int i,j; + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + if (sec->shdr.sh_type != SHT_SYMTAB) { + continue; + } + sec->symtab = malloc(sec->shdr.sh_size); + if (!sec->symtab) { + die("malloc of %d bytes for symtab failed\n", + sec->shdr.sh_size); + } + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { + die("Seek to %d failed: %s\n", + sec->shdr.sh_offset, strerror(errno)); + } + if (fread(sec->symtab, 1, sec->shdr.sh_size, fp) + != sec->shdr.sh_size) { + die("Cannot read symbol table: %s\n", + strerror(errno)); + } + for (j = 0; j < 
sec->shdr.sh_size/sizeof(Elf32_Sym); j++) { + Elf32_Sym *sym = &sec->symtab[j]; + sym->st_name = elf32_to_cpu(sym->st_name); + sym->st_value = elf32_to_cpu(sym->st_value); + sym->st_size = elf32_to_cpu(sym->st_size); + sym->st_shndx = elf16_to_cpu(sym->st_shndx); + } + } +} + + +static void read_relocs(FILE *fp) +{ + int i,j; + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + if (sec->shdr.sh_type != SHT_REL) { + continue; + } + sec->reltab = malloc(sec->shdr.sh_size); + if (!sec->reltab) { + die("malloc of %d bytes for relocs failed\n", + sec->shdr.sh_size); + } + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { + die("Seek to %d failed: %s\n", + sec->shdr.sh_offset, strerror(errno)); + } + if (fread(sec->reltab, 1, sec->shdr.sh_size, fp) + != sec->shdr.sh_size) { + die("Cannot read symbol table: %s\n", + strerror(errno)); + } + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { + Elf32_Rel *rel = &sec->reltab[j]; + rel->r_offset = elf32_to_cpu(rel->r_offset); + rel->r_info = elf32_to_cpu(rel->r_info); + } + } +} + + +static void print_absolute_symbols(void) +{ + int i; + printf("Absolute symbols\n"); + printf(" Num: Value Size Type Bind Visibility Name\n"); + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + char *sym_strtab; + int j; + + if (sec->shdr.sh_type != SHT_SYMTAB) { + continue; + } + sym_strtab = sec->link->strtab; + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) { + Elf32_Sym *sym; + const char *name; + sym = &sec->symtab[j]; + name = sym_name(sym_strtab, sym); + if (sym->st_shndx != SHN_ABS) { + continue; + } + printf("%5d %08x %5d %10s %10s %12s %s\n", + j, sym->st_value, sym->st_size, + sym_type(ELF32_ST_TYPE(sym->st_info)), + sym_bind(ELF32_ST_BIND(sym->st_info)), + sym_visibility(ELF32_ST_VISIBILITY(sym->st_other)), + name); + } + } + printf("\n"); +} + +static void print_absolute_relocs(void) +{ + int i, printed = 0; + + for (i = 0; i < ehdr.e_shnum; i++) { + struct section 
*sec = &secs[i]; + struct section *sec_applies, *sec_symtab; + char *sym_strtab; + Elf32_Sym *sh_symtab; + int j; + if (sec->shdr.sh_type != SHT_REL) { + continue; + } + sec_symtab = sec->link; + sec_applies = &secs[sec->shdr.sh_info]; + if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) { + continue; + } + sh_symtab = sec_symtab->symtab; + sym_strtab = sec_symtab->link->strtab; + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { + Elf32_Rel *rel; + Elf32_Sym *sym; + const char *name; + rel = &sec->reltab[j]; + sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; + name = sym_name(sym_strtab, sym); + if (sym->st_shndx != SHN_ABS) { + continue; + } + + /* Absolute symbols are not relocated if bzImage is + * loaded at a non-compiled address. Display a warning + * to user at compile time about the absolute + * relocations present. + * + * User need to audit the code to make sure + * some symbols which should have been section + * relative have not become absolute because of some + * linker optimization or wrong programming usage. + * + * Before warning check if this absolute symbol + * relocation is harmless. 
+ */ + if (is_reloc(S_ABS, name) || is_reloc(S_REL, name)) + continue; + + if (!printed) { + printf("WARNING: Absolute relocations" + " present\n"); + printf("Offset Info Type Sym.Value " + "Sym.Name\n"); + printed = 1; + } + + printf("%08x %08x %10s %08x %s\n", + rel->r_offset, + rel->r_info, + rel_type(ELF32_R_TYPE(rel->r_info)), + sym->st_value, + name); + } + } + + if (printed) + printf("\n"); +} + +static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym), + int use_real_mode) +{ + int i; + /* Walk through the relocations */ + for (i = 0; i < ehdr.e_shnum; i++) { + char *sym_strtab; + Elf32_Sym *sh_symtab; + struct section *sec_applies, *sec_symtab; + int j; + struct section *sec = &secs[i]; + + if (sec->shdr.sh_type != SHT_REL) { + continue; + } + sec_symtab = sec->link; + sec_applies = &secs[sec->shdr.sh_info]; + if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) { + continue; + } + sh_symtab = sec_symtab->symtab; + sym_strtab = sec_symtab->link->strtab; + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { + Elf32_Rel *rel; + Elf32_Sym *sym; + unsigned r_type; + const char *symname; + rel = &sec->reltab[j]; + sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; + r_type = ELF32_R_TYPE(rel->r_info); + + switch (r_type) { + case R_386_NONE: + case R_386_PC32: + case R_386_PC16: + case R_386_PC8: + /* + * NONE can be ignored and PC relative + * relocations don't need to be adjusted.
+ */ + break; + + case R_386_16: + symname = sym_name(sym_strtab, sym); + if (!use_real_mode) + goto bad; + if (sym->st_shndx == SHN_ABS) { + if (is_reloc(S_ABS, symname)) + break; + else if (!is_reloc(S_SEG, symname)) + goto bad; + } else { + if (is_reloc(S_LIN, symname)) + goto bad; + else + break; + } + visit(rel, sym); + break; + + case R_386_32: + symname = sym_name(sym_strtab, sym); + if (sym->st_shndx == SHN_ABS) { + if (is_reloc(S_ABS, symname)) + break; + else if (!is_reloc(S_REL, symname)) + goto bad; + } else { + if (use_real_mode && + !is_reloc(S_LIN, symname)) + break; + } + visit(rel, sym); + break; + default: + die("Unsupported relocation type: %s (%d)\n", + rel_type(r_type), r_type); + break; + bad: + symname = sym_name(sym_strtab, sym); + die("Invalid %s relocation: %s\n", + rel_type(r_type), symname); + } + } + } +} + +static void count_reloc(Elf32_Rel *rel, Elf32_Sym *sym) +{ + if (ELF32_R_TYPE(rel->r_info) == R_386_16) + reloc16_count++; + else + reloc_count++; +} + +static void collect_reloc(Elf32_Rel *rel, Elf32_Sym *sym) +{ + /* Remember the address that needs to be adjusted. */ + if (ELF32_R_TYPE(rel->r_info) == R_386_16) + relocs16[reloc16_idx++] = rel->r_offset; + else + relocs[reloc_idx++] = rel->r_offset; +} + +static int cmp_relocs(const void *va, const void *vb) +{ + const unsigned long *a, *b; + a = va; b = vb; + return (*a == *b)? 0 : (*a > *b)? 1 : -1; +} + +static int write32(unsigned int v, FILE *f) +{ + unsigned char buf[4]; + + put_unaligned_le32(v, buf); + return fwrite(buf, 1, 4, f) == 4 ? 0 : -1; +} + +static void emit_relocs(int as_text, int use_real_mode) +{ + int i; + /* Count how many relocations I have and allocate space for them. 
*/ + reloc_count = 0; + walk_relocs(count_reloc, use_real_mode); + relocs = malloc(reloc_count * sizeof(relocs[0])); + if (!relocs) { + die("malloc of %d entries for relocs failed\n", + reloc_count); + } + + relocs16 = malloc(reloc16_count * sizeof(relocs[0])); + if (!relocs16) { + die("malloc of %d entries for relocs16 failed\n", + reloc16_count); + } + /* Collect up the relocations */ + reloc_idx = 0; + walk_relocs(collect_reloc, use_real_mode); + + if (reloc16_count && !use_real_mode) + die("Segment relocations found but --realmode not specified\n"); + + /* Order the relocations for more efficient processing */ + qsort(relocs, reloc_count, sizeof(relocs[0]), cmp_relocs); + qsort(relocs16, reloc16_count, sizeof(relocs16[0]), cmp_relocs); + + /* Print the relocations */ + if (as_text) { + /* Print the relocations in a form suitable that + * gas will like. + */ + printf(".section \".data.reloc\",\"a\"\n"); + printf(".balign 4\n"); + if (use_real_mode) { + printf("\t.long %lu\n", reloc16_count); + for (i = 0; i < reloc16_count; i++) + printf("\t.long 0x%08lx\n", relocs16[i]); + printf("\t.long %lu\n", reloc_count); + for (i = 0; i < reloc_count; i++) { + printf("\t.long 0x%08lx\n", relocs[i]); + } + } else { + /* Print a stop */ + printf("\t.long 0x%08lx\n", (unsigned long)0); + for (i = 0; i < reloc_count; i++) { + printf("\t.long 0x%08lx\n", relocs[i]); + } + } + + printf("\n"); + } + else { + if (use_real_mode) { + write32(reloc16_count, stdout); + for (i = 0; i < reloc16_count; i++) + write32(relocs16[i], stdout); + write32(reloc_count, stdout); + + /* Now print each relocation */ + for (i = 0; i < reloc_count; i++) + write32(relocs[i], stdout); + } else { + /* Print a stop */ + write32(0, stdout); + + /* Now print each relocation */ + for (i = 0; i < reloc_count; i++) { + write32(relocs[i], stdout); + } + } + } +} + +static void usage(void) +{ + die("relocs [--abs-syms|--abs-relocs|--text|--realmode] vmlinux\n"); +} + +int main(int argc, char **argv) +{ + int 
show_absolute_syms, show_absolute_relocs; + int as_text, use_real_mode; + const char *fname; + FILE *fp; + int i; + + show_absolute_syms = 0; + show_absolute_relocs = 0; + as_text = 0; + use_real_mode = 0; + fname = NULL; + for (i = 1; i < argc; i++) { + char *arg = argv[i]; + if (*arg == '-') { + if (strcmp(arg, "--abs-syms") == 0) { + show_absolute_syms = 1; + continue; + } + if (strcmp(arg, "--abs-relocs") == 0) { + show_absolute_relocs = 1; + continue; + } + if (strcmp(arg, "--text") == 0) { + as_text = 1; + continue; + } + if (strcmp(arg, "--realmode") == 0) { + use_real_mode = 1; + continue; + } + } + else if (!fname) { + fname = arg; + continue; + } + usage(); + } + if (!fname) { + usage(); + } + regex_init(use_real_mode); + fp = fopen(fname, "r"); + if (!fp) { + die("Cannot open %s: %s\n", + fname, strerror(errno)); + } + read_ehdr(fp); + read_shdrs(fp); + read_strtabs(fp); + read_symtabs(fp); + read_relocs(fp); + if (show_absolute_syms) { + print_absolute_symbols(); + return 0; + } + if (show_absolute_relocs) { + print_absolute_relocs(); + return 0; + } + emit_relocs(as_text, use_real_mode); + return 0; +} -- cgit v1.2.3 From b3266bd6ff52efb9e57c7fbfff4c8f7363dfaab3 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:25 +0300 Subject: x86, realmode: realmode.bin infrastructure Create realmode.bin and realmode.relocs files. Piggyback them into a relocatable object that will be included in the .init.data section of the main kernel image. The first file contains the binary image of the real-mode code. The latter file contains all relocations. The layout of the binary image is specified in realmode.lds.S. The makefile generates pa_-prefixed symbols for each exported global. These are used in 32-bit code and in the realmode header to define symbols that need to be relocated. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-3-git-send-email-jarkko.sakkinen@intel.com Originally-by: H. Peter Anvin Signed-off-by: H.
Peter Anvin --- arch/x86/Kbuild | 2 +- arch/x86/realmode/Makefile | 20 +++++++++++ arch/x86/realmode/rm/.gitignore | 3 ++ arch/x86/realmode/rm/Makefile | 63 ++++++++++++++++++++++++++++++++++ arch/x86/realmode/rm/header.S | 16 +++++++++ arch/x86/realmode/rm/realmode.lds.S | 68 +++++++++++++++++++++++++++++++++++++ arch/x86/realmode/rmpiggy.S | 18 ++++++++++ 7 files changed, 189 insertions(+), 1 deletion(-) create mode 100644 arch/x86/realmode/Makefile create mode 100644 arch/x86/realmode/rm/.gitignore create mode 100644 arch/x86/realmode/rm/Makefile create mode 100644 arch/x86/realmode/rm/header.S create mode 100644 arch/x86/realmode/rm/realmode.lds.S create mode 100644 arch/x86/realmode/rmpiggy.S (limited to 'arch/x86') diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 0e9dec6cadd1..e5287d8517aa 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -1,4 +1,3 @@ - obj-$(CONFIG_KVM) += kvm/ # Xen paravirtualization support @@ -7,6 +6,7 @@ obj-$(CONFIG_XEN) += xen/ # lguest paravirtualization support obj-$(CONFIG_LGUEST_GUEST) += lguest/ +obj-y += realmode/ obj-y += kernel/ obj-y += mm/ diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile new file mode 100644 index 000000000000..f22a4f8d99d6 --- /dev/null +++ b/arch/x86/realmode/Makefile @@ -0,0 +1,20 @@ +# +# arch/x86/realmode/Makefile +# +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. 
+# +# + +subdir- := rm + +obj-y += rmpiggy.o + +$(obj)/rmpiggy.o: $(obj)/rm/realmode.relocs $(obj)/rm/realmode.bin + +$(obj)/rm/realmode.bin: FORCE + $(Q)$(MAKE) $(build)=$(obj)/rm $@ + +$(obj)/rm/realmode.relocs: FORCE + $(Q)$(MAKE) $(build)=$(obj)/rm $@ diff --git a/arch/x86/realmode/rm/.gitignore b/arch/x86/realmode/rm/.gitignore new file mode 100644 index 000000000000..b6ed3a2555cb --- /dev/null +++ b/arch/x86/realmode/rm/.gitignore @@ -0,0 +1,3 @@ +pasyms.h +realmode.lds +realmode.relocs diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile new file mode 100644 index 000000000000..7c3f202cbccf --- /dev/null +++ b/arch/x86/realmode/rm/Makefile @@ -0,0 +1,63 @@ +# +# arch/x86/realmode/rm/Makefile +# +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. +# +# + +subdir- := wakeup + +always := realmode.bin + +realmode-y += header.o + +targets += $(realmode-y) + +REALMODE_OBJS = $(addprefix $(obj)/,$(realmode-y)) + +sed-pasyms := -n -r -e 's/^([0-9a-fA-F]+) [ABCDGRSTVW] (.+)$$/pa_\2 = \2;/p' + +quiet_cmd_pasyms = PASYMS $@ + cmd_pasyms = $(NM) $(filter-out FORCE,$^) | \ + sed $(sed-pasyms) | sort | uniq > $@ + +$(obj)/pasyms.h: $(REALMODE_OBJS) FORCE + $(call if_changed,pasyms) + +$(obj)/realmode.lds: $(obj)/pasyms.h + +LDFLAGS_realmode.elf := --emit-relocs -T +CPPFLAGS_realmode.lds += -P -C -I$(obj) + +$(obj)/realmode.elf: $(obj)/realmode.lds $(REALMODE_OBJS) FORCE + $(call if_changed,ld) + +OBJCOPYFLAGS_realmode.bin := -O binary + +$(obj)/realmode.bin: $(obj)/realmode.elf + $(call if_changed,objcopy) + +quiet_cmd_relocs = RELOCS $@ + cmd_relocs = scripts/x86-relocs --realmode $< > $@ +$(obj)/realmode.relocs: $(obj)/realmode.elf FORCE + $(call if_changed,relocs) + +# --------------------------------------------------------------------------- + +# How to compile the 16-bit code.
Note we always compile for -march=i386, +# that way we can complain to the user if the CPU is insufficient. +KBUILD_CFLAGS := $(LINUXINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ \ + -DDISABLE_BRANCH_PROFILING \ + -Wall -Wstrict-prototypes \ + -march=i386 -mregparm=3 \ + -include $(srctree)/$(src)/../../boot/code16gcc.h \ + -fno-strict-aliasing -fomit-frame-pointer \ + $(call cc-option, -ffreestanding) \ + $(call cc-option, -fno-toplevel-reorder,\ + $(call cc-option, -fno-unit-at-a-time)) \ + $(call cc-option, -fno-stack-protector) \ + $(call cc-option, -mpreferred-stack-boundary=2) +KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ +GCOV_PROFILE := n diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S new file mode 100644 index 000000000000..7be17f2c65a3 --- /dev/null +++ b/arch/x86/realmode/rm/header.S @@ -0,0 +1,16 @@ +/* + * Real-mode blob header; this should match realmode.h and be + * readonly; for mutable data instead add pointers into the .data + * or .bss sections as appropriate. + */ + +#include +#include + + .section ".header", "a" + +ENTRY(real_mode_header) + .long pa_text_start + .long pa_ro_end + .long pa_end +END(real_mode_header) diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S new file mode 100644 index 000000000000..c5b8a4f31ba3 --- /dev/null +++ b/arch/x86/realmode/rm/realmode.lds.S @@ -0,0 +1,68 @@ +/* + * realmode.lds.S + * + * Linker script for the real-mode code + */ + +#include + +#undef i386 + +OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") +OUTPUT_ARCH(i386) + +SECTIONS +{ + real_mode_seg = 0; + + . = 0; + .header : { + pa_real_mode_base = .; + *(.header) + } + + . = ALIGN(4); + .rodata : { + *(.rodata) + *(.rodata.*) + } + + . = ALIGN(PAGE_SIZE); + .text : { + pa_text_start = .; + *(.text) + *(.text.*) + } + + .text32 : { + *(.text32) + *(.text32.*) + pa_ro_end = .; + } + + . = ALIGN(PAGE_SIZE); + .data : { + *(.data) + *(.data.*) + } + + . 
= ALIGN(128); + .bss : { + *(.bss*) + } + + /* End signature for integrity checking */ + . = ALIGN(4); + .signature : { + *(.signature) + pa_end = .; + } + + /DISCARD/ : { + *(.note*) + *(.debug*) + *(.eh_frame*) + } + +#include "pasyms.h" +} diff --git a/arch/x86/realmode/rmpiggy.S b/arch/x86/realmode/rmpiggy.S new file mode 100644 index 000000000000..6047d7f604cf --- /dev/null +++ b/arch/x86/realmode/rmpiggy.S @@ -0,0 +1,18 @@ +/* + * Wrapper script for the realmode binary as a transport object + * before copying to low memory. + */ +#include +#include + + .section ".init.data","aw" + + .balign PAGE_SIZE + +ENTRY(real_mode_blob) + .incbin "arch/x86/realmode/rm/realmode.bin" +END(real_mode_blob) + +ENTRY(real_mode_relocs) + .incbin "arch/x86/realmode/rm/realmode.relocs" +END(real_mode_relocs) -- cgit v1.2.3 From 084ee1c641a068bfd1194d545f7dc9ab2043eb35 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:26 +0300 Subject: x86, realmode: Relocator for realmode code Implements relocator for real mode code that is called as part of setup_arch(). Processes segment relocations and linear relocations. Real-mode code is relocated to a free hole below 1 MB. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-4-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/realmode.h | 26 ++++++++++++++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/realmode.c | 79 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup.c | 2 ++ 4 files changed, 108 insertions(+) create mode 100644 arch/x86/include/asm/realmode.h create mode 100644 arch/x86/kernel/realmode.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h new file mode 100644 index 000000000000..dc1bba534c14 --- /dev/null +++ b/arch/x86/include/asm/realmode.h @@ -0,0 +1,26 @@ +#ifndef _ARCH_X86_REALMODE_H +#define _ARCH_X86_REALMODE_H + +#include +#include + +/* This must match data at realmode.S */ +struct real_mode_header { + u32 text_start; + u32 ro_end; + u32 end; +} __attribute__((__packed__)); + +extern struct real_mode_header real_mode_header; +extern unsigned char *real_mode_base; + +extern unsigned long init_rsp; +extern unsigned long initial_code; +extern unsigned long initial_gs; + +extern unsigned char real_mode_blob[]; +extern unsigned char real_mode_relocs[]; + +extern void __init setup_real_mode(void); + +#endif /* _ARCH_X86_REALMODE_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 532d2e090e6f..f9e19d4eb984 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -36,6 +36,7 @@ obj-y += pci-iommu_table.o obj-y += resource.o obj-y += trampoline.o trampoline_$(BITS).o +obj-y += realmode.o obj-y += process.o obj-y += i387.o xsave.o obj-y += ptrace.o diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c new file mode 100644 index 000000000000..7415c42547ac --- /dev/null +++ b/arch/x86/kernel/realmode.c @@ -0,0 +1,79 @@ +#include +#include + +#include +#include +#include + +unsigned char *real_mode_base; +struct real_mode_header real_mode_header; + +void __init setup_real_mode(void) +{ + phys_addr_t mem; + u16 real_mode_seg; + u32 *rel; + u32 count; + u32 *ptr; + u16 *seg; + int i; + + struct real_mode_header *header = 
+ (struct real_mode_header *) real_mode_blob; + + size_t size = PAGE_ALIGN(header->end); + + /* Has to be in very low memory so we can execute real-mode AP code. */ + mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); + if (!mem) + panic("Cannot allocate trampoline\n"); + + real_mode_base = __va(mem); + memblock_reserve(mem, size); + + printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", + real_mode_base, (unsigned long long)mem, size); + + memcpy(real_mode_base, real_mode_blob, size); + + real_mode_seg = __pa(real_mode_base) >> 4; + rel = (u32 *) real_mode_relocs; + + /* 16-bit segment relocations. */ + count = rel[0]; + rel = &rel[1]; + for (i = 0; i < count; i++) { + seg = (u16 *) (real_mode_base + rel[i]); + *seg = real_mode_seg; + } + + /* 32-bit linear relocations. */ + count = rel[i]; + rel = &rel[i + 1]; + for (i = 0; i < count; i++) { + ptr = (u32 *) (real_mode_base + rel[i]); + *ptr += __pa(real_mode_base); + } + + /* Copied header will contain relocated physical addresses. */ + memcpy(&real_mode_header, real_mode_base, + sizeof(struct real_mode_header)); +} + +/* + * setup_real_mode() gets called very early, to guarantee the + * availability of low memory. This is before the proper kernel page + * tables are set up, so we cannot set page permissions in that + * function. Thus, we use an arch_initcall instead.
+ */ +static int __init set_real_mode_permissions(void) +{ + size_t all_size = + PAGE_ALIGN(real_mode_header.end) - + __pa(real_mode_base); + + set_memory_x((unsigned long) real_mode_base, all_size >> PAGE_SHIFT); + return 0; +} + +arch_initcall(set_real_mode_permissions); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1a2901562059..56e41242a6b8 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include #include @@ -918,6 +919,7 @@ void __init setup_arch(char **cmdline_p) max_pfn_mapped< Date: Tue, 8 May 2012 21:22:27 +0300 Subject: x86, realmode: Move reboot_32.S to unified realmode code Migrated reboot_32.S from x86_trampoline to the real-mode blob. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-5-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/realmode.h | 4 ++ arch/x86/kernel/Makefile | 1 - arch/x86/kernel/reboot.c | 25 +------- arch/x86/kernel/reboot_32.S | 135 --------------------------------------- arch/x86/realmode/rm/Makefile | 1 + arch/x86/realmode/rm/header.S | 3 + arch/x86/realmode/rm/reboot_32.S | 134 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 145 insertions(+), 158 deletions(-) delete mode 100644 arch/x86/kernel/reboot_32.S create mode 100644 arch/x86/realmode/rm/reboot_32.S (limited to 'arch/x86') diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index dc1bba534c14..bf26b0681931 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -9,6 +9,10 @@ struct real_mode_header { u32 text_start; u32 ro_end; u32 end; + /* reboot */ +#ifdef CONFIG_X86_32 + u32 machine_real_restart_asm; +#endif } __attribute__((__packed__)); extern struct real_mode_header real_mode_header; diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f9e19d4eb984..b71ef35c7d77 100644 --- a/arch/x86/kernel/Makefile +++ 
b/arch/x86/kernel/Makefile @@ -49,7 +49,6 @@ obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ obj-y += reboot.o -obj-$(CONFIG_X86_32) += reboot_32.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index d840e69a853c..050eff29a4bb 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -24,6 +24,7 @@ #ifdef CONFIG_X86_32 # include # include +# include #else # include #endif @@ -332,15 +333,10 @@ static int __init reboot_init(void) } core_initcall(reboot_init); -extern const unsigned char machine_real_restart_asm[]; -extern const u64 machine_real_restart_gdt[3]; - void machine_real_restart(unsigned int type) { - void *restart_va; - unsigned long restart_pa; - void (*restart_lowmem)(unsigned int); - u64 *lowmem_gdt; + void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int)) + real_mode_header.machine_real_restart_asm; local_irq_disable(); @@ -369,21 +365,6 @@ void machine_real_restart(unsigned int type) too. 
*/ *((unsigned short *)0x472) = reboot_mode; - /* Patch the GDT in the low memory trampoline */ - lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt); - - restart_va = TRAMPOLINE_SYM(machine_real_restart_asm); - restart_pa = virt_to_phys(restart_va); - restart_lowmem = (void (*)(unsigned int))restart_pa; - - /* GDT[0]: GDT self-pointer */ - lowmem_gdt[0] = - (u64)(sizeof(machine_real_restart_gdt) - 1) + - ((u64)virt_to_phys(lowmem_gdt) << 16); - /* GDT[1]: 64K real mode code segment */ - lowmem_gdt[1] = - GDT_ENTRY(0x009b, restart_pa, 0xffff); - /* Jump to the identity-mapped low memory code */ restart_lowmem(type); } diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S deleted file mode 100644 index 1d5c46df0d78..000000000000 --- a/arch/x86/kernel/reboot_32.S +++ /dev/null @@ -1,135 +0,0 @@ -#include -#include -#include -#include - -/* - * The following code and data reboots the machine by switching to real - * mode and jumping to the BIOS reset entry point, as if the CPU has - * really been reset. The previous version asked the keyboard - * controller to pulse the CPU reset line, which is more thorough, but - * doesn't work with at least one type of 486 motherboard. It is easy - * to stop this code working; hence the copious comments. - * - * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax. - */ - .section ".x86_trampoline","a" - .balign 16 - .code32 -ENTRY(machine_real_restart_asm) -r_base = . - /* Get our own relocated address */ - call 1f -1: popl %ebx - subl $(1b - r_base), %ebx - - /* Compute the equivalent real-mode segment */ - movl %ebx, %ecx - shrl $4, %ecx - - /* Patch post-real-mode segment jump */ - movw (dispatch_table - r_base)(%ebx,%eax,2),%ax - movw %ax, (101f - r_base)(%ebx) - movw %cx, (102f - r_base)(%ebx) - - /* Set up the IDT for real mode. */ - lidtl (machine_real_restart_idt - r_base)(%ebx) - - /* - * Set up a GDT from which we can load segment descriptors for real - * mode. 
The GDT is not used in real mode; it is just needed here to - * prepare the descriptors. - */ - lgdtl (machine_real_restart_gdt - r_base)(%ebx) - - /* - * Load the data segment registers with 16-bit compatible values - */ - movl $16, %ecx - movl %ecx, %ds - movl %ecx, %es - movl %ecx, %fs - movl %ecx, %gs - movl %ecx, %ss - ljmpl $8, $1f - r_base - -/* - * This is 16-bit protected mode code to disable paging and the cache, - * switch to real mode and jump to the BIOS reset code. - * - * The instruction that switches to real mode by writing to CR0 must be - * followed immediately by a far jump instruction, which set CS to a - * valid value for real mode, and flushes the prefetch queue to avoid - * running instructions that have already been decoded in protected - * mode. - * - * Clears all the flags except ET, especially PG (paging), PE - * (protected-mode enable) and TS (task switch for coprocessor state - * save). Flushes the TLB after paging has been disabled. Sets CD and - * NW, to disable the cache on a 486, and invalidates the cache. This - * is more like the state of a 486 after reset. I don't know if - * something else should be done for other chips. - * - * More could be done here to set up the registers as if a CPU reset had - * occurred; hopefully real BIOSs don't assume much. This is not the - * actual BIOS entry point, anyway (that is at 0xfffffff0). - * - * Most of this work is probably excessive, but it is what is tested. 
- */ - .code16 -1: - xorl %ecx, %ecx - movl %cr0, %eax - andl $0x00000011, %eax - orl $0x60000000, %eax - movl %eax, %cr0 - movl %ecx, %cr3 - movl %cr0, %edx - andl $0x60000000, %edx /* If no cache bits -> no wbinvd */ - jz 2f - wbinvd -2: - andb $0x10, %al - movl %eax, %cr0 - .byte 0xea /* ljmpw */ -101: .word 0 /* Offset */ -102: .word 0 /* Segment */ - -bios: - ljmpw $0xf000, $0xfff0 - -apm: - movw $0x1000, %ax - movw %ax, %ss - movw $0xf000, %sp - movw $0x5307, %ax - movw $0x0001, %bx - movw $0x0003, %cx - int $0x15 - -END(machine_real_restart_asm) - - .balign 16 - /* These must match +#include +#include +#include + +/* + * The following code and data reboots the machine by switching to real + * mode and jumping to the BIOS reset entry point, as if the CPU has + * really been reset. The previous version asked the keyboard + * controller to pulse the CPU reset line, which is more thorough, but + * doesn't work with at least one type of 486 motherboard. It is easy + * to stop this code working; hence the copious comments. + * + * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax. + */ + .section ".text32", "ax" + .code32 + .globl machine_real_restart_asm + + .balign 16 +machine_real_restart_asm: + /* Set up the IDT for real mode. */ + lidtl pa_machine_real_restart_idt + + /* + * Set up a GDT from which we can load segment descriptors for real + * mode. The GDT is not used in real mode; it is just needed here to + * prepare the descriptors. + */ + lgdtl pa_machine_real_restart_gdt + + /* + * Load the data segment registers with 16-bit compatible values + */ + movl $16, %ecx + movl %ecx, %ds + movl %ecx, %es + movl %ecx, %fs + movl %ecx, %gs + movl %ecx, %ss + ljmpw $8, $1f + +/* + * This is 16-bit protected mode code to disable paging and the cache, + * switch to real mode and jump to the BIOS reset code. 
+ * + * The instruction that switches to real mode by writing to CR0 must be + * followed immediately by a far jump instruction, which set CS to a + * valid value for real mode, and flushes the prefetch queue to avoid + * running instructions that have already been decoded in protected + * mode. + * + * Clears all the flags except ET, especially PG (paging), PE + * (protected-mode enable) and TS (task switch for coprocessor state + * save). Flushes the TLB after paging has been disabled. Sets CD and + * NW, to disable the cache on a 486, and invalidates the cache. This + * is more like the state of a 486 after reset. I don't know if + * something else should be done for other chips. + * + * More could be done here to set up the registers as if a CPU reset had + * occurred; hopefully real BIOSs don't assume much. This is not the + * actual BIOS entry point, anyway (that is at 0xfffffff0). + * + * Most of this work is probably excessive, but it is what is tested. + */ + .text + .code16 + + .balign 16 +machine_real_restart_asm16: +1: + xorl %ecx, %ecx + movl %cr0, %edx + andl $0x00000011, %edx + orl $0x60000000, %edx + movl %edx, %cr0 + movl %ecx, %cr3 + movl %cr0, %edx + andl $0x60000000, %edx /* If no cache bits -> no wbinvd */ + jz 2f + wbinvd +2: + andb $0x10, %dl + movl %edx, %cr0 + .byte 0xea /* ljmpw */ + .word 3f /* Offset */ + .word real_mode_seg /* Segment */ + +3: + testb %al, %al /* restart type in %al: 0 = BIOS, nonzero = APM */ + jz bios + +apm: + movw $0x1000, %ax + movw %ax, %ss + movw $0xf000, %sp + movw $0x5307, %ax + movw $0x0001, %bx + movw $0x0003, %cx + int $0x15 + /* This should never return...
*/ + +bios: + ljmpw $0xf000, $0xfff0 + + .section ".rodata", "a" + .globl machine_real_restart_idt, machine_real_restart_gdt + + .balign 16 +machine_real_restart_idt: + .word 0xffff /* Length - real mode default value */ + .long 0 /* Base - real mode default value */ + + .balign 16 +machine_real_restart_gdt: + /* Self-pointer */ + .word 0xffff /* Length - real mode default value */ + .long pa_machine_real_restart_gdt + .word 0 + + /* + * 16-bit code segment pointing to real_mode_seg + * Selector value 8 + */ + .word 0xffff /* Limit */ + .long 0x9b000000 + pa_real_mode_base + .word 0 + + /* + * 16-bit data segment with the selector value 16 = 0x10 and + * base value 0x100; since this is consistent with real mode + * semantics we don't have to reload the segments once CR0.PE = 0. + */ + .quad GDT_ENTRY(0x0093, 0x100, 0xffff) -- cgit v1.2.3 From 48927bbb97c7d4cf343c05827ab9ac30c60678cb Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:28 +0300 Subject: x86, realmode: Move SMP trampoline to unified realmode code Migrated SMP trampoline code to the real mode blob. SMP trampoline code is not yet removed from .x86_trampoline because it is needed by the wakeup code. [ hpa: always enable compiling startup_32_smp in head_32.S... it is only a few instructions which go into .init on UP builds, and it makes the rest of the code less #ifdef ugly. ] Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-6-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/realmode.h | 18 ++++ arch/x86/kernel/head_32.S | 5 +- arch/x86/kernel/head_64.S | 4 - arch/x86/kernel/realmode.c | 14 +++ arch/x86/kernel/smpboot.c | 18 ++-- arch/x86/realmode/rm/Makefile | 1 + arch/x86/realmode/rm/header.S | 11 +++ arch/x86/realmode/rm/trampoline_32.S | 86 +++++++++++++++++ arch/x86/realmode/rm/trampoline_64.S | 175 +++++++++++++++++++++++++++++++++++ 9 files changed, 316 insertions(+), 16 deletions(-) create mode 100644 arch/x86/realmode/rm/trampoline_32.S create mode 100644 arch/x86/realmode/rm/trampoline_64.S (limited to 'arch/x86') diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index bf26b0681931..9b4a5da5e22e 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -12,6 +12,17 @@ struct real_mode_header { /* reboot */ #ifdef CONFIG_X86_32 u32 machine_real_restart_asm; +#endif + /* SMP trampoline */ + u32 trampoline_data; + u32 trampoline_status; +#ifdef CONFIG_X86_32 + u32 startup_32_smp; + u32 boot_gdt; +#else + u32 startup_64_smp; + u32 level3_ident_pgt; + u32 level3_kernel_pgt; #endif } __attribute__((__packed__)); @@ -25,6 +36,13 @@ extern unsigned long initial_gs; extern unsigned char real_mode_blob[]; extern unsigned char real_mode_relocs[]; +#ifdef CONFIG_X86_32 +extern unsigned char startup_32_smp[]; +extern unsigned char boot_gdt[]; +#else +extern unsigned char secondary_startup_64[]; +#endif + extern void __init setup_real_mode(void); #endif /* _ARCH_X86_REALMODE_H */ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index ce0be7cd085e..a3c2b4ffebc6 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -273,10 +273,7 @@ num_subarch_entries = (. 
- subarch_entries) / 4 * If cpu hotplug is not supported then this code can go in init section * which will be freed later */ - __CPUINIT - -#ifdef CONFIG_SMP ENTRY(startup_32_smp) cld movl $(__BOOT_DS),%eax @@ -287,7 +284,7 @@ ENTRY(startup_32_smp) movl pa(stack_start),%ecx movl %eax,%ss leal -__PAGE_OFFSET(%ecx),%esp -#endif /* CONFIG_SMP */ + default_entry: /* diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 40f4eb3766d1..d70bc2eb202b 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -136,10 +136,6 @@ ident_complete: /* Fixup phys_base */ addq %rbp, phys_base(%rip) - /* Fixup trampoline */ - addq %rbp, trampoline_level4_pgt + 0(%rip) - addq %rbp, trampoline_level4_pgt + (511*8)(%rip) - /* Due to ENTRY(), sometimes the empty space gets filled with * zeros. Better take a jmp than relying on empty space being * filled with 0x90 (nop) diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c index 7415c42547ac..a465775b32f2 100644 --- a/arch/x86/kernel/realmode.c +++ b/arch/x86/kernel/realmode.c @@ -58,6 +58,20 @@ void __init setup_real_mode(void) /* Copied header will contain relocated physical addresses. 
*/ memcpy(&real_mode_header, real_mode_base, sizeof(struct real_mode_header)); + +#ifdef CONFIG_X86_32 + *((u32 *)__va(real_mode_header.startup_32_smp)) = __pa(startup_32_smp); + *((u32 *)__va(real_mode_header.boot_gdt)) = __pa(boot_gdt); +#else + *((u64 *) __va(real_mode_header.startup_64_smp)) = + (u64) __pa(secondary_startup_64); + + *((u64 *) __va(real_mode_header.level3_ident_pgt)) = + __pa(level3_ident_pgt) + _KERNPG_TABLE; + + *((u64 *) __va(real_mode_header.level3_kernel_pgt)) = + __pa(level3_kernel_pgt) + _KERNPG_TABLE; +#endif } /* diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6e1e406038c2..c7971ea74bd0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -57,7 +57,7 @@ #include #include #include -#include +#include #include #include #include @@ -73,6 +73,8 @@ #include #include +#include + /* State of each CPU */ DEFINE_PER_CPU(int, cpu_state) = { 0 }; @@ -662,8 +664,12 @@ static void __cpuinit announce_cpu(int cpu, int apicid) */ static int __cpuinit do_boot_cpu(int apicid, int cpu) { + volatile u32 *trampoline_status = + (volatile u32 *) __va(real_mode_header.trampoline_status); + /* start_ip had better be page-aligned! */ + unsigned long start_ip = real_mode_header.trampoline_data; + unsigned long boot_error = 0; - unsigned long start_ip; int timeout; struct create_idle c_idle = { .cpu = cpu, @@ -713,9 +719,6 @@ do_rest: initial_code = (unsigned long)start_secondary; stack_start = c_idle.idle->thread.sp; - /* start_ip had better be page-aligned! */ - start_ip = trampoline_address(); - /* So we see what's up */ announce_cpu(cpu, apicid); @@ -778,8 +781,7 @@ do_rest: pr_debug("CPU%d: has booted.\n", cpu); } else { boot_error = 1; - if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) - == 0xA5A5A5A5) + if (*trampoline_status == 0xA5A5A5A5) /* trampoline started but...? 
*/ pr_err("CPU%d: Stuck ??\n", cpu); else @@ -805,7 +807,7 @@ do_rest: } /* mark "stuck" area as not stuck */ - *(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) = 0; + *trampoline_status = 0; if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { /* diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 3f851c488593..56ec64f94e69 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -13,6 +13,7 @@ always := realmode.bin realmode-y += header.o realmode-$(CONFIG_X86_32) += reboot_32.o +realmode-y += trampoline_$(BITS).o targets += $(realmode-y) diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index db21401c0c57..a97900409c61 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -15,5 +15,16 @@ ENTRY(real_mode_header) .long pa_end #ifdef CONFIG_X86_32 .long pa_machine_real_restart_asm +#endif + /* SMP trampoline */ + .long pa_trampoline_data + .long pa_trampoline_status +#ifdef CONFIG_X86_32 + .long pa_startup_32_smp + .long pa_boot_gdt +#else + .long pa_startup_64_smp + .long pa_level3_ident_pgt + .long pa_level3_kernel_pgt #endif END(real_mode_header) diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S new file mode 100644 index 000000000000..18cb7fc9fad4 --- /dev/null +++ b/arch/x86/realmode/rm/trampoline_32.S @@ -0,0 +1,86 @@ +/* + * + * Trampoline.S Derived from Setup.S by Linus Torvalds + * + * 4 Jan 1997 Michael Chastain: changed to gnu as. + * + * This is only used for booting secondary CPUs in SMP machine + * + * Entry: CS:IP point to the start of our code, we are + * in real mode with no stack, but the rest of the + * trampoline page to make our stack and everything else + * is a mystery. + * + * We jump into arch/x86/kernel/head_32.S. + * + * On entry to trampoline_data, the processor is in real mode + * with 16-bit addressing and 16-bit data. CS has some value + * and IP is zero. 
Thus, we load CS to the physical segment + * of the real mode code before doing anything further. + * + * The structure real_mode_header includes entries that need + * to be set up before executing this code: + * + * startup_32_smp + * boot_gdt + */ + +#include +#include +#include +#include + + .text + .code16 + .globl trampoline_data + + .balign PAGE_SIZE +trampoline_data: + wbinvd # Needed for NUMA-Q; should be harmless for others + + .byte 0xea # ljmpw + .word 1f # Offset + .word real_mode_seg # Segment +1: + mov %cs, %ax # Code and data in the same place + mov %ax, %ds + + cli # We should be safe anyway + + movl $0xA5A5A5A5, trampoline_status + # write marker so the master knows we're running + + /* GDT tables in non default location kernel can be beyond 16MB and + * lgdt will not be able to load the address as in real mode default + * operand size is 16bit. Use lgdtl instead to force operand size + * to 32 bit. + */ + + lidtl boot_idt_descr # load idt with 0, 0 + lgdtl boot_gdt_descr # load gdt with whatever is appropriate + + xor %ax, %ax + inc %ax # protected mode (PE) bit + lmsw %ax # into protected mode + + # flush prefetch and jump to startup_32_smp in arch/x86/kernel/head_32.S + ljmpl *(startup_32_smp) + + .data + .globl startup_32_smp, boot_gdt, trampoline_status + +boot_gdt_descr: + .word __BOOT_DS + 7 # gdt limit +boot_gdt: + .long 0 # gdt base + +boot_idt_descr: + .word 0 # idt limit = 0 + .long 0 # idt base = 0L + +trampoline_status: + .long 0 + +startup_32_smp: + .long 0x00000000 + .word __BOOT_CS, 0 diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S new file mode 100644 index 000000000000..063da008d520 --- /dev/null +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -0,0 +1,175 @@ +/* + * + * Trampoline.S Derived from Setup.S by Linus Torvalds + * + * 4 Jan 1997 Michael Chastain: changed to gnu as.
+ * 15 Sept 2005 Eric Biederman: 64bit PIC support + * + * Entry: CS:IP point to the start of our code, we are + * in real mode with no stack, but the rest of the + * trampoline page to make our stack and everything else + * is a mystery. + * + * On entry to trampoline_data, the processor is in real mode + * with 16-bit addressing and 16-bit data. CS has some value + * and IP is zero. Thus, data addresses need to be absolute + * (no relocation) and are taken with regard to r_base. + * + * With the addition of trampoline_level4_pgt this code can + * now enter a 64bit kernel that lives at arbitrary 64bit + * physical addresses. + * + * If you work on this file, check the object module with objdump + * --full-contents --reloc to make sure there are no relocation + * entries. + */ + +#include +#include +#include +#include +#include +#include +#include + + .text + .balign PAGE_SIZE + .code16 + +ENTRY(trampoline_data) + cli # We should be safe anyway + wbinvd + + .byte 0xea # ljmpw + .word 1f # Offset + .word real_mode_seg # Segment +1: + mov %cs, %ax # Code and data in the same place + mov %ax, %ds + mov %ax, %es + mov %ax, %ss + + movl $0xA5A5A5A5, trampoline_status + # write marker for master knows we're running + + # Setup stack + movw $trampoline_stack_end, %sp + + call verify_cpu # Verify the cpu supports long mode + testl %eax, %eax # Check for return code + jnz no_longmode + + /* + * GDT tables in non default location kernel can be beyond 16MB and + * lgdt will not be able to load the address as in real mode default + * operand size is 16bit. Use lgdtl instead to force operand size + * to 32 bit. 
+ */ + + lidtl tidt # load idt with 0, 0 + lgdtl tgdt # load gdt with whatever is appropriate + + mov $X86_CR0_PE, %ax # protected mode (PE) bit + lmsw %ax # into protected mode + + # flush prefetch and jump to startup_32 + ljmpl *(startup_32_vector) + +no_longmode: + hlt + jmp no_longmode +#include "../kernel/verify_cpu.S" + + .code32 + .balign 4 +ENTRY(startup_32) + movl $__KERNEL_DS, %eax # Initialize the %ds segment register + movl %eax, %ds + + movl $X86_CR4_PAE, %eax + movl %eax, %cr4 # Enable PAE mode + + movl pa_startup_64_smp, %esi + movl pa_startup_64_smp_high, %edi + + # Setup trampoline 4 level pagetables + leal pa_trampoline_level4_pgt, %eax + movl %eax, %cr3 + + movl $MSR_EFER, %ecx + movl $(1 << _EFER_LME), %eax # Enable Long Mode + xorl %edx, %edx + wrmsr + + # Enable paging and in turn activate Long Mode + # Enable protected mode + movl $(X86_CR0_PG | X86_CR0_PE), %eax + movl %eax, %cr0 + + /* + * At this point we're in long mode but in 32bit compatibility mode + * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn + * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use + * the new gdt/idt that has __KERNEL_CS with CS.L = 1. 
+ */ + ljmpl *(pa_startup_64_vector) + + .code64 + .balign 4 +ENTRY(startup_64) + # Now jump into the kernel using virtual addresses + movl %esi, %eax # low half; a 32-bit write zero-extends into %rax + shlq $32, %rdi # high half into bits 63:32, stale upper bits shifted out + orq %rdi, %rax # 64-bit combine keeps both halves + jmp *%rax + + # Careful these need to be in the same 64K segment as the above; +tidt: + .word 0 # idt limit = 0 + .word 0, 0 # idt base = 0L + + # Duplicate the global descriptor table + # so the kernel can live anywhere + .balign 4 + .globl tgdt +tgdt: + .short tgdt_end - tgdt # gdt limit + .long pa_tgdt + .short 0 + .quad 0x00cf9b000000ffff # __KERNEL32_CS + .quad 0x00af9b000000ffff # __KERNEL_CS + .quad 0x00cf93000000ffff # __KERNEL_DS +tgdt_end: + + .balign 4 +startup_32_vector: + .long pa_startup_32 + .word __KERNEL32_CS, 0 + + .balign 4 + .globl startup_64_vector +startup_64_vector: + .long pa_startup_64 + .word __KERNEL_CS, 0 + + .data + + .balign 4 +ENTRY(trampoline_status) + .long 0 + +trampoline_stack: + .org 0x1000 +trampoline_stack_end: + + .globl level3_ident_pgt + .globl level3_kernel_pgt +ENTRY(trampoline_level4_pgt) + level3_ident_pgt: .quad 0 + .fill 510,8,0 + level3_kernel_pgt: .quad 0 + + .globl startup_64_smp + .globl startup_64_smp_high +startup_64_smp: .long 0 +startup_64_smp_high: .long 0 -- cgit v1.2.3 From c9b77ccb52a5c77233b0e557b7d4417b00ef4012 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:29 +0300 Subject: x86, realmode: Move ACPI wakeup to unified realmode code Migrated ACPI wakeup code to the real-mode blob. Code existing in .x86_trampoline can be completely removed. Static descriptor table in wakeup_asm.S is courtesy of H. Peter Anvin. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-7-git-send-email-jarkko.sakkinen@intel.com Cc: Rafael J. Wysocki Cc: Len Brown Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/acpi.h | 2 - arch/x86/include/asm/realmode.h | 4 + arch/x86/include/asm/trampoline.h | 39 ------ arch/x86/kernel/Makefile | 1 - arch/x86/kernel/acpi/Makefile | 9 +- arch/x86/kernel/acpi/realmode/.gitignore | 3 - arch/x86/kernel/acpi/realmode/Makefile | 59 --------- arch/x86/kernel/acpi/realmode/bioscall.S | 1 - arch/x86/kernel/acpi/realmode/copy.S | 1 - arch/x86/kernel/acpi/realmode/regs.c | 1 - arch/x86/kernel/acpi/realmode/video-bios.c | 1 - arch/x86/kernel/acpi/realmode/video-mode.c | 1 - arch/x86/kernel/acpi/realmode/video-vesa.c | 1 - arch/x86/kernel/acpi/realmode/video-vga.c | 1 - arch/x86/kernel/acpi/realmode/wakemain.c | 81 ------------- arch/x86/kernel/acpi/realmode/wakeup.S | 170 -------------------------- arch/x86/kernel/acpi/realmode/wakeup.h | 48 -------- arch/x86/kernel/acpi/realmode/wakeup.lds.S | 62 ---------- arch/x86/kernel/acpi/sleep.c | 33 +---- arch/x86/kernel/acpi/sleep.h | 2 +- arch/x86/kernel/acpi/wakeup_rm.S | 12 -- arch/x86/kernel/head32.c | 1 - arch/x86/kernel/head64.c | 1 - arch/x86/kernel/mpparse.c | 1 - arch/x86/kernel/setup.c | 2 - arch/x86/kernel/tboot.c | 5 +- arch/x86/kernel/trampoline.c | 42 ------- arch/x86/kernel/trampoline_32.S | 83 ------------- arch/x86/kernel/trampoline_64.S | 171 -------------------------- arch/x86/kernel/vmlinux.lds.S | 12 -- arch/x86/realmode/rm/Makefile | 4 + arch/x86/realmode/rm/header.S | 5 + arch/x86/realmode/rm/realmode.lds.S | 4 + arch/x86/realmode/rm/wakeup/.gitignore | 3 + arch/x86/realmode/rm/wakeup/Makefile | 33 +++++ arch/x86/realmode/rm/wakeup/bioscall.S | 1 + arch/x86/realmode/rm/wakeup/copy.S | 1 + arch/x86/realmode/rm/wakeup/regs.c | 1 + arch/x86/realmode/rm/wakeup/video-bios.c | 1 + arch/x86/realmode/rm/wakeup/video-mode.c | 1 + arch/x86/realmode/rm/wakeup/video-vesa.c | 1 + arch/x86/realmode/rm/wakeup/video-vga.c | 1 + arch/x86/realmode/rm/wakeup/wakemain.c | 82 +++++++++++++ arch/x86/realmode/rm/wakeup/wakeup.h | 41 +++++++ 
arch/x86/realmode/rm/wakeup/wakeup_asm.S | 189 +++++++++++++++++++++++++++++ drivers/acpi/sleep.c | 8 +- 46 files changed, 386 insertions(+), 840 deletions(-) delete mode 100644 arch/x86/include/asm/trampoline.h delete mode 100644 arch/x86/kernel/acpi/realmode/.gitignore delete mode 100644 arch/x86/kernel/acpi/realmode/Makefile delete mode 100644 arch/x86/kernel/acpi/realmode/bioscall.S delete mode 100644 arch/x86/kernel/acpi/realmode/copy.S delete mode 100644 arch/x86/kernel/acpi/realmode/regs.c delete mode 100644 arch/x86/kernel/acpi/realmode/video-bios.c delete mode 100644 arch/x86/kernel/acpi/realmode/video-mode.c delete mode 100644 arch/x86/kernel/acpi/realmode/video-vesa.c delete mode 100644 arch/x86/kernel/acpi/realmode/video-vga.c delete mode 100644 arch/x86/kernel/acpi/realmode/wakemain.c delete mode 100644 arch/x86/kernel/acpi/realmode/wakeup.S delete mode 100644 arch/x86/kernel/acpi/realmode/wakeup.h delete mode 100644 arch/x86/kernel/acpi/realmode/wakeup.lds.S delete mode 100644 arch/x86/kernel/acpi/wakeup_rm.S delete mode 100644 arch/x86/kernel/trampoline.c delete mode 100644 arch/x86/kernel/trampoline_32.S delete mode 100644 arch/x86/kernel/trampoline_64.S create mode 100644 arch/x86/realmode/rm/wakeup/.gitignore create mode 100644 arch/x86/realmode/rm/wakeup/Makefile create mode 100644 arch/x86/realmode/rm/wakeup/bioscall.S create mode 100644 arch/x86/realmode/rm/wakeup/copy.S create mode 100644 arch/x86/realmode/rm/wakeup/regs.c create mode 100644 arch/x86/realmode/rm/wakeup/video-bios.c create mode 100644 arch/x86/realmode/rm/wakeup/video-mode.c create mode 100644 arch/x86/realmode/rm/wakeup/video-vesa.c create mode 100644 arch/x86/realmode/rm/wakeup/video-vga.c create mode 100644 arch/x86/realmode/rm/wakeup/wakemain.c create mode 100644 arch/x86/realmode/rm/wakeup/wakeup.h create mode 100644 arch/x86/realmode/rm/wakeup/wakeup_asm.S (limited to 'arch/x86') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 
610001d385dd..724aa441de7d 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -29,7 +29,6 @@ #include #include #include -#include #define COMPILER_DEPENDENT_INT64 long long #define COMPILER_DEPENDENT_UINT64 unsigned long long @@ -118,7 +117,6 @@ static inline void acpi_disable_pci(void) extern int acpi_suspend_lowlevel(void); extern const unsigned char acpi_wakeup_code[]; -#define acpi_wakeup_address (__pa(TRAMPOLINE_SYM(acpi_wakeup_code))) /* early initialization routine */ extern void acpi_reserve_wakeup_memory(void); diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 9b4a5da5e22e..1bfc74d213a4 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -24,6 +24,10 @@ struct real_mode_header { u32 level3_ident_pgt; u32 level3_kernel_pgt; #endif +#ifdef CONFIG_ACPI_SLEEP + u32 wakeup_start; + u32 wakeup_header; +#endif } __attribute__((__packed__)); extern struct real_mode_header real_mode_header; diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h deleted file mode 100644 index feca3118a73b..000000000000 --- a/arch/x86/include/asm/trampoline.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef _ASM_X86_TRAMPOLINE_H -#define _ASM_X86_TRAMPOLINE_H - -#ifndef __ASSEMBLY__ - -#include -#include - -/* - * Trampoline 80x86 program as an array. These are in the init rodata - * segment, but that's okay, because we only care about the relative - * addresses of the symbols. 
- */ -extern const unsigned char x86_trampoline_start []; -extern const unsigned char x86_trampoline_end []; -extern unsigned char *x86_trampoline_base; - -extern unsigned long init_rsp; -extern unsigned long initial_code; -extern unsigned long initial_gs; - -extern void __init setup_trampolines(void); - -extern const unsigned char trampoline_data[]; -extern const unsigned char trampoline_status[]; - -#define TRAMPOLINE_SYM(x) \ - ((void *)(x86_trampoline_base + \ - ((const unsigned char *)(x) - x86_trampoline_start))) - -/* Address of the SMP trampoline */ -static inline unsigned long trampoline_address(void) -{ - return virt_to_phys(TRAMPOLINE_SYM(trampoline_data)); -} - -#endif /* __ASSEMBLY__ */ - -#endif /* _ASM_X86_TRAMPOLINE_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b71ef35c7d77..4a20f4441ffe 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -35,7 +35,6 @@ obj-y += tsc.o io_delay.o rtc.o obj-y += pci-iommu_table.o obj-y += resource.o -obj-y += trampoline.o trampoline_$(BITS).o obj-y += realmode.o obj-y += process.o obj-y += i387.o xsave.o diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index 6f35260bb3ef..163b22581472 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -1,14 +1,7 @@ -subdir- := realmode - obj-$(CONFIG_ACPI) += boot.o -obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o +obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o endif -$(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin - -$(obj)/realmode/wakeup.bin: FORCE - $(Q)$(MAKE) $(build)=$(obj)/realmode - diff --git a/arch/x86/kernel/acpi/realmode/.gitignore b/arch/x86/kernel/acpi/realmode/.gitignore deleted file mode 100644 index 58f1f48a58f8..000000000000 --- a/arch/x86/kernel/acpi/realmode/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -wakeup.bin -wakeup.elf -wakeup.lds diff --git a/arch/x86/kernel/acpi/realmode/Makefile 
b/arch/x86/kernel/acpi/realmode/Makefile deleted file mode 100644 index 6a564ac67ef5..000000000000 --- a/arch/x86/kernel/acpi/realmode/Makefile +++ /dev/null @@ -1,59 +0,0 @@ -# -# arch/x86/kernel/acpi/realmode/Makefile -# -# This file is subject to the terms and conditions of the GNU General Public -# License. See the file "COPYING" in the main directory of this archive -# for more details. -# - -always := wakeup.bin -targets := wakeup.elf wakeup.lds - -wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o - -# The link order of the video-*.o modules can matter. In particular, -# video-vga.o *must* be listed first, followed by video-vesa.o. -# Hardware-specific drivers should follow in the order they should be -# probed, and video-bios.o should typically be last. -wakeup-y += video-vga.o -wakeup-y += video-vesa.o -wakeup-y += video-bios.o - -targets += $(wakeup-y) - -bootsrc := $(src)/../../../boot - -# --------------------------------------------------------------------------- - -# How to compile the 16-bit code. Note we always compile for -march=i386, -# that way we can complain to the user if the CPU is insufficient. -# Compile with _SETUP since this is similar to the boot-time setup code. 
-KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D_WAKEUP -D__KERNEL__ \ - -I$(srctree)/$(bootsrc) \ - $(cflags-y) \ - -Wall -Wstrict-prototypes \ - -march=i386 -mregparm=3 \ - -include $(srctree)/$(bootsrc)/code16gcc.h \ - -fno-strict-aliasing -fomit-frame-pointer \ - $(call cc-option, -ffreestanding) \ - $(call cc-option, -fno-toplevel-reorder,\ - $(call cc-option, -fno-unit-at-a-time)) \ - $(call cc-option, -fno-stack-protector) \ - $(call cc-option, -mpreferred-stack-boundary=2) -KBUILD_CFLAGS += $(call cc-option, -m32) -KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ -GCOV_PROFILE := n - -WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y)) - -LDFLAGS_wakeup.elf := -T - -CPPFLAGS_wakeup.lds += -P -C - -$(obj)/wakeup.elf: $(obj)/wakeup.lds $(WAKEUP_OBJS) FORCE - $(call if_changed,ld) - -OBJCOPYFLAGS_wakeup.bin := -O binary - -$(obj)/wakeup.bin: $(obj)/wakeup.elf FORCE - $(call if_changed,objcopy) diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S deleted file mode 100644 index f51eb0bb56ce..000000000000 --- a/arch/x86/kernel/acpi/realmode/bioscall.S +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/bioscall.S" diff --git a/arch/x86/kernel/acpi/realmode/copy.S b/arch/x86/kernel/acpi/realmode/copy.S deleted file mode 100644 index dc59ebee69d8..000000000000 --- a/arch/x86/kernel/acpi/realmode/copy.S +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/copy.S" diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c deleted file mode 100644 index 6206033ba202..000000000000 --- a/arch/x86/kernel/acpi/realmode/regs.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/regs.c" diff --git a/arch/x86/kernel/acpi/realmode/video-bios.c b/arch/x86/kernel/acpi/realmode/video-bios.c deleted file mode 100644 index 7deabc144a27..000000000000 --- a/arch/x86/kernel/acpi/realmode/video-bios.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/video-bios.c" diff --git 
a/arch/x86/kernel/acpi/realmode/video-mode.c b/arch/x86/kernel/acpi/realmode/video-mode.c deleted file mode 100644 index 328ad209f113..000000000000 --- a/arch/x86/kernel/acpi/realmode/video-mode.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/video-mode.c" diff --git a/arch/x86/kernel/acpi/realmode/video-vesa.c b/arch/x86/kernel/acpi/realmode/video-vesa.c deleted file mode 100644 index 9dbb9672226a..000000000000 --- a/arch/x86/kernel/acpi/realmode/video-vesa.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/video-vesa.c" diff --git a/arch/x86/kernel/acpi/realmode/video-vga.c b/arch/x86/kernel/acpi/realmode/video-vga.c deleted file mode 100644 index bcc81255f374..000000000000 --- a/arch/x86/kernel/acpi/realmode/video-vga.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/video-vga.c" diff --git a/arch/x86/kernel/acpi/realmode/wakemain.c b/arch/x86/kernel/acpi/realmode/wakemain.c deleted file mode 100644 index 883962d9eef2..000000000000 --- a/arch/x86/kernel/acpi/realmode/wakemain.c +++ /dev/null @@ -1,81 +0,0 @@ -#include "wakeup.h" -#include "boot.h" - -static void udelay(int loops) -{ - while (loops--) - io_delay(); /* Approximately 1 us */ -} - -static void beep(unsigned int hz) -{ - u8 enable; - - if (!hz) { - enable = 0x00; /* Turn off speaker */ - } else { - u16 div = 1193181/hz; - - outb(0xb6, 0x43); /* Ctr 2, squarewave, load, binary */ - io_delay(); - outb(div, 0x42); /* LSB of counter */ - io_delay(); - outb(div >> 8, 0x42); /* MSB of counter */ - io_delay(); - - enable = 0x03; /* Turn on speaker */ - } - inb(0x61); /* Dummy read of System Control Port B */ - io_delay(); - outb(enable, 0x61); /* Enable timer 2 output to speaker */ - io_delay(); -} - -#define DOT_HZ 880 -#define DASH_HZ 587 -#define US_PER_DOT 125000 - -/* Okay, this is totally silly, but it's kind of fun. 
*/ -static void send_morse(const char *pattern) -{ - char s; - - while ((s = *pattern++)) { - switch (s) { - case '.': - beep(DOT_HZ); - udelay(US_PER_DOT); - beep(0); - udelay(US_PER_DOT); - break; - case '-': - beep(DASH_HZ); - udelay(US_PER_DOT * 3); - beep(0); - udelay(US_PER_DOT); - break; - default: /* Assume it's a space */ - udelay(US_PER_DOT * 3); - break; - } - } -} - -void main(void) -{ - /* Kill machine if structures are wrong */ - if (wakeup_header.real_magic != 0x12345678) - while (1); - - if (wakeup_header.realmode_flags & 4) - send_morse("...-"); - - if (wakeup_header.realmode_flags & 1) - asm volatile("lcallw $0xc000,$3"); - - if (wakeup_header.realmode_flags & 2) { - /* Need to call BIOS */ - probe_cards(0); - set_mode(wakeup_header.video_mode); - } -} diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S deleted file mode 100644 index b4fd836e4053..000000000000 --- a/arch/x86/kernel/acpi/realmode/wakeup.S +++ /dev/null @@ -1,170 +0,0 @@ -/* - * ACPI wakeup real mode startup stub - */ -#include -#include -#include -#include -#include -#include "wakeup.h" - - .code16 - .section ".jump", "ax" - .globl _start -_start: - cli - jmp wakeup_code - -/* This should match the structure in wakeup.h */ - .section ".header", "a" - .globl wakeup_header -wakeup_header: -video_mode: .short 0 /* Video mode number */ -pmode_return: .byte 0x66, 0xea /* ljmpl */ - .long 0 /* offset goes here */ - .short __KERNEL_CS -pmode_cr0: .long 0 /* Saved %cr0 */ -pmode_cr3: .long 0 /* Saved %cr3 */ -pmode_cr4: .long 0 /* Saved %cr4 */ -pmode_efer: .quad 0 /* Saved EFER */ -pmode_gdt: .quad 0 -pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */ -pmode_behavior: .long 0 /* Wakeup behavior flags */ -realmode_flags: .long 0 -real_magic: .long 0 -trampoline_segment: .word 0 -_pad1: .byte 0 -wakeup_jmp: .byte 0xea /* ljmpw */ -wakeup_jmp_off: .word 3f -wakeup_jmp_seg: .word 0 -wakeup_gdt: .quad 0, 0, 0 -signature: .long WAKEUP_HEADER_SIGNATURE - 
- .text - .code16 -wakeup_code: - cld - - /* Apparently some dimwit BIOS programmers don't know how to - program a PM to RM transition, and we might end up here with - junk in the data segment descriptor registers. The only way - to repair that is to go into PM and fix it ourselves... */ - movw $16, %cx - lgdtl %cs:wakeup_gdt - movl %cr0, %eax - orb $X86_CR0_PE, %al - movl %eax, %cr0 - jmp 1f -1: ljmpw $8, $2f -2: - movw %cx, %ds - movw %cx, %es - movw %cx, %ss - movw %cx, %fs - movw %cx, %gs - - andb $~X86_CR0_PE, %al - movl %eax, %cr0 - jmp wakeup_jmp -3: - /* Set up segments */ - movw %cs, %ax - movw %ax, %ds - movw %ax, %es - movw %ax, %ss - lidtl wakeup_idt - - movl $wakeup_stack_end, %esp - - /* Clear the EFLAGS */ - pushl $0 - popfl - - /* Check header signature... */ - movl signature, %eax - cmpl $WAKEUP_HEADER_SIGNATURE, %eax - jne bogus_real_magic - - /* Check we really have everything... */ - movl end_signature, %eax - cmpl $WAKEUP_END_SIGNATURE, %eax - jne bogus_real_magic - - /* Call the C code */ - calll main - - /* Restore MISC_ENABLE before entering protected mode, in case - BIOS decided to clear XD_DISABLE during S3. */ - movl pmode_behavior, %eax - btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax - jnc 1f - - movl pmode_misc_en, %eax - movl pmode_misc_en + 4, %edx - movl $MSR_IA32_MISC_ENABLE, %ecx - wrmsr -1: - - /* Do any other stuff... */ - -#ifndef CONFIG_64BIT - /* This could also be done in C code... */ - movl pmode_cr3, %eax - movl %eax, %cr3 - - movl pmode_cr4, %ecx - jecxz 1f - movl %ecx, %cr4 -1: - movl pmode_efer, %eax - movl pmode_efer + 4, %edx - movl %eax, %ecx - orl %edx, %ecx - jz 1f - movl $MSR_EFER, %ecx - wrmsr -1: - - lgdtl pmode_gdt - - /* This really couldn't... 
*/ - movl pmode_cr0, %eax - movl %eax, %cr0 - jmp pmode_return -#else - pushw $0 - pushw trampoline_segment - pushw $0 - lret -#endif - -bogus_real_magic: -1: - hlt - jmp 1b - - .data - .balign 8 - - /* This is the standard real-mode IDT */ -wakeup_idt: - .word 0xffff /* limit */ - .long 0 /* address */ - .word 0 - - .globl HEAP, heap_end -HEAP: - .long wakeup_heap -heap_end: - .long wakeup_stack - - .bss -wakeup_heap: - .space 2048 -wakeup_stack: - .space 2048 -wakeup_stack_end: - - .section ".signature","a" -end_signature: - .long WAKEUP_END_SIGNATURE diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h deleted file mode 100644 index 97a29e1430e3..000000000000 --- a/arch/x86/kernel/acpi/realmode/wakeup.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Definitions for the wakeup data structure at the head of the - * wakeup code. - */ - -#ifndef ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H -#define ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H - -#ifndef __ASSEMBLY__ -#include - -/* This must match data at wakeup.S */ -struct wakeup_header { - u16 video_mode; /* Video mode number */ - u16 _jmp1; /* ljmpl opcode, 32-bit only */ - u32 pmode_entry; /* Protected mode resume point, 32-bit only */ - u16 _jmp2; /* CS value, 32-bit only */ - u32 pmode_cr0; /* Protected mode cr0 */ - u32 pmode_cr3; /* Protected mode cr3 */ - u32 pmode_cr4; /* Protected mode cr4 */ - u32 pmode_efer_low; /* Protected mode EFER */ - u32 pmode_efer_high; - u64 pmode_gdt; - u32 pmode_misc_en_low; /* Protected mode MISC_ENABLE */ - u32 pmode_misc_en_high; - u32 pmode_behavior; /* Wakeup routine behavior flags */ - u32 realmode_flags; - u32 real_magic; - u16 trampoline_segment; /* segment with trampoline code, 64-bit only */ - u8 _pad1; - u8 wakeup_jmp; - u16 wakeup_jmp_off; - u16 wakeup_jmp_seg; - u64 wakeup_gdt[3]; - u32 signature; /* To check we have correct structure */ -} __attribute__((__packed__)); - -extern struct wakeup_header wakeup_header; -#endif - -#define WAKEUP_HEADER_OFFSET 8 
-#define WAKEUP_HEADER_SIGNATURE 0x51ee1111 -#define WAKEUP_END_SIGNATURE 0x65a22c82 - -/* Wakeup behavior bits */ -#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0 - -#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S deleted file mode 100644 index d4f8010a5b1b..000000000000 --- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S +++ /dev/null @@ -1,62 +0,0 @@ -/* - * wakeup.ld - * - * Linker script for the real-mode wakeup code - */ -#undef i386 -#include "wakeup.h" - -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") -OUTPUT_ARCH(i386) -ENTRY(_start) - -SECTIONS -{ - . = 0; - .jump : { - *(.jump) - } = 0x90909090 - - . = WAKEUP_HEADER_OFFSET; - .header : { - *(.header) - } - - . = ALIGN(16); - .text : { - *(.text*) - } = 0x90909090 - - . = ALIGN(16); - .rodata : { - *(.rodata*) - } - - .videocards : { - video_cards = .; - *(.videocards) - video_cards_end = .; - } - - . = ALIGN(16); - .data : { - *(.data*) - } - - . = ALIGN(16); - .bss : { - __bss_start = .; - *(.bss) - __bss_end = .; - } - - .signature : { - *(.signature) - } - - _end = .; - - /DISCARD/ : { - *(.note*) - } -} diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 146a49c763a4..d941b62da4b6 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -14,8 +14,9 @@ #include #include #include +#include -#include "realmode/wakeup.h" +#include "../../realmode/rm/wakeup/wakeup.h" #include "sleep.h" unsigned long acpi_realmode_flags; @@ -36,13 +37,9 @@ asmlinkage void acpi_enter_s3(void) */ int acpi_suspend_lowlevel(void) { - struct wakeup_header *header; - /* address in low memory of the wakeup routine. 
*/ - char *acpi_realmode; + struct wakeup_header *header = + (struct wakeup_header *) __va(real_mode_header.wakeup_header); - acpi_realmode = TRAMPOLINE_SYM(acpi_wakeup_code); - - header = (struct wakeup_header *)(acpi_realmode + WAKEUP_HEADER_OFFSET); if (header->signature != WAKEUP_HEADER_SIGNATURE) { printk(KERN_ERR "wakeup header does not match\n"); return -EINVAL; @@ -50,27 +47,6 @@ int acpi_suspend_lowlevel(void) header->video_mode = saved_video_mode; - header->wakeup_jmp_seg = acpi_wakeup_address >> 4; - - /* - * Set up the wakeup GDT. We set these up as Big Real Mode, - * that is, with limits set to 4 GB. At least the Lenovo - * Thinkpad X61 is known to need this for the video BIOS - * initialization quirk to work; this is likely to also - * be the case for other laptops or integrated video devices. - */ - - /* GDT[0]: GDT self-pointer */ - header->wakeup_gdt[0] = - (u64)(sizeof(header->wakeup_gdt) - 1) + - ((u64)__pa(&header->wakeup_gdt) << 16); - /* GDT[1]: big real mode-like code segment */ - header->wakeup_gdt[1] = - GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff); - /* GDT[2]: big real mode-like data segment */ - header->wakeup_gdt[2] = - GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff); - #ifndef CONFIG_64BIT store_gdt((struct desc_ptr *)&header->pmode_gdt); @@ -95,7 +71,6 @@ int acpi_suspend_lowlevel(void) header->pmode_cr3 = (u32)__pa(&initial_page_table); saved_magic = 0x12345678; #else /* CONFIG_64BIT */ - header->trampoline_segment = trampoline_address() >> 4; #ifdef CONFIG_SMP stack_start = (unsigned long)temp_stack + sizeof(temp_stack); early_gdt_descr.address = diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index d68677a2a010..5653a5791ec9 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -2,8 +2,8 @@ * Variables and functions used by the code in sleep.c */ -#include #include +#include extern unsigned long saved_video_mode; extern long saved_magic; diff --git 
a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S deleted file mode 100644 index 63b8ab524f2c..000000000000 --- a/arch/x86/kernel/acpi/wakeup_rm.S +++ /dev/null @@ -1,12 +0,0 @@ -/* - * Wrapper script for the realmode binary as a transport object - * before copying to low memory. - */ -#include - - .section ".x86_trampoline","a" - .balign PAGE_SIZE - .globl acpi_wakeup_code -acpi_wakeup_code: - .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin" - .size acpi_wakeup_code, .-acpi_wakeup_code diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 51ff18616d50..c18f59d10101 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 3a3b779f41d3..037df57a99ac 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -24,7 +24,6 @@ #include #include #include -#include #include static void __init zap_identity_mappings(void) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index ca470e4c92dc..f44d31157353 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 56e41242a6b8..7a14fece9cfc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -73,7 +73,6 @@ #include #include -#include #include #include #include @@ -918,7 +917,6 @@ void __init setup_arch(char **cmdline_p) printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", max_pfn_mapped< #include -#include +#include #include #include #include @@ -201,7 +201,8 @@ static int tboot_setup_sleep(void) add_mac_region(e820.map[i].addr, e820.map[i].size); } - tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; + tboot->acpi_sinfo.kernel_s3_resume_vector = + real_mode_header.wakeup_start; return 0; } diff --git 
a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c deleted file mode 100644 index a73b61055ad6..000000000000 --- a/arch/x86/kernel/trampoline.c +++ /dev/null @@ -1,42 +0,0 @@ -#include -#include - -#include -#include -#include - -unsigned char *x86_trampoline_base; - -void __init setup_trampolines(void) -{ - phys_addr_t mem; - size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start); - - /* Has to be in very low memory so we can execute real-mode AP code. */ - mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); - if (!mem) - panic("Cannot allocate trampoline\n"); - - x86_trampoline_base = __va(mem); - memblock_reserve(mem, size); - - printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", - x86_trampoline_base, (unsigned long long)mem, size); - - memcpy(x86_trampoline_base, x86_trampoline_start, size); -} - -/* - * setup_trampolines() gets called very early, to guarantee the - * availability of low memory. This is before the proper kernel page - * tables are set up, so we cannot set page permissions in that - * function. Thus, we use an arch_initcall instead. - */ -static int __init configure_trampolines(void) -{ - size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start); - - set_memory_x((unsigned long)x86_trampoline_base, size >> PAGE_SHIFT); - return 0; -} -arch_initcall(configure_trampolines); diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S deleted file mode 100644 index 451c0a7ef7fd..000000000000 --- a/arch/x86/kernel/trampoline_32.S +++ /dev/null @@ -1,83 +0,0 @@ -/* - * - * Trampoline.S Derived from Setup.S by Linus Torvalds - * - * 4 Jan 1997 Michael Chastain: changed to gnu as. - * - * This is only used for booting secondary CPUs in SMP machine - * - * Entry: CS:IP point to the start of our code, we are - * in real mode with no stack, but the rest of the - * trampoline page to make our stack and everything else - * is a mystery. - * - * We jump into arch/x86/kernel/head_32.S. 
- * - * On entry to trampoline_data, the processor is in real mode - * with 16-bit addressing and 16-bit data. CS has some value - * and IP is zero. Thus, data addresses need to be absolute - * (no relocation) and are taken with regard to r_base. - * - * If you work on this file, check the object module with - * objdump --reloc to make sure there are no relocation - * entries except for: - * - * TYPE VALUE - * R_386_32 startup_32_smp - * R_386_32 boot_gdt - */ - -#include -#include -#include -#include - -#ifdef CONFIG_SMP - - .section ".x86_trampoline","a" - .balign PAGE_SIZE - .code16 - -ENTRY(trampoline_data) -r_base = . - wbinvd # Needed for NUMA-Q should be harmless for others - mov %cs, %ax # Code and data in the same place - mov %ax, %ds - - cli # We should be safe anyway - - movl $0xA5A5A5A5, trampoline_status - r_base - # write marker for master knows we're running - - /* GDT tables in non default location kernel can be beyond 16MB and - * lgdt will not be able to load the address as in real mode default - * operand size is 16bit. Use lgdtl instead to force operand size - * to 32 bit. 
- */ - - lidtl boot_idt_descr - r_base # load idt with 0, 0 - lgdtl boot_gdt_descr - r_base # load gdt with whatever is appropriate - - xor %ax, %ax - inc %ax # protected mode (PE) bit - lmsw %ax # into protected mode - # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S - ljmpl $__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET) - - # These need to be in the same 64K segment as the above; - # hence we don't use the boot_gdt_descr defined in head.S -boot_gdt_descr: - .word __BOOT_DS + 7 # gdt limit - .long boot_gdt - __PAGE_OFFSET # gdt base - -boot_idt_descr: - .word 0 # idt limit = 0 - .long 0 # idt base = 0L - -ENTRY(trampoline_status) - .long 0 - -.globl trampoline_end -trampoline_end: - -#endif /* CONFIG_SMP */ diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S deleted file mode 100644 index 09ff51799e96..000000000000 --- a/arch/x86/kernel/trampoline_64.S +++ /dev/null @@ -1,171 +0,0 @@ -/* - * - * Trampoline.S Derived from Setup.S by Linus Torvalds - * - * 4 Jan 1997 Michael Chastain: changed to gnu as. - * 15 Sept 2005 Eric Biederman: 64bit PIC support - * - * Entry: CS:IP point to the start of our code, we are - * in real mode with no stack, but the rest of the - * trampoline page to make our stack and everything else - * is a mystery. - * - * On entry to trampoline_data, the processor is in real mode - * with 16-bit addressing and 16-bit data. CS has some value - * and IP is zero. Thus, data addresses need to be absolute - * (no relocation) and are taken with regard to r_base. - * - * With the addition of trampoline_level4_pgt this code can - * now enter a 64bit kernel that lives at arbitrary 64bit - * physical addresses. - * - * If you work on this file, check the object module with objdump - * --full-contents --reloc to make sure there are no relocation - * entries. 
- */ - -#include -#include -#include -#include -#include -#include -#include - - .section ".x86_trampoline","a" - .balign PAGE_SIZE - .code16 - -ENTRY(trampoline_data) -r_base = . - cli # We should be safe anyway - wbinvd - mov %cs, %ax # Code and data in the same place - mov %ax, %ds - mov %ax, %es - mov %ax, %ss - - - movl $0xA5A5A5A5, trampoline_status - r_base - # write marker for master knows we're running - - # Setup stack - movw $(trampoline_stack_end - r_base), %sp - - call verify_cpu # Verify the cpu supports long mode - testl %eax, %eax # Check for return code - jnz no_longmode - - mov %cs, %ax - movzx %ax, %esi # Find the 32bit trampoline location - shll $4, %esi - - # Fixup the absolute vectors - leal (startup_32 - r_base)(%esi), %eax - movl %eax, startup_32_vector - r_base - leal (startup_64 - r_base)(%esi), %eax - movl %eax, startup_64_vector - r_base - leal (tgdt - r_base)(%esi), %eax - movl %eax, (tgdt + 2 - r_base) - - /* - * GDT tables in non default location kernel can be beyond 16MB and - * lgdt will not be able to load the address as in real mode default - * operand size is 16bit. Use lgdtl instead to force operand size - * to 32 bit. 
- */ - - lidtl tidt - r_base # load idt with 0, 0 - lgdtl tgdt - r_base # load gdt with whatever is appropriate - - mov $X86_CR0_PE, %ax # protected mode (PE) bit - lmsw %ax # into protected mode - - # flush prefetch and jump to startup_32 - ljmpl *(startup_32_vector - r_base) - - .code32 - .balign 4 -startup_32: - movl $__KERNEL_DS, %eax # Initialize the %ds segment register - movl %eax, %ds - - movl $X86_CR4_PAE, %eax - movl %eax, %cr4 # Enable PAE mode - - # Setup trampoline 4 level pagetables - leal (trampoline_level4_pgt - r_base)(%esi), %eax - movl %eax, %cr3 - - movl $MSR_EFER, %ecx - movl $(1 << _EFER_LME), %eax # Enable Long Mode - xorl %edx, %edx - wrmsr - - # Enable paging and in turn activate Long Mode - # Enable protected mode - movl $(X86_CR0_PG | X86_CR0_PE), %eax - movl %eax, %cr0 - - /* - * At this point we're in long mode but in 32bit compatibility mode - * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn - * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use - * the new gdt/idt that has __KERNEL_CS with CS.L = 1. 
- */ - ljmp *(startup_64_vector - r_base)(%esi) - - .code64 - .balign 4 -startup_64: - # Now jump into the kernel using virtual addresses - movq $secondary_startup_64, %rax - jmp *%rax - - .code16 -no_longmode: - hlt - jmp no_longmode -#include "verify_cpu.S" - - .balign 4 - # Careful these need to be in the same 64K segment as the above; -tidt: - .word 0 # idt limit = 0 - .word 0, 0 # idt base = 0L - - # Duplicate the global descriptor table - # so the kernel can live anywhere - .balign 4 -tgdt: - .short tgdt_end - tgdt # gdt limit - .long tgdt - r_base - .short 0 - .quad 0x00cf9b000000ffff # __KERNEL32_CS - .quad 0x00af9b000000ffff # __KERNEL_CS - .quad 0x00cf93000000ffff # __KERNEL_DS -tgdt_end: - - .balign 4 -startup_32_vector: - .long startup_32 - r_base - .word __KERNEL32_CS, 0 - - .balign 4 -startup_64_vector: - .long startup_64 - r_base - .word __KERNEL_CS, 0 - - .balign 4 -ENTRY(trampoline_status) - .long 0 - -trampoline_stack: - .org 0x1000 -trampoline_stack_end: -ENTRY(trampoline_level4_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 510,8,0 - .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE - -ENTRY(trampoline_end) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 0f703f10901a..22a1530146a8 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -197,18 +197,6 @@ SECTIONS INIT_DATA_SECTION(16) - /* - * Code and data for a variety of lowlevel trampolines, to be - * copied into base memory (< 1 MiB) during initialization. - * Since it is copied early, the main copy can be discarded - * afterwards. 
- */ - .x86_trampoline : AT(ADDR(.x86_trampoline) - LOAD_OFFSET) { - x86_trampoline_start = .; - *(.x86_trampoline) - x86_trampoline_end = .; - } - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { __x86_cpu_dev_start = .; *(.x86_cpu_dev.init) diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 56ec64f94e69..2432acb6b04f 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -14,9 +14,13 @@ always := realmode.bin realmode-y += header.o realmode-$(CONFIG_X86_32) += reboot_32.o realmode-y += trampoline_$(BITS).o +realmode-$(CONFIG_ACPI_SLEEP) += wakeup/wakeup.o targets += $(realmode-y) +$(obj)/wakeup/wakeup.o: FORCE + $(Q)$(MAKE) $(build)=$(obj)/wakeup $@ + REALMODE_OBJS = $(addprefix $(obj)/,$(realmode-y)) sed-pasyms := -n -r -e 's/^([0-9a-fA-F]+) [ABCDGRSTVW] (.+)$$/pa_\2 = \2;/p' diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index a97900409c61..730b1316c099 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -26,5 +26,10 @@ ENTRY(real_mode_header) .long pa_startup_64_smp .long pa_level3_ident_pgt .long pa_level3_kernel_pgt +#endif + /* ACPI sleep */ +#ifdef CONFIG_ACPI_SLEEP + .long pa_wakeup_start + .long pa_wakeup_header #endif END(real_mode_header) diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S index c5b8a4f31ba3..91b83ea55c37 100644 --- a/arch/x86/realmode/rm/realmode.lds.S +++ b/arch/x86/realmode/rm/realmode.lds.S @@ -25,6 +25,10 @@ SECTIONS .rodata : { *(.rodata) *(.rodata.*) + . = ALIGN(16); + video_cards = .; + *(.videocards) + video_cards_end = .; } . 
= ALIGN(PAGE_SIZE); diff --git a/arch/x86/realmode/rm/wakeup/.gitignore b/arch/x86/realmode/rm/wakeup/.gitignore new file mode 100644 index 000000000000..58f1f48a58f8 --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/.gitignore @@ -0,0 +1,3 @@ +wakeup.bin +wakeup.elf +wakeup.lds diff --git a/arch/x86/realmode/rm/wakeup/Makefile b/arch/x86/realmode/rm/wakeup/Makefile new file mode 100644 index 000000000000..4c8533240cdd --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/Makefile @@ -0,0 +1,33 @@ +# +# arch/x86/kernel/acpi/realmode/Makefile +# +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. +# + +always := wakeup.o + +wakeup-y += wakeup_asm.o wakemain.o video-mode.o +wakeup-y += copy.o bioscall.o regs.o + +# The link order of the video-*.o modules can matter. In particular, +# video-vga.o *must* be listed first, followed by video-vesa.o. +# Hardware-specific drivers should follow in the order they should be +# probed, and video-bios.o should typically be last. 
+wakeup-y += video-vga.o +wakeup-y += video-vesa.o +wakeup-y += video-bios.o + +targets += $(wakeup-y) + +WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y)) + +LDFLAGS_wakeup.o := -m elf_i386 -r +$(obj)/wakeup.o: $(WAKEUP_OBJS) FORCE + $(call if_changed,ld) + +bootsrc := $(src)/../../../boot + +ccflags-y += -D_WAKEUP -I$(srctree)/$(bootsrc) +asflags-y += -D_WAKEUP -I$(srctree)/$(bootsrc) diff --git a/arch/x86/realmode/rm/wakeup/bioscall.S b/arch/x86/realmode/rm/wakeup/bioscall.S new file mode 100644 index 000000000000..f51eb0bb56ce --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/bioscall.S @@ -0,0 +1 @@ +#include "../../../boot/bioscall.S" diff --git a/arch/x86/realmode/rm/wakeup/copy.S b/arch/x86/realmode/rm/wakeup/copy.S new file mode 100644 index 000000000000..dc59ebee69d8 --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/copy.S @@ -0,0 +1 @@ +#include "../../../boot/copy.S" diff --git a/arch/x86/realmode/rm/wakeup/regs.c b/arch/x86/realmode/rm/wakeup/regs.c new file mode 100644 index 000000000000..6206033ba202 --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/regs.c @@ -0,0 +1 @@ +#include "../../../boot/regs.c" diff --git a/arch/x86/realmode/rm/wakeup/video-bios.c b/arch/x86/realmode/rm/wakeup/video-bios.c new file mode 100644 index 000000000000..7deabc144a27 --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/video-bios.c @@ -0,0 +1 @@ +#include "../../../boot/video-bios.c" diff --git a/arch/x86/realmode/rm/wakeup/video-mode.c b/arch/x86/realmode/rm/wakeup/video-mode.c new file mode 100644 index 000000000000..328ad209f113 --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/video-mode.c @@ -0,0 +1 @@ +#include "../../../boot/video-mode.c" diff --git a/arch/x86/realmode/rm/wakeup/video-vesa.c b/arch/x86/realmode/rm/wakeup/video-vesa.c new file mode 100644 index 000000000000..9dbb9672226a --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/video-vesa.c @@ -0,0 +1 @@ +#include "../../../boot/video-vesa.c" diff --git a/arch/x86/realmode/rm/wakeup/video-vga.c 
b/arch/x86/realmode/rm/wakeup/video-vga.c new file mode 100644 index 000000000000..bcc81255f374 --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/video-vga.c @@ -0,0 +1 @@ +#include "../../../boot/video-vga.c" diff --git a/arch/x86/realmode/rm/wakeup/wakemain.c b/arch/x86/realmode/rm/wakeup/wakemain.c new file mode 100644 index 000000000000..91405d515ec6 --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/wakemain.c @@ -0,0 +1,82 @@ +#include "wakeup.h" +#include "boot.h" + +static void udelay(int loops) +{ + while (loops--) + io_delay(); /* Approximately 1 us */ +} + +static void beep(unsigned int hz) +{ + u8 enable; + + if (!hz) { + enable = 0x00; /* Turn off speaker */ + } else { + u16 div = 1193181/hz; + + outb(0xb6, 0x43); /* Ctr 2, squarewave, load, binary */ + io_delay(); + outb(div, 0x42); /* LSB of counter */ + io_delay(); + outb(div >> 8, 0x42); /* MSB of counter */ + io_delay(); + + enable = 0x03; /* Turn on speaker */ + } + inb(0x61); /* Dummy read of System Control Port B */ + io_delay(); + outb(enable, 0x61); /* Enable timer 2 output to speaker */ + io_delay(); +} + +#define DOT_HZ 880 +#define DASH_HZ 587 +#define US_PER_DOT 125000 + +/* Okay, this is totally silly, but it's kind of fun. 
*/ +static void send_morse(const char *pattern) +{ + char s; + + while ((s = *pattern++)) { + switch (s) { + case '.': + beep(DOT_HZ); + udelay(US_PER_DOT); + beep(0); + udelay(US_PER_DOT); + break; + case '-': + beep(DASH_HZ); + udelay(US_PER_DOT * 3); + beep(0); + udelay(US_PER_DOT); + break; + default: /* Assume it's a space */ + udelay(US_PER_DOT * 3); + break; + } + } +} + +void main(void) +{ + /* Kill machine if structures are wrong */ + if (wakeup_header.real_magic != 0x12345678) + while (1) + ; + + if (wakeup_header.realmode_flags & 4) + send_morse("...-"); + + if (wakeup_header.realmode_flags & 1) + asm volatile("lcallw $0xc000,$3"); + + if (wakeup_header.realmode_flags & 2) { + /* Need to call BIOS */ + probe_cards(0); + set_mode(wakeup_header.video_mode); + } +} diff --git a/arch/x86/realmode/rm/wakeup/wakeup.h b/arch/x86/realmode/rm/wakeup/wakeup.h new file mode 100644 index 000000000000..2dfaf06b8af1 --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/wakeup.h @@ -0,0 +1,41 @@ +/* + * Definitions for the wakeup data structure at the head of the + * wakeup code. 
+ */ + +#ifndef ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H +#define ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H + +#ifndef __ASSEMBLY__ +#include + +/* This must match data at wakeup.S */ +struct wakeup_header { + u16 video_mode; /* Video mode number */ + u32 pmode_entry; /* Protected mode resume point, 32-bit only */ + u16 pmode_cs; + u32 pmode_cr0; /* Protected mode cr0 */ + u32 pmode_cr3; /* Protected mode cr3 */ + u32 pmode_cr4; /* Protected mode cr4 */ + u32 pmode_efer_low; /* Protected mode EFER */ + u32 pmode_efer_high; + u64 pmode_gdt; + u32 pmode_misc_en_low; /* Protected mode MISC_ENABLE */ + u32 pmode_misc_en_high; + u32 pmode_behavior; /* Wakeup routine behavior flags */ + u32 realmode_flags; + u32 real_magic; + u32 signature; /* To check we have correct structure */ +} __attribute__((__packed__)); + +extern struct wakeup_header wakeup_header; +#endif + +#define WAKEUP_HEADER_OFFSET 8 +#define WAKEUP_HEADER_SIGNATURE 0x51ee1111 +#define WAKEUP_END_SIGNATURE 0x65a22c82 + +/* Wakeup behavior bits */ +#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0 + +#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ diff --git a/arch/x86/realmode/rm/wakeup/wakeup_asm.S b/arch/x86/realmode/rm/wakeup/wakeup_asm.S new file mode 100644 index 000000000000..b61126cb599e --- /dev/null +++ b/arch/x86/realmode/rm/wakeup/wakeup_asm.S @@ -0,0 +1,189 @@ +/* + * ACPI wakeup real mode startup stub + */ +#include +#include +#include +#include +#include +#include "wakeup.h" + + .code16 + +/* This should match the structure in wakeup.h */ + .section ".data", "aw" + .globl wakeup_header +wakeup_header: +video_mode: .short 0 /* Video mode number */ +pmode_entry: .long 0 +pmode_cs: .short __KERNEL_CS +pmode_cr0: .long 0 /* Saved %cr0 */ +pmode_cr3: .long 0 /* Saved %cr3 */ +pmode_cr4: .long 0 /* Saved %cr4 */ +pmode_efer: .quad 0 /* Saved EFER */ +pmode_gdt: .quad 0 +pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */ +pmode_behavior: .long 0 /* Wakeup behavior flags */ +realmode_flags: .long 0 +real_magic: .long 0 
+signature: .long WAKEUP_HEADER_SIGNATURE + .size wakeup_header, .-wakeup_header + + .text + .code16 + .globl wakeup_start +wakeup_start: + cli + cld + + .byte 0xea /* ljmpw */ + .word 3f + .word real_mode_seg +3: + /* Apparently some dimwit BIOS programmers don't know how to + program a PM to RM transition, and we might end up here with + junk in the data segment descriptor registers. The only way + to repair that is to go into PM and fix it ourselves... */ + movw $16, %cx + lgdtl %cs:wakeup_gdt + movl %cr0, %eax + orb $X86_CR0_PE, %al + movl %eax, %cr0 + ljmpw $8, $2f +2: + movw %cx, %ds + movw %cx, %es + movw %cx, %ss + movw %cx, %fs + movw %cx, %gs + + andb $~X86_CR0_PE, %al + movl %eax, %cr0 + .byte 0xea /* ljmpw */ + .word 3f + .word real_mode_seg +3: + /* Set up segments */ + movw %cs, %ax + movw %ax, %ds + movw %ax, %es + movw %ax, %ss + lidtl wakeup_idt + + movl $wakeup_stack_end, %esp + + /* Clear the EFLAGS */ + pushl $0 + popfl + + /* Check header signature... */ + movl signature, %eax + cmpl $WAKEUP_HEADER_SIGNATURE, %eax + jne bogus_real_magic + + /* Check we really have everything... */ + movl end_signature, %eax + cmpl $WAKEUP_END_SIGNATURE, %eax + jne bogus_real_magic + + /* Call the C code */ + calll main + + /* Restore MISC_ENABLE before entering protected mode, in case + BIOS decided to clear XD_DISABLE during S3. */ + movl pmode_behavior, %eax + btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax + jnc 1f + + movl pmode_misc_en, %eax + movl pmode_misc_en + 4, %edx + movl $MSR_IA32_MISC_ENABLE, %ecx + wrmsr +1: + + /* Do any other stuff... */ + +#ifndef CONFIG_64BIT + /* This could also be done in C code... */ + movl pmode_cr3, %eax + movl %eax, %cr3 + + movl pmode_cr4, %ecx + jecxz 1f + movl %ecx, %cr4 +1: + movl pmode_efer, %eax + movl pmode_efer + 4, %edx + movl %eax, %ecx + orl %edx, %ecx + jz 1f + movl $MSR_EFER, %ecx + wrmsr +1: + + lgdtl pmode_gdt + + /* This really couldn't... 
*/ + movl pmode_cr0, %eax + movl %eax, %cr0 + ljmpl *pmode_entry +#else + jmp trampoline_data +#endif + +bogus_real_magic: +1: + hlt + jmp 1b + + .section ".rodata","a" + + /* + * Set up the wakeup GDT. We set these up as Big Real Mode, + * that is, with limits set to 4 GB. At least the Lenovo + * Thinkpad X61 is known to need this for the video BIOS + * initialization quirk to work; this is likely to also + * be the case for other laptops or integrated video devices. + */ + + .globl wakeup_gdt + .balign 16 +wakeup_gdt: + .word 3*8-1 /* Self-descriptor */ + .long pa_wakeup_gdt + .word 0 + + .word 0xffff /* 16-bit code segment @ real_mode_base */ + .long 0x9b000000 + pa_real_mode_base + .word 0x008f /* big real mode */ + + .word 0xffff /* 16-bit data segment @ real_mode_base */ + .long 0x93000000 + pa_real_mode_base + .word 0x008f /* big real mode */ + .size wakeup_gdt, .-wakeup_gdt + + .data + .balign 8 + + /* This is the standard real-mode IDT */ +wakeup_idt: + .word 0xffff /* limit */ + .long 0 /* address */ + .word 0 + + .globl HEAP, heap_end +HEAP: + .long wakeup_heap +heap_end: + .long wakeup_stack + + .bss +wakeup_heap: + .space 2048 +wakeup_stack: + .space 2048 +wakeup_stack_end: + + .section ".signature","a" +end_signature: + .long WAKEUP_END_SIGNATURE diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index eb6fd233764b..e77aa4a1c9f6 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -25,6 +25,8 @@ #include #include +#include + #include "internal.h" #include "sleep.h" @@ -91,13 +93,13 @@ static struct notifier_block tts_notifier = { static int acpi_sleep_prepare(u32 acpi_state) { #ifdef CONFIG_ACPI_SLEEP + unsigned long wakeup_pa = real_mode_header.wakeup_start; /* do we have a wakeup address for S2 and S3? 
*/ if (acpi_state == ACPI_STATE_S3) { - if (!acpi_wakeup_address) { + if (!wakeup_pa) return -EFAULT; - } acpi_set_firmware_waking_vector( - (acpi_physical_address)acpi_wakeup_address); + (acpi_physical_address)wakeup_pa); } ACPI_FLUSH_CPU_CACHE(); -- cgit v1.2.3 From f156ffc439951b63cfa9f4d999a8d54267f13282 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:30 +0300 Subject: x86, realmode: Set permission for real mode pages Set proper permissions for rodata, text and data, removing the realmode trampoline area as a remaining RWX memory mapping in the kernel. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-8-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/realmode.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c index a465775b32f2..d85ac20bb4eb 100644 --- a/arch/x86/kernel/realmode.c +++ b/arch/x86/kernel/realmode.c @@ -86,7 +86,21 @@ static int __init set_real_mode_permissions(void) PAGE_ALIGN(real_mode_header.end) - __pa(real_mode_base); - set_memory_x((unsigned long) real_mode_base, all_size >> PAGE_SHIFT); + size_t ro_size = + PAGE_ALIGN(real_mode_header.ro_end) - + __pa(real_mode_base); + + size_t text_size = + PAGE_ALIGN(real_mode_header.ro_end) - + real_mode_header.text_start; + + unsigned long text_start = + (unsigned long) __va(real_mode_header.text_start); + + set_memory_nx((unsigned long) real_mode_base, all_size >> PAGE_SHIFT); + set_memory_ro((unsigned long) real_mode_base, ro_size >> PAGE_SHIFT); + set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT); + return 0; } -- cgit v1.2.3 From 487f50ffeb142d8f86fff6e43a8852ce3d46c173 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 8 May 2012 21:22:32 +0300 Subject: x86, realmode: Add .text64 section, make barrier symbols absolute Add a .text64 section. 
The purpose of this is to keep 16-, 32- and 64-bit code segregated into separate sections, mainly to keep disassembly sane. Move barrier symbols out of sections to avoid the "symbol in empty section" problem in some versions of GNU ld. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1336501366-28617-10-git-send-email-jarkko.sakkinen@intel.com --- arch/x86/realmode/rm/realmode.lds.S | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S index 91b83ea55c37..4d4afcaf5f02 100644 --- a/arch/x86/realmode/rm/realmode.lds.S +++ b/arch/x86/realmode/rm/realmode.lds.S @@ -32,8 +32,8 @@ SECTIONS } . = ALIGN(PAGE_SIZE); + pa_text_start = .; .text : { - pa_text_start = .; *(.text) *(.text.*) } @@ -41,9 +41,14 @@ SECTIONS .text32 : { *(.text32) *(.text32.*) - pa_ro_end = .; } + .text64 : { + *(.text64) + *(.text64.*) + } + pa_ro_end = .; + . = ALIGN(PAGE_SIZE); .data : { *(.data) @@ -59,8 +64,8 @@ SECTIONS . = ALIGN(4); .signature : { *(.signature) - pa_end = .; } + pa_end = .; /DISCARD/ : { *(.note*) -- cgit v1.2.3 From 024742861124ef26dae4cfc620250f8f47ac934a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 8 May 2012 21:22:33 +0300 Subject: x86, realmode: Move bits to the proper sections in trampoline_64.S Move various bits in trampoline_64.S to the sections they really belong in. Use GLOBAL() rather than ENTRY() for data objects: ENTRY() should only be used with code and forces alignment to 16 bytes. Signed-off-by: H.
Peter Anvin Link: http://lkml.kernel.org/r/1336501366-28617-11-git-send-email-jarkko.sakkinen@intel.com --- arch/x86/realmode/rm/trampoline_64.S | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 063da008d520..66c58cf15503 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -80,6 +80,7 @@ no_longmode: jmp no_longmode #include "../kernel/verify_cpu.S" + .section ".text32","ax" .code32 .balign 4 ENTRY(startup_32) @@ -114,6 +115,7 @@ ENTRY(startup_32) */ ljmpl *(pa_startup_64_vector) + .section ".text64","ax" .code64 .balign 4 ENTRY(startup_64) @@ -123,7 +125,8 @@ ENTRY(startup_64) addl %esi, %eax jmp *%rax - # Careful these need to be in the same 64K segment as the above; + .section ".rodata","a" + .balign 16 tidt: .word 0 # idt limit = 0 .word 0, 0 # idt base = 0L @@ -153,9 +156,8 @@ startup_64_vector: .word __KERNEL_CS, 0 .data - .balign 4 -ENTRY(trampoline_status) +GLOBAL(trampoline_status) .long 0 trampoline_stack: @@ -164,7 +166,7 @@ trampoline_stack_end: .globl level3_ident_pgt .globl level3_kernel_pgt -ENTRY(trampoline_level4_pgt) +GLOBAL(trampoline_level4_pgt) level3_ident_pgt: .quad 0 .fill 510,8,0 level3_kernel_pgt: .quad 0 -- cgit v1.2.3 From f7436a9da902922a48cccc208099763b87d6171f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 8 May 2012 21:22:34 +0300 Subject: x86, realmode: Align .data section in trampoline_32.S Specify the alignment of the .data section in trampoline_32.S. Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/r/1336501366-28617-12-git-send-email-jarkko.sakkinen@intel.com --- arch/x86/realmode/rm/trampoline_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S index 18cb7fc9fad4..1f9e3316f73d 100644 --- a/arch/x86/realmode/rm/trampoline_32.S +++ b/arch/x86/realmode/rm/trampoline_32.S @@ -68,7 +68,7 @@ trampoline_data: .data .globl startup_32_smp, boot_gdt, trampoline_status - + .balign 4 boot_gdt_descr: .word __BOOT_DS + 7 # gdt limit boot_gdt: -- cgit v1.2.3 From 056a43a6d3ab903a798d8ee4435ad67d6fccc3e6 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 8 May 2012 21:22:35 +0300 Subject: x86, realmode: Remove indirect jumps in trampoline_64.S Remove indirect jumps in trampoline_64.S which are no longer necessary: the realmode code can relocate the absolute jumps correctly from the start. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1336501366-28617-13-git-send-email-jarkko.sakkinen@intel.com --- arch/x86/realmode/rm/trampoline_64.S | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 66c58cf15503..77b72b45d705 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -73,7 +73,7 @@ ENTRY(trampoline_data) lmsw %ax # into protected mode # flush prefetch and jump to startup_32 - ljmpl *(startup_32_vector) + ljmpl $__KERNEL32_CS, $pa_startup_32 no_longmode: hlt @@ -113,7 +113,7 @@ ENTRY(startup_32) * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use * the new gdt/idt that has __KERNEL_CS with CS.L = 1. 
*/ - ljmpl *(pa_startup_64_vector) + ljmpl $__KERNEL_CS, $pa_startup_64 .section ".text64","ax" .code64 @@ -144,17 +144,6 @@ tgdt: .quad 0x00cf93000000ffff # __KERNEL_DS tgdt_end: - .balign 4 -startup_32_vector: - .long pa_startup_32 - .word __KERNEL32_CS, 0 - - .balign 4 - .globl startup_64_vector -startup_64_vector: - .long pa_startup_64 - .word __KERNEL_CS, 0 - .data .balign 4 GLOBAL(trampoline_status) -- cgit v1.2.3 From 968ff9ee56f1e3ed4ff4a6d10185865dc77d8f7e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 8 May 2012 21:22:36 +0300 Subject: x86, realmode: Remove indirect jumps in trampoline_32 and wakeup_asm Remove indirect jumps in trampoline_32.S and the 32-bit part of wakeup_asm.S. There exist systems which are known to do weird things if an SMI comes in right after a mode switch, and the safest way to deal with it is to always follow with a simple absolute far jump. In the 64-bit code we then do a register indirect near jump; follow that pattern for the 32-bit code. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1336501366-28617-14-git-send-email-jarkko.sakkinen@intel.com --- arch/x86/realmode/rm/trampoline_32.S | 22 +++++++++++++--------- arch/x86/realmode/rm/wakeup/wakeup_asm.S | 8 +++++--- 2 files changed, 18 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S index 1f9e3316f73d..1315ef48dbf1 100644 --- a/arch/x86/realmode/rm/trampoline_32.S +++ b/arch/x86/realmode/rm/trampoline_32.S @@ -47,24 +47,29 @@ trampoline_data: cli # We should be safe anyway + movl startup_32_smp, %eax # where we need to go + movl $0xA5A5A5A5, trampoline_status # write marker for master knows we're running - /* GDT tables in non default location kernel can be beyond 16MB and + /* + * GDT tables in non default location kernel can be beyond 16MB and * lgdt will not be able to load the address as in real mode default * operand size is 16bit.
Use lgdtl instead to force operand size * to 32 bit. */ - lidtl boot_idt_descr # load idt with 0, 0 lgdtl boot_gdt_descr # load gdt with whatever is appropriate - xor %ax, %ax - inc %ax # protected mode (PE) bit - lmsw %ax # into protected mode + movw $1, %dx # protected mode (PE) bit + lmsw %dx # into protected mode - # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S - ljmpl *(startup_32_smp) + ljmpl $__BOOT_CS, $pa_startup_32 + + .section ".text32","ax" + .code32 +ENTRY(startup_32) # note: also used from wakeup_asm.S + jmp *%eax .data .globl startup_32_smp, boot_gdt, trampoline_status @@ -82,5 +87,4 @@ trampoline_status: .long 0 startup_32_smp: - .long 0x00000000 - .word __BOOT_CS, 0 + .long 0 diff --git a/arch/x86/realmode/rm/wakeup/wakeup_asm.S b/arch/x86/realmode/rm/wakeup/wakeup_asm.S index b61126cb599e..4c5c5f2bfbec 100644 --- a/arch/x86/realmode/rm/wakeup/wakeup_asm.S +++ b/arch/x86/realmode/rm/wakeup/wakeup_asm.S @@ -124,9 +124,11 @@ wakeup_start: lgdtl pmode_gdt /* This really couldn't... */ - movl pmode_cr0, %eax - movl %eax, %cr0 - ljmpl *pmode_entry + movl pmode_entry, %eax + movl pmode_cr0, %ecx + movl %ecx, %cr0 + ljmpl $__KERNEL_CS, $pa_startup_32 + /* -> jmp *%eax in trampoline_32.S */ #else jmp trampoline_data #endif -- cgit v1.2.3 From e5684ec438a094bec0f7d5c52652c0901b48b613 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 8 May 2012 21:22:37 +0300 Subject: x86, realmode: Replace open-coded ljmpw with a macro We cannot code an ljmpw to the real-mode segment directly, because gas refuses to assemble an ljmp with a symbolic segment. Instead of open-coding it everywhere, define a macro and use it for this case. This is specifically an ljmpw from a 16-bit segment. This is okay, as one should never enter real mode from a 32-bit segment: if one does, the CPU ends up in a bizarre (and useless) mode sometimes called "unreal mode" where segments behave like real mode but the default address and operand sizes are 32 bits.
Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1336501366-28617-15-git-send-email-jarkko.sakkinen@intel.com --- arch/x86/realmode/rm/realmode.h | 16 ++++++++++++++++ arch/x86/realmode/rm/reboot_32.S | 6 ++---- arch/x86/realmode/rm/trampoline_32.S | 5 ++--- arch/x86/realmode/rm/trampoline_64.S | 5 ++--- arch/x86/realmode/rm/wakeup/wakeup_asm.S | 9 +++------ 5 files changed, 25 insertions(+), 16 deletions(-) create mode 100644 arch/x86/realmode/rm/realmode.h (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/realmode.h b/arch/x86/realmode/rm/realmode.h new file mode 100644 index 000000000000..15ab6335f843 --- /dev/null +++ b/arch/x86/realmode/rm/realmode.h @@ -0,0 +1,16 @@ +#ifndef ARCH_X86_REALMODE_RM_REALMODE_H +#define ARCH_X86_REALMODE_RM_REALMODE_H + +#ifdef __ASSEMBLY__ + +/* + * 16-bit ljmpw to the real_mode_seg + * + * This must be open-coded since gas will choke on using a + * relocatable symbol for the segment portion. + */ +#define LJMPW_RM(to) .byte 0xea ; .word (to), real_mode_seg + +#endif /* __ASSEMBLY__ */ + +#endif /* ARCH_X86_REALMODE_RM_REALMODE_H */ diff --git a/arch/x86/realmode/rm/reboot_32.S b/arch/x86/realmode/rm/reboot_32.S index 83803c222b4a..e90f8c4bbae2 100644 --- a/arch/x86/realmode/rm/reboot_32.S +++ b/arch/x86/realmode/rm/reboot_32.S @@ -2,6 +2,7 @@ #include #include #include +#include "realmode.h" /* * The following code and data reboots the machine by switching to real @@ -82,10 +83,7 @@ machine_real_restart_asm16: 2: andb $0x10, %dl movl %edx, %cr0 - .byte 0xea /* ljmpw */ - .word 3f /* Offset */ - .word real_mode_seg /* Segment */ - + LJMPW_RM(3f) 3: testb $0, %al jz bios diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S index 1315ef48dbf1..279f82ef7a9e 100644 --- a/arch/x86/realmode/rm/trampoline_32.S +++ b/arch/x86/realmode/rm/trampoline_32.S @@ -29,6 +29,7 @@ #include #include #include +#include "realmode.h" .text .code16 @@ -38,9 +39,7 @@ trampoline_data: wbinvd # 
Needed for NUMA-Q should be harmless for others - .byte 0xea # ljmpw - .word 1f # Offset - .word real_mode_seg # Segment + LJMPW_RM(1f) 1: mov %cs, %ax # Code and data in the same place mov %ax, %ds diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 77b72b45d705..7459c52f0c25 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -31,6 +31,7 @@ #include #include #include +#include "realmode.h" .text .balign PAGE_SIZE @@ -40,9 +41,7 @@ ENTRY(trampoline_data) cli # We should be safe anyway wbinvd - .byte 0xea # ljmpw - .word 1f # Offset - .word real_mode_seg # Segment + LJMPW_RM(1f) 1: mov %cs, %ax # Code and data in the same place mov %ax, %ds diff --git a/arch/x86/realmode/rm/wakeup/wakeup_asm.S b/arch/x86/realmode/rm/wakeup/wakeup_asm.S index 4c5c5f2bfbec..8064e1c3591b 100644 --- a/arch/x86/realmode/rm/wakeup/wakeup_asm.S +++ b/arch/x86/realmode/rm/wakeup/wakeup_asm.S @@ -6,6 +6,7 @@ #include #include #include +#include "../realmode.h" #include "wakeup.h" .code16 @@ -36,9 +37,7 @@ wakeup_start: cli cld - .byte 0xea /* ljmpw */ - .word 3f - .word real_mode_seg + LJMPW_RM(3f) 3: /* Apparently some dimwit BIOS programmers don't know how to program a PM to RM transition, and we might end up here with @@ -59,9 +58,7 @@ wakeup_start: andb $~X86_CR0_PE, %al movl %eax, %cr0 - .byte 0xea /* ljmpw */ - .word 3f - .word real_mode_seg + LJMPW_RM(3f) 3: /* Set up segments */ movw %cs, %ax -- cgit v1.2.3 From be60828920d23758da8124bed771404a0438f369 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 8 May 2012 21:22:38 +0300 Subject: x86, realmode: Move trampoline_*.S early in the link order Move trampoline_*.S earlier in the link order so it ends up being first in the text segment; since the SIPI vector requires 4K alignment it otherwise ends up padding the .text segment with that much completely unnecessarily. Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/r/1336501366-28617-16-git-send-email-jarkko.sakkinen@intel.com --- arch/x86/realmode/rm/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 2432acb6b04f..2423142b4da4 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -12,8 +12,8 @@ subdir- := wakeup always := realmode.bin realmode-y += header.o -realmode-$(CONFIG_X86_32) += reboot_32.o realmode-y += trampoline_$(BITS).o +realmode-$(CONFIG_X86_32) += reboot_32.o realmode-$(CONFIG_ACPI_SLEEP) += wakeup/wakeup.o targets += $(realmode-y) -- cgit v1.2.3 From 6feb592dceaed1a6cf26c9747b1180958d5156cd Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 8 May 2012 21:22:39 +0300 Subject: x86, realmode: Fix always-zero test in reboot_32.S A test instruction is an "and", and an and with zero is always zero. This would cause us to always take the BIOS path, not the APM path, in case anyone actually cares... Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1336501366-28617-17-git-send-email-jarkko.sakkinen@intel.com --- arch/x86/realmode/rm/reboot_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/reboot_32.S b/arch/x86/realmode/rm/reboot_32.S index e90f8c4bbae2..50ba994ba921 100644 --- a/arch/x86/realmode/rm/reboot_32.S +++ b/arch/x86/realmode/rm/reboot_32.S @@ -85,7 +85,7 @@ machine_real_restart_asm16: movl %edx, %cr0 LJMPW_RM(3f) 3: - testb $0, %al + andw %ax, %ax jz bios apm: -- cgit v1.2.3 From 8e029fcdd8702719c9179317cae9ef84ebe7027e Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:40 +0300 Subject: x86, realmode: fix 64-bit wakeup sequence There were a number of issues in the wakeup sequence: - Wakeup stack was placed at a hardcoded address. - NX bit in EFER was not enabled.
- Initialization incorrectly set physical address of secondary_startup_64. - Some alignment issues. This patch fixes these issues and in addition: - Unifies coding conventions in .S files. - Sets alignments of code and data right. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-18-git-send-email-jarkko.sakkinen@intel.com Originally-by: H. Peter Anvin Cc: Rafael J. Wysocki Cc: Len Brown Signed-off-by: H. Peter Anvin --- arch/x86/kernel/realmode.c | 2 +- arch/x86/realmode/rm/Makefile | 1 + arch/x86/realmode/rm/header.S | 2 +- arch/x86/realmode/rm/reboot_32.S | 18 ++++---- arch/x86/realmode/rm/stack.S | 19 ++++++++ arch/x86/realmode/rm/trampoline_32.S | 29 ++++++------ arch/x86/realmode/rm/trampoline_64.S | 67 ++++++++++++---------------- arch/x86/realmode/rm/wakeup/wakeup_asm.S | 75 +++++++++++++++----------------- arch/x86/realmode/rmpiggy.S | 4 +- 9 files changed, 110 insertions(+), 107 deletions(-) create mode 100644 arch/x86/realmode/rm/stack.S (limited to 'arch/x86') diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c index d85ac20bb4eb..e7bf82a409bf 100644 --- a/arch/x86/kernel/realmode.c +++ b/arch/x86/kernel/realmode.c @@ -64,7 +64,7 @@ void __init setup_real_mode(void) *((u32 *)__va(real_mode_header.boot_gdt)) = __pa(boot_gdt); #else *((u64 *) __va(real_mode_header.startup_64_smp)) = - (u64) __pa(secondary_startup_64); + (u64)secondary_startup_64; *((u64 *) __va(real_mode_header.level3_ident_pgt)) = __pa(level3_ident_pgt) + _KERNPG_TABLE; diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 2423142b4da4..c2c27a41ab8f 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -13,6 +13,7 @@ always := realmode.bin realmode-y += header.o realmode-y += trampoline_$(BITS).o +realmode-y += stack.o realmode-$(CONFIG_X86_32) += reboot_32.o realmode-$(CONFIG_ACPI_SLEEP) += wakeup/wakeup.o diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index 
730b1316c099..a91ec8f6b15f 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -9,7 +9,7 @@ .section ".header", "a" -ENTRY(real_mode_header) +GLOBAL(real_mode_header) .long pa_text_start .long pa_ro_end .long pa_end diff --git a/arch/x86/realmode/rm/reboot_32.S b/arch/x86/realmode/rm/reboot_32.S index 50ba994ba921..8d9bfd13a93e 100644 --- a/arch/x86/realmode/rm/reboot_32.S +++ b/arch/x86/realmode/rm/reboot_32.S @@ -16,10 +16,9 @@ */ .section ".text32", "ax" .code32 - .globl machine_real_restart_asm - .balign 16 -machine_real_restart_asm: + .balign 16 +ENTRY(machine_real_restart_asm) /* Set up the IDT for real mode. */ lidtl pa_machine_real_restart_idt @@ -67,7 +66,7 @@ machine_real_restart_asm: .text .code16 - .balign 16 + .balign 16 machine_real_restart_asm16: 1: xorl %ecx, %ecx @@ -102,15 +101,15 @@ bios: ljmpw $0xf000, $0xfff0 .section ".rodata", "a" - .globl machine_real_restart_idt, machine_real_restart_gdt - .balign 16 -machine_real_restart_idt: + .balign 16 +GLOBAL(machine_real_restart_idt) .word 0xffff /* Length - real mode default value */ .long 0 /* Base - real mode default value */ +END(machine_real_restart_idt) - .balign 16 -machine_real_restart_gdt: + .balign 16 +GLOBAL(machine_real_restart_gdt) /* Self-pointer */ .word 0xffff /* Length - real mode default value */ .long pa_machine_real_restart_gdt @@ -130,3 +129,4 @@ machine_real_restart_gdt: * semantics we don't have to reload the segments once CR0.PE = 0. 
*/ .quad GDT_ENTRY(0x0093, 0x100, 0xffff) +END(machine_real_restart_gdt) diff --git a/arch/x86/realmode/rm/stack.S b/arch/x86/realmode/rm/stack.S new file mode 100644 index 000000000000..867ae87adfae --- /dev/null +++ b/arch/x86/realmode/rm/stack.S @@ -0,0 +1,19 @@ +/* + * Common heap and stack allocations + */ + +#include + + .data +GLOBAL(HEAP) + .long rm_heap +GLOBAL(heap_end) + .long rm_stack + + .bss + .balign 16 +GLOBAL(rm_heap) + .space 2048 +GLOBAL(rm_stack) + .space 2048 +GLOBAL(rm_stack_end) diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S index 279f82ef7a9e..1ecdbb59191b 100644 --- a/arch/x86/realmode/rm/trampoline_32.S +++ b/arch/x86/realmode/rm/trampoline_32.S @@ -33,10 +33,9 @@ .text .code16 - .globl trampoline_data - .balign PAGE_SIZE -trampoline_data: + .balign PAGE_SIZE +ENTRY(trampoline_data) wbinvd # Needed for NUMA-Q should be harmless for others LJMPW_RM(1f) @@ -70,20 +69,22 @@ trampoline_data: ENTRY(startup_32) # note: also used from wakeup_asm.S jmp *%eax - .data - .globl startup_32_smp, boot_gdt, trampoline_status - .balign 4 -boot_gdt_descr: - .word __BOOT_DS + 7 # gdt limit -boot_gdt: - .long 0 # gdt base + .section ".rodata","a" + .balign 4 boot_idt_descr: .word 0 # idt limit = 0 .long 0 # idt base = 0L -trampoline_status: - .long 0 + .data -startup_32_smp: - .long 0 +boot_gdt_descr: + .word __BOOT_DS + 7 # gdt limit +GLOBAL(boot_gdt) + .long 0 # gdt base + + .bss + + .balign 4 +GLOBAL(trampoline_status) .space 4 +GLOBAL(startup_32_smp) .space 4 diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 7459c52f0c25..f71ea0800d3d 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -52,7 +52,7 @@ ENTRY(trampoline_data) # write marker for master knows we're running # Setup stack - movw $trampoline_stack_end, %sp + movl $rm_stack_end, %esp call verify_cpu # Verify the cpu supports long mode testl %eax, %eax # Check for 
return code @@ -68,8 +68,11 @@ ENTRY(trampoline_data) lidtl tidt # load idt with 0, 0 lgdtl tgdt # load gdt with whatever is appropriate - mov $X86_CR0_PE, %ax # protected mode (PE) bit - lmsw %ax # into protected mode + movw $__KERNEL_DS, %dx # Data segment descriptor + + # Enable protected mode + movl $X86_CR0_PE, %eax # protected mode (PE) bit + movl %eax, %cr0 # into protected mode # flush prefetch and jump to startup_32 ljmpl $__KERNEL32_CS, $pa_startup_32 @@ -83,27 +86,27 @@ no_longmode: .code32 .balign 4 ENTRY(startup_32) - movl $__KERNEL_DS, %eax # Initialize the %ds segment register - movl %eax, %ds + movl %edx, %ss + addl $pa_real_mode_base, %esp + movl %edx, %ds + movl %edx, %es + movl %edx, %fs + movl %edx, %gs movl $X86_CR4_PAE, %eax movl %eax, %cr4 # Enable PAE mode - movl pa_startup_64_smp, %esi - movl pa_startup_64_smp_high, %edi - - # Setup trampoline 4 level pagetables - leal pa_trampoline_level4_pgt, %eax + # Setup trampoline 4 level pagetables + movl $pa_level3_ident_pgt, %eax movl %eax, %cr3 movl $MSR_EFER, %ecx - movl $(1 << _EFER_LME), %eax # Enable Long Mode + movl $((1 << _EFER_LME) | (1 << _EFER_NX)), %eax # Enable Long Mode xorl %edx, %edx wrmsr # Enable paging and in turn activate Long Mode - # Enable protected mode - movl $(X86_CR0_PG | X86_CR0_PE), %eax + movl $(X86_CR0_PG | X86_CR0_WP | X86_CR0_PE), %eax movl %eax, %cr0 /* @@ -119,10 +122,7 @@ ENTRY(startup_32) .balign 4 ENTRY(startup_64) # Now jump into the kernel using virtual addresses - movl %edi, %eax - shlq $32, %rax - addl %esi, %eax - jmp *%rax + jmpq *startup_64_smp(%rip) .section ".rodata","a" .balign 16 @@ -132,10 +132,10 @@ tidt: # Duplicate the global descriptor table # so the kernel can live anywhere - .balign 4 + .balign 16 .globl tgdt tgdt: - .short tgdt_end - tgdt # gdt limit + .short tgdt_end - tgdt - 1 # gdt limit .long pa_tgdt .short 0 .quad 0x00cf9b000000ffff # __KERNEL32_CS @@ -143,23 +143,12 @@ tgdt: .quad 0x00cf93000000ffff # __KERNEL_DS tgdt_end: - .data - 
.balign 4 -GLOBAL(trampoline_status) - .long 0 - -trampoline_stack: - .org 0x1000 -trampoline_stack_end: - - .globl level3_ident_pgt - .globl level3_kernel_pgt -GLOBAL(trampoline_level4_pgt) - level3_ident_pgt: .quad 0 - .fill 510,8,0 - level3_kernel_pgt: .quad 0 - - .globl startup_64_smp - .globl startup_64_smp_high -startup_64_smp: .long 0 -startup_64_smp_high: .long 0 + .bss + + .balign PAGE_SIZE +GLOBAL(level3_ident_pgt) .space 511*8 +GLOBAL(level3_kernel_pgt) .space 8 + + .balign 8 +GLOBAL(startup_64_smp) .space 8 +GLOBAL(trampoline_status) .space 4 diff --git a/arch/x86/realmode/rm/wakeup/wakeup_asm.S b/arch/x86/realmode/rm/wakeup/wakeup_asm.S index 8064e1c3591b..f81c1cd99eaf 100644 --- a/arch/x86/realmode/rm/wakeup/wakeup_asm.S +++ b/arch/x86/realmode/rm/wakeup/wakeup_asm.S @@ -1,6 +1,7 @@ /* * ACPI wakeup real mode startup stub */ +#include #include #include #include @@ -9,31 +10,33 @@ #include "../realmode.h" #include "wakeup.h" - .code16 + .code16 /* This should match the structure in wakeup.h */ - .section ".data", "aw" - .globl wakeup_header -wakeup_header: -video_mode: .short 0 /* Video mode number */ -pmode_entry: .long 0 -pmode_cs: .short __KERNEL_CS -pmode_cr0: .long 0 /* Saved %cr0 */ -pmode_cr3: .long 0 /* Saved %cr3 */ -pmode_cr4: .long 0 /* Saved %cr4 */ -pmode_efer: .quad 0 /* Saved EFER */ -pmode_gdt: .quad 0 -pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */ -pmode_behavior: .long 0 /* Wakeup behavior flags */ -realmode_flags: .long 0 -real_magic: .long 0 -signature: .long WAKEUP_HEADER_SIGNATURE - .size wakeup_header, .-wakeup_header + .section ".data", "aw" + + .balign 16 +GLOBAL(wakeup_header) + video_mode: .short 0 /* Video mode number */ + pmode_entry: .long 0 + pmode_cs: .short __KERNEL_CS + pmode_cr0: .long 0 /* Saved %cr0 */ + pmode_cr3: .long 0 /* Saved %cr3 */ + pmode_cr4: .long 0 /* Saved %cr4 */ + pmode_efer: .quad 0 /* Saved EFER */ + pmode_gdt: .quad 0 + pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */ + pmode_behavior: .long 
0 /* Wakeup behavior flags */ + realmode_flags: .long 0 + real_magic: .long 0 + signature: .long WAKEUP_HEADER_SIGNATURE +END(wakeup_header) .text .code16 - .globl wakeup_start -wakeup_start: + + .balign 16 +ENTRY(wakeup_start) cli cld @@ -62,12 +65,14 @@ wakeup_start: 3: /* Set up segments */ movw %cs, %ax + movw %ax, %ss + movl $rm_stack_end, %esp movw %ax, %ds movw %ax, %es - movw %ax, %ss - lidtl wakeup_idt + movw %ax, %fs + movw %ax, %gs - movl $wakeup_stack_end, %esp + lidtl wakeup_idt /* Clear the EFLAGS */ pushl $0 @@ -145,9 +150,8 @@ bogus_real_magic: * be the case for other laptops or integrated video devices. */ - .globl wakeup_gdt .balign 16 -wakeup_gdt: +GLOBAL(wakeup_gdt) .word 3*8-1 /* Self-descriptor */ .long pa_wakeup_gdt .word 0 @@ -159,29 +163,18 @@ wakeup_gdt: .word 0xffff /* 16-bit data segment @ real_mode_base */ .long 0x93000000 + pa_real_mode_base .word 0x008f /* big real mode */ - .size wakeup_gdt, .-wakeup_gdt +END(wakeup_gdt) - .data + .section ".rodata","a" .balign 8 /* This is the standard real-mode IDT */ -wakeup_idt: + .balign 16 +GLOBAL(wakeup_idt) .word 0xffff /* limit */ .long 0 /* address */ .word 0 - - .globl HEAP, heap_end -HEAP: - .long wakeup_heap -heap_end: - .long wakeup_stack - - .bss -wakeup_heap: - .space 2048 -wakeup_stack: - .space 2048 -wakeup_stack_end: +END(wakeup_idt) .section ".signature","a" end_signature: diff --git a/arch/x86/realmode/rmpiggy.S b/arch/x86/realmode/rmpiggy.S index 6047d7f604cf..fd72a99d12ae 100644 --- a/arch/x86/realmode/rmpiggy.S +++ b/arch/x86/realmode/rmpiggy.S @@ -9,10 +9,10 @@ .balign PAGE_SIZE -ENTRY(real_mode_blob) +GLOBAL(real_mode_blob) .incbin "arch/x86/realmode/rm/realmode.bin" END(real_mode_blob) -ENTRY(real_mode_relocs) +GLOBAL(real_mode_relocs) .incbin "arch/x86/realmode/rm/realmode.relocs" END(real_mode_relocs) -- cgit v1.2.3 From b429dbf6e866bd6dadb56fae66f61f611cde57ff Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:41 +0300 Subject: x86, realmode: 
don't copy real_mode_header Replaced copying of real_mode_header with a pointer to beginning of RM memory. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-19-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/realmode.h | 5 ++-- arch/x86/kernel/acpi/sleep.c | 2 +- arch/x86/kernel/realmode.c | 57 ++++++++++++++++--------------------- arch/x86/kernel/reboot.c | 2 +- arch/x86/kernel/smpboot.c | 4 +-- arch/x86/kernel/tboot.c | 2 +- arch/x86/realmode/rm/header.S | 1 - arch/x86/realmode/rm/realmode.lds.S | 1 - arch/x86/realmode/rmpiggy.S | 2 ++ drivers/acpi/sleep.c | 2 +- 10 files changed, 35 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 1bfc74d213a4..d3ae49f4c3ef 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -8,7 +8,6 @@ struct real_mode_header { u32 text_start; u32 ro_end; - u32 end; /* reboot */ #ifdef CONFIG_X86_32 u32 machine_real_restart_asm; @@ -30,8 +29,8 @@ struct real_mode_header { #endif } __attribute__((__packed__)); -extern struct real_mode_header real_mode_header; -extern unsigned char *real_mode_base; +extern struct real_mode_header *real_mode_header; +extern unsigned char real_mode_blob_end[]; extern unsigned long init_rsp; extern unsigned long initial_code; diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index d941b62da4b6..6ca3f54ebe7d 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -38,7 +38,7 @@ asmlinkage void acpi_enter_s3(void) int acpi_suspend_lowlevel(void) { struct wakeup_header *header = - (struct wakeup_header *) __va(real_mode_header.wakeup_header); + (struct wakeup_header *) __va(real_mode_header->wakeup_header); if (header->signature != WAKEUP_HEADER_SIGNATURE) { printk(KERN_ERR "wakeup header does not match\n"); diff --git a/arch/x86/kernel/realmode.c 
b/arch/x86/kernel/realmode.c index e7bf82a409bf..632c810ec8ea 100644 --- a/arch/x86/kernel/realmode.c +++ b/arch/x86/kernel/realmode.c @@ -5,8 +5,7 @@ #include #include -unsigned char *real_mode_base; -struct real_mode_header real_mode_header; +struct real_mode_header *real_mode_header; void __init setup_real_mode(void) { @@ -17,33 +16,32 @@ void __init setup_real_mode(void) u32 *ptr; u16 *seg; int i; + unsigned char *base; - struct real_mode_header *header = - (struct real_mode_header *) real_mode_blob; - - size_t size = PAGE_ALIGN(header->end); + size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); /* Has to be in very low memory so we can execute real-mode AP code. */ mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); if (!mem) panic("Cannot allocate trampoline\n"); - real_mode_base = __va(mem); + base = __va(mem); memblock_reserve(mem, size); + real_mode_header = (struct real_mode_header *) base; printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", - real_mode_base, (unsigned long long)mem, size); + base, (unsigned long long)mem, size); - memcpy(real_mode_base, real_mode_blob, size); + memcpy(base, real_mode_blob, size); - real_mode_seg = __pa(real_mode_base) >> 4; + real_mode_seg = __pa(base) >> 4; rel = (u32 *) real_mode_relocs; /* 16-bit segment relocations. */ count = rel[0]; rel = &rel[1]; for (i = 0; i < count; i++) { - seg = (u16 *) (real_mode_base + rel[i]); + seg = (u16 *) (base + rel[i]); *seg = real_mode_seg; } @@ -51,25 +49,21 @@ void __init setup_real_mode(void) count = rel[i]; rel = &rel[i + 1]; for (i = 0; i < count; i++) { - ptr = (u32 *) (real_mode_base + rel[i]); - *ptr += __pa(real_mode_base); + ptr = (u32 *) (base + rel[i]); + *ptr += __pa(base); } - /* Copied header will contain relocated physical addresses. 
*/ - memcpy(&real_mode_header, real_mode_base, - sizeof(struct real_mode_header)); - #ifdef CONFIG_X86_32 - *((u32 *)__va(real_mode_header.startup_32_smp)) = __pa(startup_32_smp); - *((u32 *)__va(real_mode_header.boot_gdt)) = __pa(boot_gdt); + *((u32 *)__va(real_mode_header->startup_32_smp)) = __pa(startup_32_smp); + *((u32 *)__va(real_mode_header->boot_gdt)) = __pa(boot_gdt); #else - *((u64 *) __va(real_mode_header.startup_64_smp)) = + *((u64 *) __va(real_mode_header->startup_64_smp)) = (u64)secondary_startup_64; - *((u64 *) __va(real_mode_header.level3_ident_pgt)) = + *((u64 *) __va(real_mode_header->level3_ident_pgt)) = __pa(level3_ident_pgt) + _KERNPG_TABLE; - *((u64 *) __va(real_mode_header.level3_kernel_pgt)) = + *((u64 *) __va(real_mode_header->level3_kernel_pgt)) = __pa(level3_kernel_pgt) + _KERNPG_TABLE; #endif } @@ -82,23 +76,22 @@ void __init setup_real_mode(void) */ static int __init set_real_mode_permissions(void) { - size_t all_size = - PAGE_ALIGN(real_mode_header.end) - - __pa(real_mode_base); + unsigned char *base = (unsigned char *) real_mode_header; + size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); size_t ro_size = - PAGE_ALIGN(real_mode_header.ro_end) - - __pa(real_mode_base); + PAGE_ALIGN(real_mode_header->ro_end) - + __pa(base); size_t text_size = - PAGE_ALIGN(real_mode_header.ro_end) - - real_mode_header.text_start; + PAGE_ALIGN(real_mode_header->ro_end) - + real_mode_header->text_start; unsigned long text_start = - (unsigned long) __va(real_mode_header.text_start); + (unsigned long) __va(real_mode_header->text_start); - set_memory_nx((unsigned long) real_mode_base, all_size >> PAGE_SHIFT); - set_memory_ro((unsigned long) real_mode_base, ro_size >> PAGE_SHIFT); + set_memory_nx((unsigned long) base, size >> PAGE_SHIFT); + set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT); set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT); return 0; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 
050eff29a4bb..658f856f09a3 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -336,7 +336,7 @@ core_initcall(reboot_init); void machine_real_restart(unsigned int type) { void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int)) - real_mode_header.machine_real_restart_asm; + real_mode_header->machine_real_restart_asm; local_irq_disable(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c7971ea74bd0..b8c0661e2341 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -665,9 +665,9 @@ static void __cpuinit announce_cpu(int cpu, int apicid) static int __cpuinit do_boot_cpu(int apicid, int cpu) { volatile u32 *trampoline_status = - (volatile u32 *) __va(real_mode_header.trampoline_status); + (volatile u32 *) __va(real_mode_header->trampoline_status); /* start_ip had better be page-aligned! */ - unsigned long start_ip = real_mode_header.trampoline_data; + unsigned long start_ip = real_mode_header->trampoline_data; unsigned long boot_error = 0; int timeout; diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index c136e2325062..65adda4fde93 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -202,7 +202,7 @@ static int tboot_setup_sleep(void) } tboot->acpi_sinfo.kernel_s3_resume_vector = - real_mode_header.wakeup_start; + real_mode_header->wakeup_start; return 0; } diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index a91ec8f6b15f..c83005c4d455 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -12,7 +12,6 @@ GLOBAL(real_mode_header) .long pa_text_start .long pa_ro_end - .long pa_end #ifdef CONFIG_X86_32 .long pa_machine_real_restart_asm #endif diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S index 4d4afcaf5f02..86b2e8d6b1f1 100644 --- a/arch/x86/realmode/rm/realmode.lds.S +++ b/arch/x86/realmode/rm/realmode.lds.S @@ -65,7 +65,6 @@ SECTIONS .signature : { *(.signature) } - pa_end 
= .; /DISCARD/ : { *(.note*) diff --git a/arch/x86/realmode/rmpiggy.S b/arch/x86/realmode/rmpiggy.S index fd72a99d12ae..204c6ece0e97 100644 --- a/arch/x86/realmode/rmpiggy.S +++ b/arch/x86/realmode/rmpiggy.S @@ -13,6 +13,8 @@ GLOBAL(real_mode_blob) .incbin "arch/x86/realmode/rm/realmode.bin" END(real_mode_blob) +GLOBAL(real_mode_blob_end); + GLOBAL(real_mode_relocs) .incbin "arch/x86/realmode/rm/realmode.relocs" END(real_mode_relocs) diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index e77aa4a1c9f6..06139005c4dd 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -93,7 +93,7 @@ static struct notifier_block tts_notifier = { static int acpi_sleep_prepare(u32 acpi_state) { #ifdef CONFIG_ACPI_SLEEP - unsigned long wakeup_pa = real_mode_header.wakeup_start; + unsigned long wakeup_pa = real_mode_header->wakeup_start; /* do we have a wakeup address for S2 and S3? */ if (acpi_state == ACPI_STATE_S3) { if (!wakeup_pa) -- cgit v1.2.3 From c4845474a01f699966272536e8416222e3f2d2cb Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:42 +0300 Subject: x86, realmode: flattened rm hierarchy Simplified the hierarchy under the rm directory to a flat directory, because having a separate directory for the wakeup code is no longer really justified. It only adds more complexity. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-20-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/acpi/sleep.c | 2 +- arch/x86/realmode/rm/Makefile | 20 ++-- arch/x86/realmode/rm/bioscall.S | 1 + arch/x86/realmode/rm/copy.S | 1 + arch/x86/realmode/rm/regs.c | 1 + arch/x86/realmode/rm/video-bios.c | 1 + arch/x86/realmode/rm/video-mode.c | 1 + arch/x86/realmode/rm/video-vesa.c | 1 + arch/x86/realmode/rm/video-vga.c | 1 + arch/x86/realmode/rm/wakemain.c | 82 ++++++++++++++ arch/x86/realmode/rm/wakeup.h | 41 +++++++ arch/x86/realmode/rm/wakeup/.gitignore | 3 - arch/x86/realmode/rm/wakeup/Makefile | 33 ------ arch/x86/realmode/rm/wakeup/bioscall.S | 1 - arch/x86/realmode/rm/wakeup/copy.S | 1 - arch/x86/realmode/rm/wakeup/regs.c | 1 - arch/x86/realmode/rm/wakeup/video-bios.c | 1 - arch/x86/realmode/rm/wakeup/video-mode.c | 1 - arch/x86/realmode/rm/wakeup/video-vesa.c | 1 - arch/x86/realmode/rm/wakeup/video-vga.c | 1 - arch/x86/realmode/rm/wakeup/wakemain.c | 82 -------------- arch/x86/realmode/rm/wakeup/wakeup.h | 41 ------- arch/x86/realmode/rm/wakeup/wakeup_asm.S | 181 ------------------------------- arch/x86/realmode/rm/wakeup_asm.S | 181 +++++++++++++++++++++++++++++++ 24 files changed, 325 insertions(+), 355 deletions(-) create mode 100644 arch/x86/realmode/rm/bioscall.S create mode 100644 arch/x86/realmode/rm/copy.S create mode 100644 arch/x86/realmode/rm/regs.c create mode 100644 arch/x86/realmode/rm/video-bios.c create mode 100644 arch/x86/realmode/rm/video-mode.c create mode 100644 arch/x86/realmode/rm/video-vesa.c create mode 100644 arch/x86/realmode/rm/video-vga.c create mode 100644 arch/x86/realmode/rm/wakemain.c create mode 100644 arch/x86/realmode/rm/wakeup.h delete mode 100644 arch/x86/realmode/rm/wakeup/.gitignore delete mode 100644 arch/x86/realmode/rm/wakeup/Makefile delete mode 100644 arch/x86/realmode/rm/wakeup/bioscall.S delete mode 100644 arch/x86/realmode/rm/wakeup/copy.S delete mode 100644 arch/x86/realmode/rm/wakeup/regs.c delete mode 100644 arch/x86/realmode/rm/wakeup/video-bios.c delete mode 100644 
arch/x86/realmode/rm/wakeup/video-mode.c delete mode 100644 arch/x86/realmode/rm/wakeup/video-vesa.c delete mode 100644 arch/x86/realmode/rm/wakeup/video-vga.c delete mode 100644 arch/x86/realmode/rm/wakeup/wakemain.c delete mode 100644 arch/x86/realmode/rm/wakeup/wakeup.h delete mode 100644 arch/x86/realmode/rm/wakeup/wakeup_asm.S create mode 100644 arch/x86/realmode/rm/wakeup_asm.S (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 6ca3f54ebe7d..95bf99de9058 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -16,7 +16,7 @@ #include #include -#include "../../realmode/rm/wakeup/wakeup.h" +#include "../../realmode/rm/wakeup.h" #include "sleep.h" unsigned long acpi_realmode_flags; diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index c2c27a41ab8f..fc8854b09dfa 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -7,21 +7,26 @@ # # -subdir- := wakeup - always := realmode.bin realmode-y += header.o realmode-y += trampoline_$(BITS).o realmode-y += stack.o realmode-$(CONFIG_X86_32) += reboot_32.o -realmode-$(CONFIG_ACPI_SLEEP) += wakeup/wakeup.o +realmode-$(CONFIG_ACPI_SLEEP) += $(wakeup-objs) + +wakeup-objs := wakeup_asm.o wakemain.o video-mode.o +wakeup-objs += copy.o bioscall.o regs.o +# The link order of the video-*.o modules can matter. In particular, +# video-vga.o *must* be listed first, followed by video-vesa.o. +# Hardware-specific drivers should follow in the order they should be +# probed, and video-bios.o should typically be last. 
+wakeup-objs += video-vga.o +wakeup-objs += video-vesa.o +wakeup-objs += video-bios.o targets += $(realmode-y) -$(obj)/wakeup/wakeup.o: FORCE - $(Q)$(MAKE) $(build)=$(obj)/wakeup $@ - REALMODE_OBJS = $(addprefix $(obj)/,$(realmode-y)) sed-pasyms := -n -r -e 's/^([0-9a-fA-F]+) [ABCDGRSTVW] (.+)$$/pa_\2 = \2;/p' @@ -55,7 +60,8 @@ $(obj)/realmode.relocs: $(obj)/realmode.elf FORCE # How to compile the 16-bit code. Note we always compile for -march=i386, # that way we can complain to the user if the CPU is insufficient. -KBUILD_CFLAGS := $(LINUXINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ \ +KBUILD_CFLAGS := $(LINUXINCLUDE) -m32 -g -Os -D_SETUP -D__KERNEL__ -D_WAKEUP \ + -I$(srctree)/arch/x86/boot \ -DDISABLE_BRANCH_PROFILING \ -Wall -Wstrict-prototypes \ -march=i386 -mregparm=3 \ diff --git a/arch/x86/realmode/rm/bioscall.S b/arch/x86/realmode/rm/bioscall.S new file mode 100644 index 000000000000..16162d197918 --- /dev/null +++ b/arch/x86/realmode/rm/bioscall.S @@ -0,0 +1 @@ +#include "../../boot/bioscall.S" diff --git a/arch/x86/realmode/rm/copy.S b/arch/x86/realmode/rm/copy.S new file mode 100644 index 000000000000..b785e6f38fdd --- /dev/null +++ b/arch/x86/realmode/rm/copy.S @@ -0,0 +1 @@ +#include "../../boot/copy.S" diff --git a/arch/x86/realmode/rm/regs.c b/arch/x86/realmode/rm/regs.c new file mode 100644 index 000000000000..fbb15b9f9ca9 --- /dev/null +++ b/arch/x86/realmode/rm/regs.c @@ -0,0 +1 @@ +#include "../../boot/regs.c" diff --git a/arch/x86/realmode/rm/video-bios.c b/arch/x86/realmode/rm/video-bios.c new file mode 100644 index 000000000000..848b25aaf11b --- /dev/null +++ b/arch/x86/realmode/rm/video-bios.c @@ -0,0 +1 @@ +#include "../../boot/video-bios.c" diff --git a/arch/x86/realmode/rm/video-mode.c b/arch/x86/realmode/rm/video-mode.c new file mode 100644 index 000000000000..2a98b7e2368b --- /dev/null +++ b/arch/x86/realmode/rm/video-mode.c @@ -0,0 +1 @@ +#include "../../boot/video-mode.c" diff --git a/arch/x86/realmode/rm/video-vesa.c 
b/arch/x86/realmode/rm/video-vesa.c new file mode 100644 index 000000000000..413edddb51e5 --- /dev/null +++ b/arch/x86/realmode/rm/video-vesa.c @@ -0,0 +1 @@ +#include "../../boot/video-vesa.c" diff --git a/arch/x86/realmode/rm/video-vga.c b/arch/x86/realmode/rm/video-vga.c new file mode 100644 index 000000000000..3085f5c9d288 --- /dev/null +++ b/arch/x86/realmode/rm/video-vga.c @@ -0,0 +1 @@ +#include "../../boot/video-vga.c" diff --git a/arch/x86/realmode/rm/wakemain.c b/arch/x86/realmode/rm/wakemain.c new file mode 100644 index 000000000000..91405d515ec6 --- /dev/null +++ b/arch/x86/realmode/rm/wakemain.c @@ -0,0 +1,82 @@ +#include "wakeup.h" +#include "boot.h" + +static void udelay(int loops) +{ + while (loops--) + io_delay(); /* Approximately 1 us */ +} + +static void beep(unsigned int hz) +{ + u8 enable; + + if (!hz) { + enable = 0x00; /* Turn off speaker */ + } else { + u16 div = 1193181/hz; + + outb(0xb6, 0x43); /* Ctr 2, squarewave, load, binary */ + io_delay(); + outb(div, 0x42); /* LSB of counter */ + io_delay(); + outb(div >> 8, 0x42); /* MSB of counter */ + io_delay(); + + enable = 0x03; /* Turn on speaker */ + } + inb(0x61); /* Dummy read of System Control Port B */ + io_delay(); + outb(enable, 0x61); /* Enable timer 2 output to speaker */ + io_delay(); +} + +#define DOT_HZ 880 +#define DASH_HZ 587 +#define US_PER_DOT 125000 + +/* Okay, this is totally silly, but it's kind of fun. 
*/ +static void send_morse(const char *pattern) +{ + char s; + + while ((s = *pattern++)) { + switch (s) { + case '.': + beep(DOT_HZ); + udelay(US_PER_DOT); + beep(0); + udelay(US_PER_DOT); + break; + case '-': + beep(DASH_HZ); + udelay(US_PER_DOT * 3); + beep(0); + udelay(US_PER_DOT); + break; + default: /* Assume it's a space */ + udelay(US_PER_DOT * 3); + break; + } + } +} + +void main(void) +{ + /* Kill machine if structures are wrong */ + if (wakeup_header.real_magic != 0x12345678) + while (1) + ; + + if (wakeup_header.realmode_flags & 4) + send_morse("...-"); + + if (wakeup_header.realmode_flags & 1) + asm volatile("lcallw $0xc000,$3"); + + if (wakeup_header.realmode_flags & 2) { + /* Need to call BIOS */ + probe_cards(0); + set_mode(wakeup_header.video_mode); + } +} diff --git a/arch/x86/realmode/rm/wakeup.h b/arch/x86/realmode/rm/wakeup.h new file mode 100644 index 000000000000..2dfaf06b8af1 --- /dev/null +++ b/arch/x86/realmode/rm/wakeup.h @@ -0,0 +1,41 @@ +/* + * Definitions for the wakeup data structure at the head of the + * wakeup code. 
+ */ + +#ifndef ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H +#define ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H + +#ifndef __ASSEMBLY__ +#include + +/* This must match data at wakeup.S */ +struct wakeup_header { + u16 video_mode; /* Video mode number */ + u32 pmode_entry; /* Protected mode resume point, 32-bit only */ + u16 pmode_cs; + u32 pmode_cr0; /* Protected mode cr0 */ + u32 pmode_cr3; /* Protected mode cr3 */ + u32 pmode_cr4; /* Protected mode cr4 */ + u32 pmode_efer_low; /* Protected mode EFER */ + u32 pmode_efer_high; + u64 pmode_gdt; + u32 pmode_misc_en_low; /* Protected mode MISC_ENABLE */ + u32 pmode_misc_en_high; + u32 pmode_behavior; /* Wakeup routine behavior flags */ + u32 realmode_flags; + u32 real_magic; + u32 signature; /* To check we have correct structure */ +} __attribute__((__packed__)); + +extern struct wakeup_header wakeup_header; +#endif + +#define WAKEUP_HEADER_OFFSET 8 +#define WAKEUP_HEADER_SIGNATURE 0x51ee1111 +#define WAKEUP_END_SIGNATURE 0x65a22c82 + +/* Wakeup behavior bits */ +#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0 + +#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ diff --git a/arch/x86/realmode/rm/wakeup/.gitignore b/arch/x86/realmode/rm/wakeup/.gitignore deleted file mode 100644 index 58f1f48a58f8..000000000000 --- a/arch/x86/realmode/rm/wakeup/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -wakeup.bin -wakeup.elf -wakeup.lds diff --git a/arch/x86/realmode/rm/wakeup/Makefile b/arch/x86/realmode/rm/wakeup/Makefile deleted file mode 100644 index 4c8533240cdd..000000000000 --- a/arch/x86/realmode/rm/wakeup/Makefile +++ /dev/null @@ -1,33 +0,0 @@ -# -# arch/x86/kernel/acpi/realmode/Makefile -# -# This file is subject to the terms and conditions of the GNU General Public -# License. See the file "COPYING" in the main directory of this archive -# for more details. -# - -always := wakeup.o - -wakeup-y += wakeup_asm.o wakemain.o video-mode.o -wakeup-y += copy.o bioscall.o regs.o - -# The link order of the video-*.o modules can matter. 
In particular, -# video-vga.o *must* be listed first, followed by video-vesa.o. -# Hardware-specific drivers should follow in the order they should be -# probed, and video-bios.o should typically be last. -wakeup-y += video-vga.o -wakeup-y += video-vesa.o -wakeup-y += video-bios.o - -targets += $(wakeup-y) - -WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y)) - -LDFLAGS_wakeup.o := -m elf_i386 -r -$(obj)/wakeup.o: $(WAKEUP_OBJS) FORCE - $(call if_changed,ld) - -bootsrc := $(src)/../../../boot - -ccflags-y += -D_WAKEUP -I$(srctree)/$(bootsrc) -asflags-y += -D_WAKEUP -I$(srctree)/$(bootsrc) diff --git a/arch/x86/realmode/rm/wakeup/bioscall.S b/arch/x86/realmode/rm/wakeup/bioscall.S deleted file mode 100644 index f51eb0bb56ce..000000000000 --- a/arch/x86/realmode/rm/wakeup/bioscall.S +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/bioscall.S" diff --git a/arch/x86/realmode/rm/wakeup/copy.S b/arch/x86/realmode/rm/wakeup/copy.S deleted file mode 100644 index dc59ebee69d8..000000000000 --- a/arch/x86/realmode/rm/wakeup/copy.S +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/copy.S" diff --git a/arch/x86/realmode/rm/wakeup/regs.c b/arch/x86/realmode/rm/wakeup/regs.c deleted file mode 100644 index 6206033ba202..000000000000 --- a/arch/x86/realmode/rm/wakeup/regs.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/regs.c" diff --git a/arch/x86/realmode/rm/wakeup/video-bios.c b/arch/x86/realmode/rm/wakeup/video-bios.c deleted file mode 100644 index 7deabc144a27..000000000000 --- a/arch/x86/realmode/rm/wakeup/video-bios.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/video-bios.c" diff --git a/arch/x86/realmode/rm/wakeup/video-mode.c b/arch/x86/realmode/rm/wakeup/video-mode.c deleted file mode 100644 index 328ad209f113..000000000000 --- a/arch/x86/realmode/rm/wakeup/video-mode.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/video-mode.c" diff --git a/arch/x86/realmode/rm/wakeup/video-vesa.c b/arch/x86/realmode/rm/wakeup/video-vesa.c deleted file mode 
100644 index 9dbb9672226a..000000000000 --- a/arch/x86/realmode/rm/wakeup/video-vesa.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/video-vesa.c" diff --git a/arch/x86/realmode/rm/wakeup/video-vga.c b/arch/x86/realmode/rm/wakeup/video-vga.c deleted file mode 100644 index bcc81255f374..000000000000 --- a/arch/x86/realmode/rm/wakeup/video-vga.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/video-vga.c" diff --git a/arch/x86/realmode/rm/wakeup/wakemain.c b/arch/x86/realmode/rm/wakeup/wakemain.c deleted file mode 100644 index 91405d515ec6..000000000000 --- a/arch/x86/realmode/rm/wakeup/wakemain.c +++ /dev/null @@ -1,82 +0,0 @@ -#include "wakeup.h" -#include "boot.h" - -static void udelay(int loops) -{ - while (loops--) - io_delay(); /* Approximately 1 us */ -} - -static void beep(unsigned int hz) -{ - u8 enable; - - if (!hz) { - enable = 0x00; /* Turn off speaker */ - } else { - u16 div = 1193181/hz; - - outb(0xb6, 0x43); /* Ctr 2, squarewave, load, binary */ - io_delay(); - outb(div, 0x42); /* LSB of counter */ - io_delay(); - outb(div >> 8, 0x42); /* MSB of counter */ - io_delay(); - - enable = 0x03; /* Turn on speaker */ - } - inb(0x61); /* Dummy read of System Control Port B */ - io_delay(); - outb(enable, 0x61); /* Enable timer 2 output to speaker */ - io_delay(); -} - -#define DOT_HZ 880 -#define DASH_HZ 587 -#define US_PER_DOT 125000 - -/* Okay, this is totally silly, but it's kind of fun. 
*/ -static void send_morse(const char *pattern) -{ - char s; - - while ((s = *pattern++)) { - switch (s) { - case '.': - beep(DOT_HZ); - udelay(US_PER_DOT); - beep(0); - udelay(US_PER_DOT); - break; - case '-': - beep(DASH_HZ); - udelay(US_PER_DOT * 3); - beep(0); - udelay(US_PER_DOT); - break; - default: /* Assume it's a space */ - udelay(US_PER_DOT * 3); - break; - } - } -} - -void main(void) -{ - /* Kill machine if structures are wrong */ - if (wakeup_header.real_magic != 0x12345678) - while (1) - ; - - if (wakeup_header.realmode_flags & 4) - send_morse("...-"); - - if (wakeup_header.realmode_flags & 1) - asm volatile("lcallw $0xc000,$3"); - - if (wakeup_header.realmode_flags & 2) { - /* Need to call BIOS */ - probe_cards(0); - set_mode(wakeup_header.video_mode); - } -} diff --git a/arch/x86/realmode/rm/wakeup/wakeup.h b/arch/x86/realmode/rm/wakeup/wakeup.h deleted file mode 100644 index 2dfaf06b8af1..000000000000 --- a/arch/x86/realmode/rm/wakeup/wakeup.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Definitions for the wakeup data structure at the head of the - * wakeup code. 
- */ - -#ifndef ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H -#define ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H - -#ifndef __ASSEMBLY__ -#include - -/* This must match data at wakeup.S */ -struct wakeup_header { - u16 video_mode; /* Video mode number */ - u32 pmode_entry; /* Protected mode resume point, 32-bit only */ - u16 pmode_cs; - u32 pmode_cr0; /* Protected mode cr0 */ - u32 pmode_cr3; /* Protected mode cr3 */ - u32 pmode_cr4; /* Protected mode cr4 */ - u32 pmode_efer_low; /* Protected mode EFER */ - u32 pmode_efer_high; - u64 pmode_gdt; - u32 pmode_misc_en_low; /* Protected mode MISC_ENABLE */ - u32 pmode_misc_en_high; - u32 pmode_behavior; /* Wakeup routine behavior flags */ - u32 realmode_flags; - u32 real_magic; - u32 signature; /* To check we have correct structure */ -} __attribute__((__packed__)); - -extern struct wakeup_header wakeup_header; -#endif - -#define WAKEUP_HEADER_OFFSET 8 -#define WAKEUP_HEADER_SIGNATURE 0x51ee1111 -#define WAKEUP_END_SIGNATURE 0x65a22c82 - -/* Wakeup behavior bits */ -#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0 - -#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ diff --git a/arch/x86/realmode/rm/wakeup/wakeup_asm.S b/arch/x86/realmode/rm/wakeup/wakeup_asm.S deleted file mode 100644 index f81c1cd99eaf..000000000000 --- a/arch/x86/realmode/rm/wakeup/wakeup_asm.S +++ /dev/null @@ -1,181 +0,0 @@ -/* - * ACPI wakeup real mode startup stub - */ -#include -#include -#include -#include -#include -#include -#include "../realmode.h" -#include "wakeup.h" - - .code16 - -/* This should match the structure in wakeup.h */ - .section ".data", "aw" - - .balign 16 -GLOBAL(wakeup_header) - video_mode: .short 0 /* Video mode number */ - pmode_entry: .long 0 - pmode_cs: .short __KERNEL_CS - pmode_cr0: .long 0 /* Saved %cr0 */ - pmode_cr3: .long 0 /* Saved %cr3 */ - pmode_cr4: .long 0 /* Saved %cr4 */ - pmode_efer: .quad 0 /* Saved EFER */ - pmode_gdt: .quad 0 - pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */ - pmode_behavior: .long 0 /* Wakeup behavior flags */ 
- realmode_flags: .long 0 - real_magic: .long 0 - signature: .long WAKEUP_HEADER_SIGNATURE -END(wakeup_header) - - .text - .code16 - - .balign 16 -ENTRY(wakeup_start) - cli - cld - - LJMPW_RM(3f) -3: - /* Apparently some dimwit BIOS programmers don't know how to - program a PM to RM transition, and we might end up here with - junk in the data segment descriptor registers. The only way - to repair that is to go into PM and fix it ourselves... */ - movw $16, %cx - lgdtl %cs:wakeup_gdt - movl %cr0, %eax - orb $X86_CR0_PE, %al - movl %eax, %cr0 - ljmpw $8, $2f -2: - movw %cx, %ds - movw %cx, %es - movw %cx, %ss - movw %cx, %fs - movw %cx, %gs - - andb $~X86_CR0_PE, %al - movl %eax, %cr0 - LJMPW_RM(3f) -3: - /* Set up segments */ - movw %cs, %ax - movw %ax, %ss - movl $rm_stack_end, %esp - movw %ax, %ds - movw %ax, %es - movw %ax, %fs - movw %ax, %gs - - lidtl wakeup_idt - - /* Clear the EFLAGS */ - pushl $0 - popfl - - /* Check header signature... */ - movl signature, %eax - cmpl $WAKEUP_HEADER_SIGNATURE, %eax - jne bogus_real_magic - - /* Check we really have everything... */ - movl end_signature, %eax - cmpl $WAKEUP_END_SIGNATURE, %eax - jne bogus_real_magic - - /* Call the C code */ - calll main - - /* Restore MISC_ENABLE before entering protected mode, in case - BIOS decided to clear XD_DISABLE during S3. */ - movl pmode_behavior, %eax - btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax - jnc 1f - - movl pmode_misc_en, %eax - movl pmode_misc_en + 4, %edx - movl $MSR_IA32_MISC_ENABLE, %ecx - wrmsr -1: - - /* Do any other stuff... */ - -#ifndef CONFIG_64BIT - /* This could also be done in C code... */ - movl pmode_cr3, %eax - movl %eax, %cr3 - - movl pmode_cr4, %ecx - jecxz 1f - movl %ecx, %cr4 -1: - movl pmode_efer, %eax - movl pmode_efer + 4, %edx - movl %eax, %ecx - orl %edx, %ecx - jz 1f - movl $MSR_EFER, %ecx - wrmsr -1: - - lgdtl pmode_gdt - - /* This really couldn't... 
*/ - movl pmode_entry, %eax - movl pmode_cr0, %ecx - movl %ecx, %cr0 - ljmpl $__KERNEL_CS, $pa_startup_32 - /* -> jmp *%eax in trampoline_32.S */ -#else - jmp trampoline_data -#endif - -bogus_real_magic: -1: - hlt - jmp 1b - - .section ".rodata","a" - - /* - * Set up the wakeup GDT. We set these up as Big Real Mode, - * that is, with limits set to 4 GB. At least the Lenovo - * Thinkpad X61 is known to need this for the video BIOS - * initialization quirk to work; this is likely to also - * be the case for other laptops or integrated video devices. - */ - - .balign 16 -GLOBAL(wakeup_gdt) - .word 3*8-1 /* Self-descriptor */ - .long pa_wakeup_gdt - .word 0 - - .word 0xffff /* 16-bit code segment @ real_mode_base */ - .long 0x9b000000 + pa_real_mode_base - .word 0x008f /* big real mode */ - - .word 0xffff /* 16-bit data segment @ real_mode_base */ - .long 0x93000000 + pa_real_mode_base - .word 0x008f /* big real mode */ -END(wakeup_gdt) - - .section ".rodata","a" - .balign 8 - - /* This is the standard real-mode IDT */ - .balign 16 -GLOBAL(wakeup_idt) - .word 0xffff /* limit */ - .long 0 /* address */ - .word 0 -END(wakeup_idt) - - .section ".signature","a" -end_signature: - .long WAKEUP_END_SIGNATURE diff --git a/arch/x86/realmode/rm/wakeup_asm.S b/arch/x86/realmode/rm/wakeup_asm.S new file mode 100644 index 000000000000..8a57c5a05fbc --- /dev/null +++ b/arch/x86/realmode/rm/wakeup_asm.S @@ -0,0 +1,181 @@ +/* + * ACPI wakeup real mode startup stub + */ +#include +#include +#include +#include +#include +#include +#include "realmode.h" +#include "wakeup.h" + + .code16 + +/* This should match the structure in wakeup.h */ + .section ".data", "aw" + + .balign 16 +GLOBAL(wakeup_header) + video_mode: .short 0 /* Video mode number */ + pmode_entry: .long 0 + pmode_cs: .short __KERNEL_CS + pmode_cr0: .long 0 /* Saved %cr0 */ + pmode_cr3: .long 0 /* Saved %cr3 */ + pmode_cr4: .long 0 /* Saved %cr4 */ + pmode_efer: .quad 0 /* Saved EFER */ + pmode_gdt: .quad 0 + pmode_misc_en: 
.quad 0 /* Saved MISC_ENABLE MSR */ + pmode_behavior: .long 0 /* Wakeup behavior flags */ + realmode_flags: .long 0 + real_magic: .long 0 + signature: .long WAKEUP_HEADER_SIGNATURE +END(wakeup_header) + + .text + .code16 + + .balign 16 +ENTRY(wakeup_start) + cli + cld + + LJMPW_RM(3f) +3: + /* Apparently some dimwit BIOS programmers don't know how to + program a PM to RM transition, and we might end up here with + junk in the data segment descriptor registers. The only way + to repair that is to go into PM and fix it ourselves... */ + movw $16, %cx + lgdtl %cs:wakeup_gdt + movl %cr0, %eax + orb $X86_CR0_PE, %al + movl %eax, %cr0 + ljmpw $8, $2f +2: + movw %cx, %ds + movw %cx, %es + movw %cx, %ss + movw %cx, %fs + movw %cx, %gs + + andb $~X86_CR0_PE, %al + movl %eax, %cr0 + LJMPW_RM(3f) +3: + /* Set up segments */ + movw %cs, %ax + movw %ax, %ss + movl $rm_stack_end, %esp + movw %ax, %ds + movw %ax, %es + movw %ax, %fs + movw %ax, %gs + + lidtl wakeup_idt + + /* Clear the EFLAGS */ + pushl $0 + popfl + + /* Check header signature... */ + movl signature, %eax + cmpl $WAKEUP_HEADER_SIGNATURE, %eax + jne bogus_real_magic + + /* Check we really have everything... */ + movl end_signature, %eax + cmpl $WAKEUP_END_SIGNATURE, %eax + jne bogus_real_magic + + /* Call the C code */ + calll main + + /* Restore MISC_ENABLE before entering protected mode, in case + BIOS decided to clear XD_DISABLE during S3. */ + movl pmode_behavior, %eax + btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax + jnc 1f + + movl pmode_misc_en, %eax + movl pmode_misc_en + 4, %edx + movl $MSR_IA32_MISC_ENABLE, %ecx + wrmsr +1: + + /* Do any other stuff... */ + +#ifndef CONFIG_64BIT + /* This could also be done in C code... */ + movl pmode_cr3, %eax + movl %eax, %cr3 + + movl pmode_cr4, %ecx + jecxz 1f + movl %ecx, %cr4 +1: + movl pmode_efer, %eax + movl pmode_efer + 4, %edx + movl %eax, %ecx + orl %edx, %ecx + jz 1f + movl $MSR_EFER, %ecx + wrmsr +1: + + lgdtl pmode_gdt + + /* This really couldn't... 
*/ + movl pmode_entry, %eax + movl pmode_cr0, %ecx + movl %ecx, %cr0 + ljmpl $__KERNEL_CS, $pa_startup_32 + /* -> jmp *%eax in trampoline_32.S */ +#else + jmp trampoline_data +#endif + +bogus_real_magic: +1: + hlt + jmp 1b + + .section ".rodata","a" + + /* + * Set up the wakeup GDT. We set these up as Big Real Mode, + * that is, with limits set to 4 GB. At least the Lenovo + * Thinkpad X61 is known to need this for the video BIOS + * initialization quirk to work; this is likely to also + * be the case for other laptops or integrated video devices. + */ + + .balign 16 +GLOBAL(wakeup_gdt) + .word 3*8-1 /* Self-descriptor */ + .long pa_wakeup_gdt + .word 0 + + .word 0xffff /* 16-bit code segment @ real_mode_base */ + .long 0x9b000000 + pa_real_mode_base + .word 0x008f /* big real mode */ + + .word 0xffff /* 16-bit data segment @ real_mode_base */ + .long 0x93000000 + pa_real_mode_base + .word 0x008f /* big real mode */ +END(wakeup_gdt) + + .section ".rodata","a" + .balign 8 + + /* This is the standard real-mode IDT */ + .balign 16 +GLOBAL(wakeup_idt) + .word 0xffff /* limit */ + .long 0 /* address */ + .word 0 +END(wakeup_idt) + + .section ".signature","a" +end_signature: + .long WAKEUP_END_SIGNATURE -- cgit v1.2.3 From f37240f16bec91f15ce564515f70a6ca9715ce96 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:43 +0300 Subject: x86, realmode: header for trampoline code Added header for trampoline code that can be used to supply input data to it. This makes interface between real mode code and kernel cleaner and simpler. Replaced two confusing pointers to level4 pgt in trampoline_64.S with a single pointer to the beginning of the page table. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-21-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/realmode.h | 32 +++++++++++++++++----------- arch/x86/kernel/realmode.c | 27 +++++++++++++----------- arch/x86/kernel/smpboot.c | 2 +- arch/x86/realmode/rm/header.S | 35 ++++++++++++++----------------- arch/x86/realmode/rm/trampoline_32.S | 36 ++++++-------------------------- arch/x86/realmode/rm/trampoline_64.S | 18 +++++----------- arch/x86/realmode/rm/trampoline_common.S | 23 ++++++++++++++++++++ arch/x86/realmode/rm/wakeup_asm.S | 2 +- 8 files changed, 87 insertions(+), 88 deletions(-) create mode 100644 arch/x86/realmode/rm/trampoline_common.S (limited to 'arch/x86') diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index d3ae49f4c3ef..1421eed1c8e8 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -8,24 +8,32 @@ struct real_mode_header { u32 text_start; u32 ro_end; - /* reboot */ -#ifdef CONFIG_X86_32 - u32 machine_real_restart_asm; -#endif /* SMP trampoline */ - u32 trampoline_data; + u32 trampoline_start; u32 trampoline_status; -#ifdef CONFIG_X86_32 - u32 startup_32_smp; - u32 boot_gdt; -#else - u32 startup_64_smp; - u32 level3_ident_pgt; - u32 level3_kernel_pgt; + u32 trampoline_header; +#ifdef CONFIG_X86_64 + u32 trampoline_pgd; #endif + /* ACPI S3 wakeup */ #ifdef CONFIG_ACPI_SLEEP u32 wakeup_start; u32 wakeup_header; +#endif + /* APM/BIOS reboot */ +#ifdef CONFIG_X86_32 + u32 machine_real_restart_asm; +#endif +} __attribute__((__packed__)); + +/* This must match data at trampoline_32/64.S */ +struct trampoline_header { +#ifdef CONFIG_X86_32 + u32 start; + u16 gdt_limit; + u32 gdt_base; +#else + u64 start; #endif } __attribute__((__packed__)); diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c index 632c810ec8ea..712fba8fd774 100644 --- a/arch/x86/kernel/realmode.c +++ b/arch/x86/kernel/realmode.c @@ -17,8 +17,11 @@ void __init setup_real_mode(void) u16 *seg; int i; unsigned char *base; - + struct trampoline_header *trampoline_header; 
size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); +#ifdef CONFIG_X86_64 + u64 *trampoline_pgd; +#endif /* Has to be in very low memory so we can execute real-mode AP code. */ mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); @@ -28,7 +31,6 @@ void __init setup_real_mode(void) base = __va(mem); memblock_reserve(mem, size); real_mode_header = (struct real_mode_header *) base; - printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", base, (unsigned long long)mem, size); @@ -53,18 +55,19 @@ void __init setup_real_mode(void) *ptr += __pa(base); } + /* Must be perfomed *after* relocation. */ + trampoline_header = (struct trampoline_header *) + __va(real_mode_header->trampoline_header); + #ifdef CONFIG_X86_32 - *((u32 *)__va(real_mode_header->startup_32_smp)) = __pa(startup_32_smp); - *((u32 *)__va(real_mode_header->boot_gdt)) = __pa(boot_gdt); + trampoline_header->start = __pa(startup_32_smp); + trampoline_header->gdt_limit = __BOOT_DS + 7; + trampoline_header->gdt_base = __pa(boot_gdt); #else - *((u64 *) __va(real_mode_header->startup_64_smp)) = - (u64)secondary_startup_64; - - *((u64 *) __va(real_mode_header->level3_ident_pgt)) = - __pa(level3_ident_pgt) + _KERNPG_TABLE; - - *((u64 *) __va(real_mode_header->level3_kernel_pgt)) = - __pa(level3_kernel_pgt) + _KERNPG_TABLE; + trampoline_header->start = (u64) secondary_startup_64; + trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); + trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE; + trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE; #endif } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b8c0661e2341..757c4b1d0a02 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -667,7 +667,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) volatile u32 *trampoline_status = (volatile u32 *) __va(real_mode_header->trampoline_status); /* start_ip had better be page-aligned! 
*/ - unsigned long start_ip = real_mode_header->trampoline_data; + unsigned long start_ip = real_mode_header->trampoline_start; unsigned long boot_error = 0; int timeout; diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index c83005c4d455..b4c32632bf16 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -7,28 +7,25 @@ #include #include - .section ".header", "a" + .section ".header", "a" GLOBAL(real_mode_header) - .long pa_text_start - .long pa_ro_end -#ifdef CONFIG_X86_32 - .long pa_machine_real_restart_asm -#endif - /* SMP trampoline */ - .long pa_trampoline_data - .long pa_trampoline_status -#ifdef CONFIG_X86_32 - .long pa_startup_32_smp - .long pa_boot_gdt -#else - .long pa_startup_64_smp - .long pa_level3_ident_pgt - .long pa_level3_kernel_pgt + .long pa_text_start + .long pa_ro_end + /* SMP trampoline */ + .long pa_trampoline_start + .long pa_trampoline_status + .long pa_trampoline_header +#ifdef CONFIG_X86_64 + .long pa_trampoline_pgd; #endif - /* ACPI sleep */ + /* ACPI S3 wakeup */ #ifdef CONFIG_ACPI_SLEEP - .long pa_wakeup_start - .long pa_wakeup_header + .long pa_wakeup_start + .long pa_wakeup_header +#endif + /* APM/BIOS reboot */ +#ifdef CONFIG_X86_32 + .long pa_machine_real_restart_asm #endif END(real_mode_header) diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S index 1ecdbb59191b..6fc064b4d2b9 100644 --- a/arch/x86/realmode/rm/trampoline_32.S +++ b/arch/x86/realmode/rm/trampoline_32.S @@ -13,16 +13,10 @@ * * We jump into arch/x86/kernel/head_32.S. * - * On entry to trampoline_data, the processor is in real mode + * On entry to trampoline_start, the processor is in real mode * with 16-bit addressing and 16-bit data. CS has some value * and IP is zero. Thus, we load CS to the physical segment * of the real mode code before doing anything further. 
- * - * The structure real_mode_header includes entries that need - * to be set up before executing this code: - * - * startup_32_smp - * boot_gdt */ #include @@ -35,7 +29,7 @@ .code16 .balign PAGE_SIZE -ENTRY(trampoline_data) +ENTRY(trampoline_start) wbinvd # Needed for NUMA-Q should be harmless for others LJMPW_RM(1f) @@ -45,7 +39,7 @@ ENTRY(trampoline_data) cli # We should be safe anyway - movl startup_32_smp, %eax # where we need to go + movl tr_start, %eax # where we need to go movl $0xA5A5A5A5, trampoline_status # write marker for master knows we're running @@ -56,8 +50,8 @@ ENTRY(trampoline_data) * operand size is 16bit. Use lgdtl instead to force operand size * to 32 bit. */ - lidtl boot_idt_descr # load idt with 0, 0 - lgdtl boot_gdt_descr # load gdt with whatever is appropriate + lidtl tr_idt # load idt with 0, 0 + lgdtl tr_gdt # load gdt with whatever is appropriate movw $1, %dx # protected mode (PE) bit lmsw %dx # into protected mode @@ -69,22 +63,4 @@ ENTRY(trampoline_data) ENTRY(startup_32) # note: also used from wakeup_asm.S jmp *%eax - .section ".rodata","a" - - .balign 4 -boot_idt_descr: - .word 0 # idt limit = 0 - .long 0 # idt base = 0L - - .data - -boot_gdt_descr: - .word __BOOT_DS + 7 # gdt limit -GLOBAL(boot_gdt) - .long 0 # gdt base - - .bss - - .balign 4 -GLOBAL(trampoline_status) .space 4 -GLOBAL(startup_32_smp) .space 4 +#include "trampoline_common.S" diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index f71ea0800d3d..3f7293239365 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -10,7 +10,7 @@ * trampoline page to make our stack and everything else * is a mystery. * - * On entry to trampoline_data, the processor is in real mode + * On entry to trampoline_start, the processor is in real mode * with 16-bit addressing and 16-bit data. CS has some value * and IP is zero. 
Thus, data addresses need to be absolute * (no relocation) and are taken with regard to r_base. @@ -37,7 +37,7 @@ .balign PAGE_SIZE .code16 -ENTRY(trampoline_data) +ENTRY(trampoline_start) cli # We should be safe anyway wbinvd @@ -97,7 +97,7 @@ ENTRY(startup_32) movl %eax, %cr4 # Enable PAE mode # Setup trampoline 4 level pagetables - movl $pa_level3_ident_pgt, %eax + movl $pa_trampoline_pgd, %eax movl %eax, %cr3 movl $MSR_EFER, %ecx @@ -122,7 +122,7 @@ ENTRY(startup_32) .balign 4 ENTRY(startup_64) # Now jump into the kernel using virtual addresses - jmpq *startup_64_smp(%rip) + jmpq *tr_start(%rip) .section ".rodata","a" .balign 16 @@ -143,12 +143,4 @@ tgdt: .quad 0x00cf93000000ffff # __KERNEL_DS tgdt_end: - .bss - - .balign PAGE_SIZE -GLOBAL(level3_ident_pgt) .space 511*8 -GLOBAL(level3_kernel_pgt) .space 8 - - .balign 8 -GLOBAL(startup_64_smp) .space 8 -GLOBAL(trampoline_status) .space 4 +#include "trampoline_common.S" diff --git a/arch/x86/realmode/rm/trampoline_common.S b/arch/x86/realmode/rm/trampoline_common.S new file mode 100644 index 000000000000..c3f951c468c5 --- /dev/null +++ b/arch/x86/realmode/rm/trampoline_common.S @@ -0,0 +1,23 @@ + .section ".rodata","a" + + .balign 4 +tr_idt: .fill 1, 6, 0 + + .bss + + .balign 4 +GLOBAL(trampoline_status) .space 4 + +GLOBAL(trampoline_header) +#ifdef CONFIG_X86_32 + tr_start: .space 4 + tr_gdt: .space 6 +#else + tr_start: .space 8 +#endif +END(trampoline_header) + +#ifdef CONFIG_X86_64 + .balign PAGE_SIZE +GLOBAL(trampoline_pgd) .space PAGE_SIZE +#endif diff --git a/arch/x86/realmode/rm/wakeup_asm.S b/arch/x86/realmode/rm/wakeup_asm.S index 8a57c5a05fbc..46108f05e04e 100644 --- a/arch/x86/realmode/rm/wakeup_asm.S +++ b/arch/x86/realmode/rm/wakeup_asm.S @@ -132,7 +132,7 @@ ENTRY(wakeup_start) ljmpl $__KERNEL_CS, $pa_startup_32 /* -> jmp *%eax in trampoline_32.S */ #else - jmp trampoline_data + jmp trampoline_start #endif bogus_real_magic: -- cgit v1.2.3 From b2d0b7a061bfddd27155c7dcd53f365d9dc0c7c3 Mon Sep 17 
00:00:00 2001 From: Joshua Cov Date: Fri, 13 Apr 2012 21:08:26 +0200 Subject: keyboard: Use BIOS Keyboard variable to set Numlock The PC BIOS does provide a NUMLOCK flag containing the desired state of this LED. This patch sets the current state according to the data in the bios. [ hpa: fixed __weak declaration without definition, changed "inline" to "static inline" ] Signed-Off-By: Joshua Cov Link: http://lkml.kernel.org/r/CAKL7Q7rvq87TNS1T_Km8fW_5OzS%2BSbYazLXKxW-6ztOxo3zorg@mail.gmail.com Acked-by: Alan Cox Signed-off-by: H. Peter Anvin --- arch/parisc/include/asm/kbdleds.h | 19 +++++++++++++++++++ arch/x86/boot/main.c | 18 ++++++++++++------ arch/x86/include/asm/bootparam.h | 3 ++- arch/x86/include/asm/kbdleds.h | 17 +++++++++++++++++ drivers/tty/vt/keyboard.c | 20 ++++++++------------ 5 files changed, 58 insertions(+), 19 deletions(-) create mode 100644 arch/parisc/include/asm/kbdleds.h create mode 100644 arch/x86/include/asm/kbdleds.h (limited to 'arch/x86') diff --git a/arch/parisc/include/asm/kbdleds.h b/arch/parisc/include/asm/kbdleds.h new file mode 100644 index 000000000000..2e2e75a83c28 --- /dev/null +++ b/arch/parisc/include/asm/kbdleds.h @@ -0,0 +1,19 @@ +#ifndef _ASM_PARISC_KBDLEDS_H +#define _ASM_PARISC_KBDLEDS_H + +/* + * On HIL keyboards of PARISC machines there is no NumLock key and + * everyone expects the keypad to be used for numbers. That's why + * we can safely turn on the NUMLOCK bit. + */ + +static inline int kbd_defleds(void) +{ +#if defined(CONFIG_KEYBOARD_HIL) || defined(CONFIG_KEYBOARD_HIL_OLD) + return 1 << VC_NUMLOCK; +#else + return 0; +#endif +} + +#endif /* _ASM_PARISC_KBDLEDS_H */ diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 40358c8905be..cf6083d444f4 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -57,14 +57,20 @@ static void copy_boot_params(void) } /* - * Set the keyboard repeat rate to maximum. 
Unclear why this + * Query the keyboard lock status as given by the BIOS, and + * set the keyboard repeat rate to maximum. Unclear why the latter * is done here; this might be possible to kill off as stale code. */ -static void keyboard_set_repeat(void) +static void keyboard_init(void) { - struct biosregs ireg; + struct biosregs ireg, oreg; initregs(&ireg); - ireg.ax = 0x0305; + + ireg.ah = 0x02; /* Get keyboard status */ + intcall(0x16, &ireg, &oreg); + boot_params.kbd_status = oreg.al; + + ireg.ax = 0x0305; /* Set keyboard repeat rate */ intcall(0x16, &ireg, NULL); } @@ -151,8 +157,8 @@ void main(void) /* Detect memory layout */ detect_memory(); - /* Set keyboard repeat rate (why?) */ - keyboard_set_repeat(); + /* Set keyboard repeat rate (why?) and query the lock flags */ + keyboard_init(); /* Query MCA information */ query_mca(); diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 2f90c51cc49d..eb45aa6b1f27 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -112,7 +112,8 @@ struct boot_params { __u8 e820_entries; /* 0x1e8 */ __u8 eddbuf_entries; /* 0x1e9 */ __u8 edd_mbr_sig_buf_entries; /* 0x1ea */ - __u8 _pad6[6]; /* 0x1eb */ + __u8 kbd_status; /* 0x1eb */ + __u8 _pad6[5]; /* 0x1ec */ struct setup_header hdr; /* setup header */ /* 0x1f1 */ __u8 _pad7[0x290-0x1f1-sizeof(struct setup_header)]; __u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX]; /* 0x290 */ diff --git a/arch/x86/include/asm/kbdleds.h b/arch/x86/include/asm/kbdleds.h new file mode 100644 index 000000000000..f27ac5ff597d --- /dev/null +++ b/arch/x86/include/asm/kbdleds.h @@ -0,0 +1,17 @@ +#ifndef _ASM_X86_KBDLEDS_H +#define _ASM_X86_KBDLEDS_H + +/* + * Some laptops take the 789uiojklm,. keys as number pad when NumLock is on. + * This seems a good reason to start with NumLock off. That's why on X86 we + * ask the bios for the correct state. 
+ */ + +#include + +static inline int kbd_defleds(void) +{ + return boot_params.kbd_status & 0x20 ? (1 << VC_NUMLOCK) : 0; +} + +#endif /* _ASM_X86_KBDLEDS_H */ diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c index 86dd1e302bb3..b021a1817666 100644 --- a/drivers/tty/vt/keyboard.c +++ b/drivers/tty/vt/keyboard.c @@ -53,17 +53,13 @@ extern void ctrl_alt_del(void); #define KBD_DEFMODE ((1 << VC_REPEAT) | (1 << VC_META)) -/* - * Some laptops take the 789uiojklm,. keys as number pad when NumLock is on. - * This seems a good reason to start with NumLock off. On HIL keyboards - * of PARISC machines however there is no NumLock key and everyone expects the - * keypad to be used for numbers. - */ - -#if defined(CONFIG_PARISC) && (defined(CONFIG_KEYBOARD_HIL) || defined(CONFIG_KEYBOARD_HIL_OLD)) -#define KBD_DEFLEDS (1 << VC_NUMLOCK) +#if defined(CONFIG_X86) || defined(CONFIG_PARISC) +#include #else -#define KBD_DEFLEDS 0 +static inline int kbd_defleds(void) +{ + return 0; +} #endif #define KBD_DEFLOCK 0 @@ -1512,8 +1508,8 @@ int __init kbd_init(void) int error; for (i = 0; i < MAX_NR_CONSOLES; i++) { - kbd_table[i].ledflagstate = KBD_DEFLEDS; - kbd_table[i].default_ledflagstate = KBD_DEFLEDS; + kbd_table[i].ledflagstate = kbd_defleds(); + kbd_table[i].default_ledflagstate = kbd_defleds(); kbd_table[i].ledmode = LED_SHOW_FLAGS; kbd_table[i].lockstate = KBD_DEFLOCK; kbd_table[i].slockstate = 0; -- cgit v1.2.3 From f2604c141a00c00b92b7fd2f9d2455517fdd6c15 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:44 +0300 Subject: x86, realmode: move relocs from scripts/ to arch/x86/tools Moved relocs tool from scripts/ to arch/x86/tools because it is architecture specific script. Added new target archscripts that can be used to build scripts needed building an architecture. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-22-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: H. 
Peter Anvin Cc: Sam Ravnborg Cc: Michal Marek --- Makefile | 9 +- arch/x86/Makefile | 3 + arch/x86/boot/compressed/Makefile | 4 +- arch/x86/realmode/rm/Makefile | 2 +- arch/x86/tools/.gitignore | 1 + arch/x86/tools/Makefile | 4 + arch/x86/tools/relocs.c | 804 ++++++++++++++++++++++++++++++++++++++ scripts/Makefile | 1 - scripts/x86-relocs.c | 804 -------------------------------------- 9 files changed, 821 insertions(+), 811 deletions(-) create mode 100644 arch/x86/tools/.gitignore create mode 100644 arch/x86/tools/relocs.c delete mode 100644 scripts/x86-relocs.c (limited to 'arch/x86') diff --git a/Makefile b/Makefile index 9e384ae6c403..ed1bd29dc03c 100644 --- a/Makefile +++ b/Makefile @@ -442,7 +442,7 @@ asm-generic: no-dot-config-targets := clean mrproper distclean \ cscope gtags TAGS tags help %docs check% coccicheck \ - include/linux/version.h headers_% archheaders \ + include/linux/version.h headers_% archheaders archscripts \ kernelversion %src-pkg config-targets := 0 @@ -979,7 +979,7 @@ prepare1: prepare2 include/linux/version.h include/generated/utsrelease.h \ include/config/auto.conf $(cmd_crmodverdir) -archprepare: archheaders prepare1 scripts_basic +archprepare: archheaders archscripts prepare1 scripts_basic prepare0: archprepare FORCE $(Q)$(MAKE) $(build)=. 
@@ -1049,8 +1049,11 @@ hdr-dst = $(if $(KBUILD_HEADERS), dst=include/asm-$(hdr-arch), dst=include/asm) PHONY += archheaders archheaders: +PHONY += archscripts +archscripts: + PHONY += __headers -__headers: include/linux/version.h scripts_basic asm-generic archheaders FORCE +__headers: include/linux/version.h scripts_basic asm-generic archheaders archscripts FORCE $(Q)$(MAKE) $(build)=scripts build_unifdef PHONY += headers_install_all diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 41a7237606a3..94e91e401da9 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -134,6 +134,9 @@ KBUILD_CFLAGS += $(call cc-option,-mno-avx,) KBUILD_CFLAGS += $(mflags-y) KBUILD_AFLAGS += $(mflags-y) +archscripts: + $(Q)$(MAKE) $(build)=arch/x86/tools relocs + ### # Syscall table generation diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 0435e8a2d20e..e398bb5d63bb 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -42,8 +42,8 @@ $(obj)/vmlinux.bin: vmlinux FORCE targets += vmlinux.bin.all vmlinux.relocs -CMD_RELOCS = scripts/x86-relocs -quiet_cmd_relocs = RELOCS $@ +CMD_RELOCS = arch/x86/tools/relocs +quiet_cmd_relocs = RELOCS $@ cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $< $(obj)/vmlinux.relocs: vmlinux FORCE $(call if_changed,relocs) diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index fc8854b09dfa..de40bc44b92f 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -52,7 +52,7 @@ $(obj)/realmode.bin: $(obj)/realmode.elf $(call if_changed,objcopy) quiet_cmd_relocs = RELOCS $@ - cmd_relocs = scripts/x86-relocs --realmode $< > $@ + cmd_relocs = arch/x86/tools/relocs --realmode $< > $@ $(obj)/realmode.relocs: $(obj)/realmode.elf FORCE $(call if_changed,relocs) diff --git a/arch/x86/tools/.gitignore b/arch/x86/tools/.gitignore new file mode 100644 index 000000000000..be0ed065249b --- /dev/null +++ b/arch/x86/tools/.gitignore 
@@ -0,0 +1 @@ +relocs diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index d511aa97533a..733057b435b0 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -36,3 +36,7 @@ HOSTCFLAGS_insn_sanity.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x $(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c $(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c + +HOST_EXTRACFLAGS += -I$(srctree)/tools/include +hostprogs-y += relocs +relocs: $(obj)/relocs diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c new file mode 100644 index 000000000000..74e16bb15dc4 --- /dev/null +++ b/arch/x86/tools/relocs.c @@ -0,0 +1,804 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define USE_BSD +#include +#include +#include + +static void die(char *fmt, ...); + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +static Elf32_Ehdr ehdr; +static unsigned long reloc_count, reloc_idx; +static unsigned long *relocs; +static unsigned long reloc16_count, reloc16_idx; +static unsigned long *relocs16; + +struct section { + Elf32_Shdr shdr; + struct section *link; + Elf32_Sym *symtab; + Elf32_Rel *reltab; + char *strtab; +}; +static struct section *secs; + +enum symtype { + S_ABS, + S_REL, + S_SEG, + S_LIN, + S_NSYMTYPES +}; + +static const char * const sym_regex_kernel[S_NSYMTYPES] = { +/* + * Following symbols have been audited. There values are constant and do + * not change if bzImage is loaded at a different physical address than + * the address for which it has been compiled. 
Don't warn user about + * absolute relocations present w.r.t these symbols. + */ + [S_ABS] = + "^(xen_irq_disable_direct_reloc$|" + "xen_save_fl_direct_reloc$|" + "VDSO|" + "__crc_)", + +/* + * These symbols are known to be relative, even if the linker marks them + * as absolute (typically defined outside any section in the linker script.) + */ + [S_REL] = + "^_end$", +}; + + +static const char * const sym_regex_realmode[S_NSYMTYPES] = { +/* + * These symbols are known to be relative, even if the linker marks them + * as absolute (typically defined outside any section in the linker script.) + */ + [S_REL] = + "^pa_", + +/* + * These are 16-bit segment symbols when compiling 16-bit code. + */ + [S_SEG] = + "^real_mode_seg$", + +/* + * These are offsets belonging to segments, as opposed to linear addresses, + * when compiling 16-bit code. + */ + [S_LIN] = + "^pa_", +}; + +static const char * const *sym_regex; + +static regex_t sym_regex_c[S_NSYMTYPES]; +static int is_reloc(enum symtype type, const char *sym_name) +{ + return sym_regex[type] && + !regexec(&sym_regex_c[type], sym_name, 0, NULL, 0); +} + +static void regex_init(int use_real_mode) +{ + char errbuf[128]; + int err; + int i; + + if (use_real_mode) + sym_regex = sym_regex_realmode; + else + sym_regex = sym_regex_kernel; + + for (i = 0; i < S_NSYMTYPES; i++) { + if (!sym_regex[i]) + continue; + + err = regcomp(&sym_regex_c[i], sym_regex[i], + REG_EXTENDED|REG_NOSUB); + + if (err) { + regerror(err, &sym_regex_c[i], errbuf, sizeof errbuf); + die("%s", errbuf); + } + } +} + +static void die(char *fmt, ...) 
+{ + va_list ap; + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + exit(1); +} + +static const char *sym_type(unsigned type) +{ + static const char *type_name[] = { +#define SYM_TYPE(X) [X] = #X + SYM_TYPE(STT_NOTYPE), + SYM_TYPE(STT_OBJECT), + SYM_TYPE(STT_FUNC), + SYM_TYPE(STT_SECTION), + SYM_TYPE(STT_FILE), + SYM_TYPE(STT_COMMON), + SYM_TYPE(STT_TLS), +#undef SYM_TYPE + }; + const char *name = "unknown sym type name"; + if (type < ARRAY_SIZE(type_name)) { + name = type_name[type]; + } + return name; +} + +static const char *sym_bind(unsigned bind) +{ + static const char *bind_name[] = { +#define SYM_BIND(X) [X] = #X + SYM_BIND(STB_LOCAL), + SYM_BIND(STB_GLOBAL), + SYM_BIND(STB_WEAK), +#undef SYM_BIND + }; + const char *name = "unknown sym bind name"; + if (bind < ARRAY_SIZE(bind_name)) { + name = bind_name[bind]; + } + return name; +} + +static const char *sym_visibility(unsigned visibility) +{ + static const char *visibility_name[] = { +#define SYM_VISIBILITY(X) [X] = #X + SYM_VISIBILITY(STV_DEFAULT), + SYM_VISIBILITY(STV_INTERNAL), + SYM_VISIBILITY(STV_HIDDEN), + SYM_VISIBILITY(STV_PROTECTED), +#undef SYM_VISIBILITY + }; + const char *name = "unknown sym visibility name"; + if (visibility < ARRAY_SIZE(visibility_name)) { + name = visibility_name[visibility]; + } + return name; +} + +static const char *rel_type(unsigned type) +{ + static const char *type_name[] = { +#define REL_TYPE(X) [X] = #X + REL_TYPE(R_386_NONE), + REL_TYPE(R_386_32), + REL_TYPE(R_386_PC32), + REL_TYPE(R_386_GOT32), + REL_TYPE(R_386_PLT32), + REL_TYPE(R_386_COPY), + REL_TYPE(R_386_GLOB_DAT), + REL_TYPE(R_386_JMP_SLOT), + REL_TYPE(R_386_RELATIVE), + REL_TYPE(R_386_GOTOFF), + REL_TYPE(R_386_GOTPC), + REL_TYPE(R_386_8), + REL_TYPE(R_386_PC8), + REL_TYPE(R_386_16), + REL_TYPE(R_386_PC16), +#undef REL_TYPE + }; + const char *name = "unknown type rel type name"; + if (type < ARRAY_SIZE(type_name) && type_name[type]) { + name = type_name[type]; + } + return name; +} + +static 
const char *sec_name(unsigned shndx) +{ + const char *sec_strtab; + const char *name; + sec_strtab = secs[ehdr.e_shstrndx].strtab; + name = ""; + if (shndx < ehdr.e_shnum) { + name = sec_strtab + secs[shndx].shdr.sh_name; + } + else if (shndx == SHN_ABS) { + name = "ABSOLUTE"; + } + else if (shndx == SHN_COMMON) { + name = "COMMON"; + } + return name; +} + +static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym) +{ + const char *name; + name = ""; + if (sym->st_name) { + name = sym_strtab + sym->st_name; + } + else { + name = sec_name(sym->st_shndx); + } + return name; +} + + + +#if BYTE_ORDER == LITTLE_ENDIAN +#define le16_to_cpu(val) (val) +#define le32_to_cpu(val) (val) +#endif +#if BYTE_ORDER == BIG_ENDIAN +#define le16_to_cpu(val) bswap_16(val) +#define le32_to_cpu(val) bswap_32(val) +#endif + +static uint16_t elf16_to_cpu(uint16_t val) +{ + return le16_to_cpu(val); +} + +static uint32_t elf32_to_cpu(uint32_t val) +{ + return le32_to_cpu(val); +} + +static void read_ehdr(FILE *fp) +{ + if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) { + die("Cannot read ELF header: %s\n", + strerror(errno)); + } + if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) { + die("No ELF magic\n"); + } + if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) { + die("Not a 32 bit executable\n"); + } + if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) { + die("Not a LSB ELF executable\n"); + } + if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) { + die("Unknown ELF version\n"); + } + /* Convert the fields to native endian */ + ehdr.e_type = elf16_to_cpu(ehdr.e_type); + ehdr.e_machine = elf16_to_cpu(ehdr.e_machine); + ehdr.e_version = elf32_to_cpu(ehdr.e_version); + ehdr.e_entry = elf32_to_cpu(ehdr.e_entry); + ehdr.e_phoff = elf32_to_cpu(ehdr.e_phoff); + ehdr.e_shoff = elf32_to_cpu(ehdr.e_shoff); + ehdr.e_flags = elf32_to_cpu(ehdr.e_flags); + ehdr.e_ehsize = elf16_to_cpu(ehdr.e_ehsize); + ehdr.e_phentsize = elf16_to_cpu(ehdr.e_phentsize); + ehdr.e_phnum = elf16_to_cpu(ehdr.e_phnum); + ehdr.e_shentsize = 
elf16_to_cpu(ehdr.e_shentsize); + ehdr.e_shnum = elf16_to_cpu(ehdr.e_shnum); + ehdr.e_shstrndx = elf16_to_cpu(ehdr.e_shstrndx); + + if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) { + die("Unsupported ELF header type\n"); + } + if (ehdr.e_machine != EM_386) { + die("Not for x86\n"); + } + if (ehdr.e_version != EV_CURRENT) { + die("Unknown ELF version\n"); + } + if (ehdr.e_ehsize != sizeof(Elf32_Ehdr)) { + die("Bad Elf header size\n"); + } + if (ehdr.e_phentsize != sizeof(Elf32_Phdr)) { + die("Bad program header entry\n"); + } + if (ehdr.e_shentsize != sizeof(Elf32_Shdr)) { + die("Bad section header entry\n"); + } + if (ehdr.e_shstrndx >= ehdr.e_shnum) { + die("String table index out of bounds\n"); + } +} + +static void read_shdrs(FILE *fp) +{ + int i; + Elf32_Shdr shdr; + + secs = calloc(ehdr.e_shnum, sizeof(struct section)); + if (!secs) { + die("Unable to allocate %d section headers\n", + ehdr.e_shnum); + } + if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) { + die("Seek to %d failed: %s\n", + ehdr.e_shoff, strerror(errno)); + } + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + if (fread(&shdr, sizeof shdr, 1, fp) != 1) + die("Cannot read ELF section headers %d/%d: %s\n", + i, ehdr.e_shnum, strerror(errno)); + sec->shdr.sh_name = elf32_to_cpu(shdr.sh_name); + sec->shdr.sh_type = elf32_to_cpu(shdr.sh_type); + sec->shdr.sh_flags = elf32_to_cpu(shdr.sh_flags); + sec->shdr.sh_addr = elf32_to_cpu(shdr.sh_addr); + sec->shdr.sh_offset = elf32_to_cpu(shdr.sh_offset); + sec->shdr.sh_size = elf32_to_cpu(shdr.sh_size); + sec->shdr.sh_link = elf32_to_cpu(shdr.sh_link); + sec->shdr.sh_info = elf32_to_cpu(shdr.sh_info); + sec->shdr.sh_addralign = elf32_to_cpu(shdr.sh_addralign); + sec->shdr.sh_entsize = elf32_to_cpu(shdr.sh_entsize); + if (sec->shdr.sh_link < ehdr.e_shnum) + sec->link = &secs[sec->shdr.sh_link]; + } + +} + +static void read_strtabs(FILE *fp) +{ + int i; + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + if 
(sec->shdr.sh_type != SHT_STRTAB) { + continue; + } + sec->strtab = malloc(sec->shdr.sh_size); + if (!sec->strtab) { + die("malloc of %d bytes for strtab failed\n", + sec->shdr.sh_size); + } + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { + die("Seek to %d failed: %s\n", + sec->shdr.sh_offset, strerror(errno)); + } + if (fread(sec->strtab, 1, sec->shdr.sh_size, fp) + != sec->shdr.sh_size) { + die("Cannot read symbol table: %s\n", + strerror(errno)); + } + } +} + +static void read_symtabs(FILE *fp) +{ + int i,j; + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + if (sec->shdr.sh_type != SHT_SYMTAB) { + continue; + } + sec->symtab = malloc(sec->shdr.sh_size); + if (!sec->symtab) { + die("malloc of %d bytes for symtab failed\n", + sec->shdr.sh_size); + } + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { + die("Seek to %d failed: %s\n", + sec->shdr.sh_offset, strerror(errno)); + } + if (fread(sec->symtab, 1, sec->shdr.sh_size, fp) + != sec->shdr.sh_size) { + die("Cannot read symbol table: %s\n", + strerror(errno)); + } + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) { + Elf32_Sym *sym = &sec->symtab[j]; + sym->st_name = elf32_to_cpu(sym->st_name); + sym->st_value = elf32_to_cpu(sym->st_value); + sym->st_size = elf32_to_cpu(sym->st_size); + sym->st_shndx = elf16_to_cpu(sym->st_shndx); + } + } +} + + +static void read_relocs(FILE *fp) +{ + int i,j; + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + if (sec->shdr.sh_type != SHT_REL) { + continue; + } + sec->reltab = malloc(sec->shdr.sh_size); + if (!sec->reltab) { + die("malloc of %d bytes for relocs failed\n", + sec->shdr.sh_size); + } + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { + die("Seek to %d failed: %s\n", + sec->shdr.sh_offset, strerror(errno)); + } + if (fread(sec->reltab, 1, sec->shdr.sh_size, fp) + != sec->shdr.sh_size) { + die("Cannot read symbol table: %s\n", + strerror(errno)); + } + for (j = 0; j < 
sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { + Elf32_Rel *rel = &sec->reltab[j]; + rel->r_offset = elf32_to_cpu(rel->r_offset); + rel->r_info = elf32_to_cpu(rel->r_info); + } + } +} + + +static void print_absolute_symbols(void) +{ + int i; + printf("Absolute symbols\n"); + printf(" Num: Value Size Type Bind Visibility Name\n"); + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + char *sym_strtab; + int j; + + if (sec->shdr.sh_type != SHT_SYMTAB) { + continue; + } + sym_strtab = sec->link->strtab; + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) { + Elf32_Sym *sym; + const char *name; + sym = &sec->symtab[j]; + name = sym_name(sym_strtab, sym); + if (sym->st_shndx != SHN_ABS) { + continue; + } + printf("%5d %08x %5d %10s %10s %12s %s\n", + j, sym->st_value, sym->st_size, + sym_type(ELF32_ST_TYPE(sym->st_info)), + sym_bind(ELF32_ST_BIND(sym->st_info)), + sym_visibility(ELF32_ST_VISIBILITY(sym->st_other)), + name); + } + } + printf("\n"); +} + +static void print_absolute_relocs(void) +{ + int i, printed = 0; + + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = &secs[i]; + struct section *sec_applies, *sec_symtab; + char *sym_strtab; + Elf32_Sym *sh_symtab; + int j; + if (sec->shdr.sh_type != SHT_REL) { + continue; + } + sec_symtab = sec->link; + sec_applies = &secs[sec->shdr.sh_info]; + if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) { + continue; + } + sh_symtab = sec_symtab->symtab; + sym_strtab = sec_symtab->link->strtab; + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { + Elf32_Rel *rel; + Elf32_Sym *sym; + const char *name; + rel = &sec->reltab[j]; + sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; + name = sym_name(sym_strtab, sym); + if (sym->st_shndx != SHN_ABS) { + continue; + } + + /* Absolute symbols are not relocated if bzImage is + * loaded at a non-compiled address. Display a warning + * to user at compile time about the absolute + * relocations present. 
+ * + * User need to audit the code to make sure + * some symbols which should have been section + * relative have not become absolute because of some + * linker optimization or wrong programming usage. + * + * Before warning check if this absolute symbol + * relocation is harmless. + */ + if (is_reloc(S_ABS, name) || is_reloc(S_REL, name)) + continue; + + if (!printed) { + printf("WARNING: Absolute relocations" + " present\n"); + printf("Offset Info Type Sym.Value " + "Sym.Name\n"); + printed = 1; + } + + printf("%08x %08x %10s %08x %s\n", + rel->r_offset, + rel->r_info, + rel_type(ELF32_R_TYPE(rel->r_info)), + sym->st_value, + name); + } + } + + if (printed) + printf("\n"); +} + +static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym), + int use_real_mode) +{ + int i; + /* Walk through the relocations */ + for (i = 0; i < ehdr.e_shnum; i++) { + char *sym_strtab; + Elf32_Sym *sh_symtab; + struct section *sec_applies, *sec_symtab; + int j; + struct section *sec = &secs[i]; + + if (sec->shdr.sh_type != SHT_REL) { + continue; + } + sec_symtab = sec->link; + sec_applies = &secs[sec->shdr.sh_info]; + if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) { + continue; + } + sh_symtab = sec_symtab->symtab; + sym_strtab = sec_symtab->link->strtab; + for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { + Elf32_Rel *rel; + Elf32_Sym *sym; + unsigned r_type; + const char *symname; + rel = &sec->reltab[j]; + sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; + r_type = ELF32_R_TYPE(rel->r_info); + + switch (r_type) { + case R_386_NONE: + case R_386_PC32: + case R_386_PC16: + case R_386_PC8: + /* + * NONE can be ignored and and PC relative + * relocations don't need to be adjusted. 
+ */ + break; + + case R_386_16: + symname = sym_name(sym_strtab, sym); + if (!use_real_mode) + goto bad; + if (sym->st_shndx == SHN_ABS) { + if (is_reloc(S_ABS, symname)) + break; + else if (!is_reloc(S_SEG, symname)) + goto bad; + } else { + if (is_reloc(S_LIN, symname)) + goto bad; + else + break; + } + visit(rel, sym); + break; + + case R_386_32: + symname = sym_name(sym_strtab, sym); + if (sym->st_shndx == SHN_ABS) { + if (is_reloc(S_ABS, symname)) + break; + else if (!is_reloc(S_REL, symname)) + goto bad; + } else { + if (use_real_mode && + !is_reloc(S_LIN, symname)) + break; + } + visit(rel, sym); + break; + default: + die("Unsupported relocation type: %s (%d)\n", + rel_type(r_type), r_type); + break; + bad: + symname = sym_name(sym_strtab, sym); + die("Invalid %s relocation: %s\n", + rel_type(r_type), symname); + } + } + } +} + +static void count_reloc(Elf32_Rel *rel, Elf32_Sym *sym) +{ + if (ELF32_R_TYPE(rel->r_info) == R_386_16) + reloc16_count++; + else + reloc_count++; +} + +static void collect_reloc(Elf32_Rel *rel, Elf32_Sym *sym) +{ + /* Remember the address that needs to be adjusted. */ + if (ELF32_R_TYPE(rel->r_info) == R_386_16) + relocs16[reloc16_idx++] = rel->r_offset; + else + relocs[reloc_idx++] = rel->r_offset; +} + +static int cmp_relocs(const void *va, const void *vb) +{ + const unsigned long *a, *b; + a = va; b = vb; + return (*a == *b)? 0 : (*a > *b)? 1 : -1; +} + +static int write32(unsigned int v, FILE *f) +{ + unsigned char buf[4]; + + put_unaligned_le32(v, buf); + return fwrite(buf, 1, 4, f) == 4 ? 0 : -1; +} + +static void emit_relocs(int as_text, int use_real_mode) +{ + int i; + /* Count how many relocations I have and allocate space for them. 
*/ + reloc_count = 0; + walk_relocs(count_reloc, use_real_mode); + relocs = malloc(reloc_count * sizeof(relocs[0])); + if (!relocs) { + die("malloc of %d entries for relocs failed\n", + reloc_count); + } + + relocs16 = malloc(reloc16_count * sizeof(relocs[0])); + if (!relocs16) { + die("malloc of %d entries for relocs16 failed\n", + reloc16_count); + } + /* Collect up the relocations */ + reloc_idx = 0; + walk_relocs(collect_reloc, use_real_mode); + + if (reloc16_count && !use_real_mode) + die("Segment relocations found but --realmode not specified\n"); + + /* Order the relocations for more efficient processing */ + qsort(relocs, reloc_count, sizeof(relocs[0]), cmp_relocs); + qsort(relocs16, reloc16_count, sizeof(relocs16[0]), cmp_relocs); + + /* Print the relocations */ + if (as_text) { + /* Print the relocations in a form suitable that + * gas will like. + */ + printf(".section \".data.reloc\",\"a\"\n"); + printf(".balign 4\n"); + if (use_real_mode) { + printf("\t.long %lu\n", reloc16_count); + for (i = 0; i < reloc16_count; i++) + printf("\t.long 0x%08lx\n", relocs16[i]); + printf("\t.long %lu\n", reloc_count); + for (i = 0; i < reloc_count; i++) { + printf("\t.long 0x%08lx\n", relocs[i]); + } + } else { + /* Print a stop */ + printf("\t.long 0x%08lx\n", (unsigned long)0); + for (i = 0; i < reloc_count; i++) { + printf("\t.long 0x%08lx\n", relocs[i]); + } + } + + printf("\n"); + } + else { + if (use_real_mode) { + write32(reloc16_count, stdout); + for (i = 0; i < reloc16_count; i++) + write32(relocs16[i], stdout); + write32(reloc_count, stdout); + + /* Now print each relocation */ + for (i = 0; i < reloc_count; i++) + write32(relocs[i], stdout); + } else { + /* Print a stop */ + write32(0, stdout); + + /* Now print each relocation */ + for (i = 0; i < reloc_count; i++) { + write32(relocs[i], stdout); + } + } + } +} + +static void usage(void) +{ + die("relocs [--abs-syms|--abs-relocs|--text|--realmode] vmlinux\n"); +} + +int main(int argc, char **argv) +{ + int 
show_absolute_syms, show_absolute_relocs; + int as_text, use_real_mode; + const char *fname; + FILE *fp; + int i; + + show_absolute_syms = 0; + show_absolute_relocs = 0; + as_text = 0; + use_real_mode = 0; + fname = NULL; + for (i = 1; i < argc; i++) { + char *arg = argv[i]; + if (*arg == '-') { + if (strcmp(arg, "--abs-syms") == 0) { + show_absolute_syms = 1; + continue; + } + if (strcmp(arg, "--abs-relocs") == 0) { + show_absolute_relocs = 1; + continue; + } + if (strcmp(arg, "--text") == 0) { + as_text = 1; + continue; + } + if (strcmp(arg, "--realmode") == 0) { + use_real_mode = 1; + continue; + } + } + else if (!fname) { + fname = arg; + continue; + } + usage(); + } + if (!fname) { + usage(); + } + regex_init(use_real_mode); + fp = fopen(fname, "r"); + if (!fp) { + die("Cannot open %s: %s\n", + fname, strerror(errno)); + } + read_ehdr(fp); + read_shdrs(fp); + read_strtabs(fp); + read_symtabs(fp); + read_relocs(fp); + if (show_absolute_syms) { + print_absolute_symbols(); + return 0; + } + if (show_absolute_relocs) { + print_absolute_relocs(); + return 0; + } + emit_relocs(as_text, use_real_mode); + return 0; +} diff --git a/scripts/Makefile b/scripts/Makefile index a241359d2c82..36266665dbcb 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -15,7 +15,6 @@ hostprogs-$(CONFIG_LOGO) += pnmtologo hostprogs-$(CONFIG_VT) += conmakehash hostprogs-$(CONFIG_IKCONFIG) += bin2c hostprogs-$(BUILD_C_RECORDMCOUNT) += recordmcount -hostprogs-$(CONFIG_X86) += x86-relocs always := $(hostprogs-y) $(hostprogs-m) diff --git a/scripts/x86-relocs.c b/scripts/x86-relocs.c deleted file mode 100644 index 74e16bb15dc4..000000000000 --- a/scripts/x86-relocs.c +++ /dev/null @@ -1,804 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#define USE_BSD -#include -#include -#include - -static void die(char *fmt, ...); - -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) -static Elf32_Ehdr ehdr; -static unsigned long reloc_count, 
reloc_idx; -static unsigned long *relocs; -static unsigned long reloc16_count, reloc16_idx; -static unsigned long *relocs16; - -struct section { - Elf32_Shdr shdr; - struct section *link; - Elf32_Sym *symtab; - Elf32_Rel *reltab; - char *strtab; -}; -static struct section *secs; - -enum symtype { - S_ABS, - S_REL, - S_SEG, - S_LIN, - S_NSYMTYPES -}; - -static const char * const sym_regex_kernel[S_NSYMTYPES] = { -/* - * Following symbols have been audited. There values are constant and do - * not change if bzImage is loaded at a different physical address than - * the address for which it has been compiled. Don't warn user about - * absolute relocations present w.r.t these symbols. - */ - [S_ABS] = - "^(xen_irq_disable_direct_reloc$|" - "xen_save_fl_direct_reloc$|" - "VDSO|" - "__crc_)", - -/* - * These symbols are known to be relative, even if the linker marks them - * as absolute (typically defined outside any section in the linker script.) - */ - [S_REL] = - "^_end$", -}; - - -static const char * const sym_regex_realmode[S_NSYMTYPES] = { -/* - * These symbols are known to be relative, even if the linker marks them - * as absolute (typically defined outside any section in the linker script.) - */ - [S_REL] = - "^pa_", - -/* - * These are 16-bit segment symbols when compiling 16-bit code. - */ - [S_SEG] = - "^real_mode_seg$", - -/* - * These are offsets belonging to segments, as opposed to linear addresses, - * when compiling 16-bit code. 
- */ - [S_LIN] = - "^pa_", -}; - -static const char * const *sym_regex; - -static regex_t sym_regex_c[S_NSYMTYPES]; -static int is_reloc(enum symtype type, const char *sym_name) -{ - return sym_regex[type] && - !regexec(&sym_regex_c[type], sym_name, 0, NULL, 0); -} - -static void regex_init(int use_real_mode) -{ - char errbuf[128]; - int err; - int i; - - if (use_real_mode) - sym_regex = sym_regex_realmode; - else - sym_regex = sym_regex_kernel; - - for (i = 0; i < S_NSYMTYPES; i++) { - if (!sym_regex[i]) - continue; - - err = regcomp(&sym_regex_c[i], sym_regex[i], - REG_EXTENDED|REG_NOSUB); - - if (err) { - regerror(err, &sym_regex_c[i], errbuf, sizeof errbuf); - die("%s", errbuf); - } - } -} - -static void die(char *fmt, ...) -{ - va_list ap; - va_start(ap, fmt); - vfprintf(stderr, fmt, ap); - va_end(ap); - exit(1); -} - -static const char *sym_type(unsigned type) -{ - static const char *type_name[] = { -#define SYM_TYPE(X) [X] = #X - SYM_TYPE(STT_NOTYPE), - SYM_TYPE(STT_OBJECT), - SYM_TYPE(STT_FUNC), - SYM_TYPE(STT_SECTION), - SYM_TYPE(STT_FILE), - SYM_TYPE(STT_COMMON), - SYM_TYPE(STT_TLS), -#undef SYM_TYPE - }; - const char *name = "unknown sym type name"; - if (type < ARRAY_SIZE(type_name)) { - name = type_name[type]; - } - return name; -} - -static const char *sym_bind(unsigned bind) -{ - static const char *bind_name[] = { -#define SYM_BIND(X) [X] = #X - SYM_BIND(STB_LOCAL), - SYM_BIND(STB_GLOBAL), - SYM_BIND(STB_WEAK), -#undef SYM_BIND - }; - const char *name = "unknown sym bind name"; - if (bind < ARRAY_SIZE(bind_name)) { - name = bind_name[bind]; - } - return name; -} - -static const char *sym_visibility(unsigned visibility) -{ - static const char *visibility_name[] = { -#define SYM_VISIBILITY(X) [X] = #X - SYM_VISIBILITY(STV_DEFAULT), - SYM_VISIBILITY(STV_INTERNAL), - SYM_VISIBILITY(STV_HIDDEN), - SYM_VISIBILITY(STV_PROTECTED), -#undef SYM_VISIBILITY - }; - const char *name = "unknown sym visibility name"; - if (visibility < ARRAY_SIZE(visibility_name)) { 
- name = visibility_name[visibility]; - } - return name; -} - -static const char *rel_type(unsigned type) -{ - static const char *type_name[] = { -#define REL_TYPE(X) [X] = #X - REL_TYPE(R_386_NONE), - REL_TYPE(R_386_32), - REL_TYPE(R_386_PC32), - REL_TYPE(R_386_GOT32), - REL_TYPE(R_386_PLT32), - REL_TYPE(R_386_COPY), - REL_TYPE(R_386_GLOB_DAT), - REL_TYPE(R_386_JMP_SLOT), - REL_TYPE(R_386_RELATIVE), - REL_TYPE(R_386_GOTOFF), - REL_TYPE(R_386_GOTPC), - REL_TYPE(R_386_8), - REL_TYPE(R_386_PC8), - REL_TYPE(R_386_16), - REL_TYPE(R_386_PC16), -#undef REL_TYPE - }; - const char *name = "unknown type rel type name"; - if (type < ARRAY_SIZE(type_name) && type_name[type]) { - name = type_name[type]; - } - return name; -} - -static const char *sec_name(unsigned shndx) -{ - const char *sec_strtab; - const char *name; - sec_strtab = secs[ehdr.e_shstrndx].strtab; - name = ""; - if (shndx < ehdr.e_shnum) { - name = sec_strtab + secs[shndx].shdr.sh_name; - } - else if (shndx == SHN_ABS) { - name = "ABSOLUTE"; - } - else if (shndx == SHN_COMMON) { - name = "COMMON"; - } - return name; -} - -static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym) -{ - const char *name; - name = ""; - if (sym->st_name) { - name = sym_strtab + sym->st_name; - } - else { - name = sec_name(sym->st_shndx); - } - return name; -} - - - -#if BYTE_ORDER == LITTLE_ENDIAN -#define le16_to_cpu(val) (val) -#define le32_to_cpu(val) (val) -#endif -#if BYTE_ORDER == BIG_ENDIAN -#define le16_to_cpu(val) bswap_16(val) -#define le32_to_cpu(val) bswap_32(val) -#endif - -static uint16_t elf16_to_cpu(uint16_t val) -{ - return le16_to_cpu(val); -} - -static uint32_t elf32_to_cpu(uint32_t val) -{ - return le32_to_cpu(val); -} - -static void read_ehdr(FILE *fp) -{ - if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) { - die("Cannot read ELF header: %s\n", - strerror(errno)); - } - if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0) { - die("No ELF magic\n"); - } - if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) { - die("Not a 
32 bit executable\n"); - } - if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) { - die("Not a LSB ELF executable\n"); - } - if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) { - die("Unknown ELF version\n"); - } - /* Convert the fields to native endian */ - ehdr.e_type = elf16_to_cpu(ehdr.e_type); - ehdr.e_machine = elf16_to_cpu(ehdr.e_machine); - ehdr.e_version = elf32_to_cpu(ehdr.e_version); - ehdr.e_entry = elf32_to_cpu(ehdr.e_entry); - ehdr.e_phoff = elf32_to_cpu(ehdr.e_phoff); - ehdr.e_shoff = elf32_to_cpu(ehdr.e_shoff); - ehdr.e_flags = elf32_to_cpu(ehdr.e_flags); - ehdr.e_ehsize = elf16_to_cpu(ehdr.e_ehsize); - ehdr.e_phentsize = elf16_to_cpu(ehdr.e_phentsize); - ehdr.e_phnum = elf16_to_cpu(ehdr.e_phnum); - ehdr.e_shentsize = elf16_to_cpu(ehdr.e_shentsize); - ehdr.e_shnum = elf16_to_cpu(ehdr.e_shnum); - ehdr.e_shstrndx = elf16_to_cpu(ehdr.e_shstrndx); - - if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) { - die("Unsupported ELF header type\n"); - } - if (ehdr.e_machine != EM_386) { - die("Not for x86\n"); - } - if (ehdr.e_version != EV_CURRENT) { - die("Unknown ELF version\n"); - } - if (ehdr.e_ehsize != sizeof(Elf32_Ehdr)) { - die("Bad Elf header size\n"); - } - if (ehdr.e_phentsize != sizeof(Elf32_Phdr)) { - die("Bad program header entry\n"); - } - if (ehdr.e_shentsize != sizeof(Elf32_Shdr)) { - die("Bad section header entry\n"); - } - if (ehdr.e_shstrndx >= ehdr.e_shnum) { - die("String table index out of bounds\n"); - } -} - -static void read_shdrs(FILE *fp) -{ - int i; - Elf32_Shdr shdr; - - secs = calloc(ehdr.e_shnum, sizeof(struct section)); - if (!secs) { - die("Unable to allocate %d section headers\n", - ehdr.e_shnum); - } - if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) { - die("Seek to %d failed: %s\n", - ehdr.e_shoff, strerror(errno)); - } - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - if (fread(&shdr, sizeof shdr, 1, fp) != 1) - die("Cannot read ELF section headers %d/%d: %s\n", - i, ehdr.e_shnum, strerror(errno)); - 
sec->shdr.sh_name = elf32_to_cpu(shdr.sh_name); - sec->shdr.sh_type = elf32_to_cpu(shdr.sh_type); - sec->shdr.sh_flags = elf32_to_cpu(shdr.sh_flags); - sec->shdr.sh_addr = elf32_to_cpu(shdr.sh_addr); - sec->shdr.sh_offset = elf32_to_cpu(shdr.sh_offset); - sec->shdr.sh_size = elf32_to_cpu(shdr.sh_size); - sec->shdr.sh_link = elf32_to_cpu(shdr.sh_link); - sec->shdr.sh_info = elf32_to_cpu(shdr.sh_info); - sec->shdr.sh_addralign = elf32_to_cpu(shdr.sh_addralign); - sec->shdr.sh_entsize = elf32_to_cpu(shdr.sh_entsize); - if (sec->shdr.sh_link < ehdr.e_shnum) - sec->link = &secs[sec->shdr.sh_link]; - } - -} - -static void read_strtabs(FILE *fp) -{ - int i; - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - if (sec->shdr.sh_type != SHT_STRTAB) { - continue; - } - sec->strtab = malloc(sec->shdr.sh_size); - if (!sec->strtab) { - die("malloc of %d bytes for strtab failed\n", - sec->shdr.sh_size); - } - if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { - die("Seek to %d failed: %s\n", - sec->shdr.sh_offset, strerror(errno)); - } - if (fread(sec->strtab, 1, sec->shdr.sh_size, fp) - != sec->shdr.sh_size) { - die("Cannot read symbol table: %s\n", - strerror(errno)); - } - } -} - -static void read_symtabs(FILE *fp) -{ - int i,j; - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - if (sec->shdr.sh_type != SHT_SYMTAB) { - continue; - } - sec->symtab = malloc(sec->shdr.sh_size); - if (!sec->symtab) { - die("malloc of %d bytes for symtab failed\n", - sec->shdr.sh_size); - } - if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { - die("Seek to %d failed: %s\n", - sec->shdr.sh_offset, strerror(errno)); - } - if (fread(sec->symtab, 1, sec->shdr.sh_size, fp) - != sec->shdr.sh_size) { - die("Cannot read symbol table: %s\n", - strerror(errno)); - } - for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) { - Elf32_Sym *sym = &sec->symtab[j]; - sym->st_name = elf32_to_cpu(sym->st_name); - sym->st_value = elf32_to_cpu(sym->st_value); - 
sym->st_size = elf32_to_cpu(sym->st_size); - sym->st_shndx = elf16_to_cpu(sym->st_shndx); - } - } -} - - -static void read_relocs(FILE *fp) -{ - int i,j; - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - if (sec->shdr.sh_type != SHT_REL) { - continue; - } - sec->reltab = malloc(sec->shdr.sh_size); - if (!sec->reltab) { - die("malloc of %d bytes for relocs failed\n", - sec->shdr.sh_size); - } - if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { - die("Seek to %d failed: %s\n", - sec->shdr.sh_offset, strerror(errno)); - } - if (fread(sec->reltab, 1, sec->shdr.sh_size, fp) - != sec->shdr.sh_size) { - die("Cannot read symbol table: %s\n", - strerror(errno)); - } - for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { - Elf32_Rel *rel = &sec->reltab[j]; - rel->r_offset = elf32_to_cpu(rel->r_offset); - rel->r_info = elf32_to_cpu(rel->r_info); - } - } -} - - -static void print_absolute_symbols(void) -{ - int i; - printf("Absolute symbols\n"); - printf(" Num: Value Size Type Bind Visibility Name\n"); - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - char *sym_strtab; - int j; - - if (sec->shdr.sh_type != SHT_SYMTAB) { - continue; - } - sym_strtab = sec->link->strtab; - for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Sym); j++) { - Elf32_Sym *sym; - const char *name; - sym = &sec->symtab[j]; - name = sym_name(sym_strtab, sym); - if (sym->st_shndx != SHN_ABS) { - continue; - } - printf("%5d %08x %5d %10s %10s %12s %s\n", - j, sym->st_value, sym->st_size, - sym_type(ELF32_ST_TYPE(sym->st_info)), - sym_bind(ELF32_ST_BIND(sym->st_info)), - sym_visibility(ELF32_ST_VISIBILITY(sym->st_other)), - name); - } - } - printf("\n"); -} - -static void print_absolute_relocs(void) -{ - int i, printed = 0; - - for (i = 0; i < ehdr.e_shnum; i++) { - struct section *sec = &secs[i]; - struct section *sec_applies, *sec_symtab; - char *sym_strtab; - Elf32_Sym *sh_symtab; - int j; - if (sec->shdr.sh_type != SHT_REL) { - continue; - } - 
sec_symtab = sec->link; - sec_applies = &secs[sec->shdr.sh_info]; - if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) { - continue; - } - sh_symtab = sec_symtab->symtab; - sym_strtab = sec_symtab->link->strtab; - for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { - Elf32_Rel *rel; - Elf32_Sym *sym; - const char *name; - rel = &sec->reltab[j]; - sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; - name = sym_name(sym_strtab, sym); - if (sym->st_shndx != SHN_ABS) { - continue; - } - - /* Absolute symbols are not relocated if bzImage is - * loaded at a non-compiled address. Display a warning - * to user at compile time about the absolute - * relocations present. - * - * User need to audit the code to make sure - * some symbols which should have been section - * relative have not become absolute because of some - * linker optimization or wrong programming usage. - * - * Before warning check if this absolute symbol - * relocation is harmless. - */ - if (is_reloc(S_ABS, name) || is_reloc(S_REL, name)) - continue; - - if (!printed) { - printf("WARNING: Absolute relocations" - " present\n"); - printf("Offset Info Type Sym.Value " - "Sym.Name\n"); - printed = 1; - } - - printf("%08x %08x %10s %08x %s\n", - rel->r_offset, - rel->r_info, - rel_type(ELF32_R_TYPE(rel->r_info)), - sym->st_value, - name); - } - } - - if (printed) - printf("\n"); -} - -static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym), - int use_real_mode) -{ - int i; - /* Walk through the relocations */ - for (i = 0; i < ehdr.e_shnum; i++) { - char *sym_strtab; - Elf32_Sym *sh_symtab; - struct section *sec_applies, *sec_symtab; - int j; - struct section *sec = &secs[i]; - - if (sec->shdr.sh_type != SHT_REL) { - continue; - } - sec_symtab = sec->link; - sec_applies = &secs[sec->shdr.sh_info]; - if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) { - continue; - } - sh_symtab = sec_symtab->symtab; - sym_strtab = sec_symtab->link->strtab; - for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { - 
Elf32_Rel *rel; - Elf32_Sym *sym; - unsigned r_type; - const char *symname; - rel = &sec->reltab[j]; - sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; - r_type = ELF32_R_TYPE(rel->r_info); - - switch (r_type) { - case R_386_NONE: - case R_386_PC32: - case R_386_PC16: - case R_386_PC8: - /* - * NONE can be ignored and and PC relative - * relocations don't need to be adjusted. - */ - break; - - case R_386_16: - symname = sym_name(sym_strtab, sym); - if (!use_real_mode) - goto bad; - if (sym->st_shndx == SHN_ABS) { - if (is_reloc(S_ABS, symname)) - break; - else if (!is_reloc(S_SEG, symname)) - goto bad; - } else { - if (is_reloc(S_LIN, symname)) - goto bad; - else - break; - } - visit(rel, sym); - break; - - case R_386_32: - symname = sym_name(sym_strtab, sym); - if (sym->st_shndx == SHN_ABS) { - if (is_reloc(S_ABS, symname)) - break; - else if (!is_reloc(S_REL, symname)) - goto bad; - } else { - if (use_real_mode && - !is_reloc(S_LIN, symname)) - break; - } - visit(rel, sym); - break; - default: - die("Unsupported relocation type: %s (%d)\n", - rel_type(r_type), r_type); - break; - bad: - symname = sym_name(sym_strtab, sym); - die("Invalid %s relocation: %s\n", - rel_type(r_type), symname); - } - } - } -} - -static void count_reloc(Elf32_Rel *rel, Elf32_Sym *sym) -{ - if (ELF32_R_TYPE(rel->r_info) == R_386_16) - reloc16_count++; - else - reloc_count++; -} - -static void collect_reloc(Elf32_Rel *rel, Elf32_Sym *sym) -{ - /* Remember the address that needs to be adjusted. */ - if (ELF32_R_TYPE(rel->r_info) == R_386_16) - relocs16[reloc16_idx++] = rel->r_offset; - else - relocs[reloc_idx++] = rel->r_offset; -} - -static int cmp_relocs(const void *va, const void *vb) -{ - const unsigned long *a, *b; - a = va; b = vb; - return (*a == *b)? 0 : (*a > *b)? 1 : -1; -} - -static int write32(unsigned int v, FILE *f) -{ - unsigned char buf[4]; - - put_unaligned_le32(v, buf); - return fwrite(buf, 1, 4, f) == 4 ? 
0 : -1; -} - -static void emit_relocs(int as_text, int use_real_mode) -{ - int i; - /* Count how many relocations I have and allocate space for them. */ - reloc_count = 0; - walk_relocs(count_reloc, use_real_mode); - relocs = malloc(reloc_count * sizeof(relocs[0])); - if (!relocs) { - die("malloc of %d entries for relocs failed\n", - reloc_count); - } - - relocs16 = malloc(reloc16_count * sizeof(relocs[0])); - if (!relocs16) { - die("malloc of %d entries for relocs16 failed\n", - reloc16_count); - } - /* Collect up the relocations */ - reloc_idx = 0; - walk_relocs(collect_reloc, use_real_mode); - - if (reloc16_count && !use_real_mode) - die("Segment relocations found but --realmode not specified\n"); - - /* Order the relocations for more efficient processing */ - qsort(relocs, reloc_count, sizeof(relocs[0]), cmp_relocs); - qsort(relocs16, reloc16_count, sizeof(relocs16[0]), cmp_relocs); - - /* Print the relocations */ - if (as_text) { - /* Print the relocations in a form suitable that - * gas will like. 
- */ - printf(".section \".data.reloc\",\"a\"\n"); - printf(".balign 4\n"); - if (use_real_mode) { - printf("\t.long %lu\n", reloc16_count); - for (i = 0; i < reloc16_count; i++) - printf("\t.long 0x%08lx\n", relocs16[i]); - printf("\t.long %lu\n", reloc_count); - for (i = 0; i < reloc_count; i++) { - printf("\t.long 0x%08lx\n", relocs[i]); - } - } else { - /* Print a stop */ - printf("\t.long 0x%08lx\n", (unsigned long)0); - for (i = 0; i < reloc_count; i++) { - printf("\t.long 0x%08lx\n", relocs[i]); - } - } - - printf("\n"); - } - else { - if (use_real_mode) { - write32(reloc16_count, stdout); - for (i = 0; i < reloc16_count; i++) - write32(relocs16[i], stdout); - write32(reloc_count, stdout); - - /* Now print each relocation */ - for (i = 0; i < reloc_count; i++) - write32(relocs[i], stdout); - } else { - /* Print a stop */ - write32(0, stdout); - - /* Now print each relocation */ - for (i = 0; i < reloc_count; i++) { - write32(relocs[i], stdout); - } - } - } -} - -static void usage(void) -{ - die("relocs [--abs-syms|--abs-relocs|--text|--realmode] vmlinux\n"); -} - -int main(int argc, char **argv) -{ - int show_absolute_syms, show_absolute_relocs; - int as_text, use_real_mode; - const char *fname; - FILE *fp; - int i; - - show_absolute_syms = 0; - show_absolute_relocs = 0; - as_text = 0; - use_real_mode = 0; - fname = NULL; - for (i = 1; i < argc; i++) { - char *arg = argv[i]; - if (*arg == '-') { - if (strcmp(arg, "--abs-syms") == 0) { - show_absolute_syms = 1; - continue; - } - if (strcmp(arg, "--abs-relocs") == 0) { - show_absolute_relocs = 1; - continue; - } - if (strcmp(arg, "--text") == 0) { - as_text = 1; - continue; - } - if (strcmp(arg, "--realmode") == 0) { - use_real_mode = 1; - continue; - } - } - else if (!fname) { - fname = arg; - continue; - } - usage(); - } - if (!fname) { - usage(); - } - regex_init(use_real_mode); - fp = fopen(fname, "r"); - if (!fp) { - die("Cannot open %s: %s\n", - fname, strerror(errno)); - } - read_ehdr(fp); - 
read_shdrs(fp); - read_strtabs(fp); - read_symtabs(fp); - read_relocs(fp); - if (show_absolute_syms) { - print_absolute_symbols(); - return 0; - } - if (show_absolute_relocs) { - print_absolute_relocs(); - return 0; - } - emit_relocs(as_text, use_real_mode); - return 0; -} -- cgit v1.2.3 From bf8b88e97716feb750c3d34492f00d9c085e1183 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:45 +0300 Subject: x86, realmode: fixes compilation issue in tboot.c Fixed include path of wakeup.h in tboot.c. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-23-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/tboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 65adda4fde93..f84fe00fad48 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -44,7 +44,7 @@ #include #include -#include "acpi/realmode/wakeup.h" +#include "../realmode/rm/wakeup.h" /* Global pointer to shared data; NULL means no measured launch. */ struct tboot *tboot __read_mostly; -- cgit v1.2.3 From cda846f101fb1396b6924f1d9b68ac3d42de5403 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 8 May 2012 21:22:46 +0300 Subject: x86, realmode: read cr4 and EFER from kernel for 64-bit trampoline This patch changes 64-bit trampoline so that CR4 and EFER are provided by the kernel instead of using fixed values. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336501366-28617-24-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/processor.h | 7 ++++++- arch/x86/include/asm/realmode.h | 8 ++++++-- arch/x86/kernel/realmode.c | 8 ++++++++ arch/x86/kernel/setup.c | 2 ++ arch/x86/realmode/rm/header.S | 1 + arch/x86/realmode/rm/trampoline_64.S | 32 +++++++------------------------- arch/x86/realmode/rm/trampoline_common.S | 19 +++++++++++++++++++ 7 files changed, 49 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 4fa7dcceb6c0..404583ccf0cf 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -544,13 +544,16 @@ static inline void load_sp0(struct tss_struct *tss, * enable), so that any CPU's that boot up * after us can get the correct flags. */ -extern unsigned long mmu_cr4_features; +extern unsigned long mmu_cr4_features; +extern u32 *trampoline_cr4_features; static inline void set_in_cr4(unsigned long mask) { unsigned long cr4; mmu_cr4_features |= mask; + if (trampoline_cr4_features) + *trampoline_cr4_features = mmu_cr4_features; cr4 = read_cr4(); cr4 |= mask; write_cr4(cr4); @@ -561,6 +564,8 @@ static inline void clear_in_cr4(unsigned long mask) unsigned long cr4; mmu_cr4_features &= ~mask; + if (trampoline_cr4_features) + *trampoline_cr4_features = mmu_cr4_features; cr4 = read_cr4(); cr4 &= ~mask; write_cr4(cr4); diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 1421eed1c8e8..937dc6071d76 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -24,18 +24,22 @@ struct real_mode_header { #ifdef CONFIG_X86_32 u32 machine_real_restart_asm; #endif -} __attribute__((__packed__)); +}; /* This must match data at trampoline_32/64.S */ struct trampoline_header { #ifdef CONFIG_X86_32 u32 start; + u16 gdt_pad; u16 gdt_limit; u32 gdt_base; #else u64 start; + u32 cr4; + u32 efer_low; + u32 efer_high; #endif -} __attribute__((__packed__)); +}; extern struct real_mode_header 
*real_mode_header; extern unsigned char real_mode_blob_end[]; diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c index 712fba8fd774..66ac276cf361 100644 --- a/arch/x86/kernel/realmode.c +++ b/arch/x86/kernel/realmode.c @@ -6,6 +6,7 @@ #include struct real_mode_header *real_mode_header; +u32 *trampoline_cr4_features; void __init setup_real_mode(void) { @@ -64,7 +65,14 @@ void __init setup_real_mode(void) trampoline_header->gdt_limit = __BOOT_DS + 7; trampoline_header->gdt_base = __pa(boot_gdt); #else + if (rdmsr_safe(MSR_EFER, &trampoline_header->efer_low, + &trampoline_header->efer_high)) + BUG(); + trampoline_header->start = (u64) secondary_startup_64; + trampoline_cr4_features = &trampoline_header->cr4; + *trampoline_cr4_features = read_cr4(); + trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE; trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 7a14fece9cfc..efcf305210a4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -975,6 +975,8 @@ void __init setup_arch(char **cmdline_p) if (boot_cpu_data.cpuid_level >= 0) { /* A CPU has %cr4 if and only if it has CPUID */ mmu_cr4_features = read_cr4(); + if (trampoline_cr4_features) + *trampoline_cr4_features = mmu_cr4_features; } #ifdef CONFIG_X86_32 diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index b4c32632bf16..4612d5382791 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -9,6 +9,7 @@ .section ".header", "a" + .balign 16 GLOBAL(real_mode_header) .long pa_text_start .long pa_ro_end diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 3f7293239365..66e26f088288 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -34,9 +34,9 @@ #include "realmode.h" .text - .balign PAGE_SIZE .code16 + 
.balign PAGE_SIZE ENTRY(trampoline_start) cli # We should be safe anyway wbinvd @@ -65,8 +65,8 @@ ENTRY(trampoline_start) * to 32 bit. */ - lidtl tidt # load idt with 0, 0 - lgdtl tgdt # load gdt with whatever is appropriate + lidtl tr_idt # load idt with 0, 0 + lgdtl tr_gdt # load gdt with whatever is appropriate movw $__KERNEL_DS, %dx # Data segment descriptor @@ -93,16 +93,17 @@ ENTRY(startup_32) movl %edx, %fs movl %edx, %gs - movl $X86_CR4_PAE, %eax + movl pa_tr_cr4, %eax movl %eax, %cr4 # Enable PAE mode # Setup trampoline 4 level pagetables movl $pa_trampoline_pgd, %eax movl %eax, %cr3 + # Set up EFER + movl pa_tr_efer, %eax + movl pa_tr_efer + 4, %edx movl $MSR_EFER, %ecx - movl $((1 << _EFER_LME) | (1 << _EFER_NX)), %eax # Enable Long Mode - xorl %edx, %edx wrmsr # Enable paging and in turn activate Long Mode @@ -124,23 +125,4 @@ ENTRY(startup_64) # Now jump into the kernel using virtual addresses jmpq *tr_start(%rip) - .section ".rodata","a" - .balign 16 -tidt: - .word 0 # idt limit = 0 - .word 0, 0 # idt base = 0L - - # Duplicate the global descriptor table - # so the kernel can live anywhere - .balign 16 - .globl tgdt -tgdt: - .short tgdt_end - tgdt - 1 # gdt limit - .long pa_tgdt - .short 0 - .quad 0x00cf9b000000ffff # __KERNEL32_CS - .quad 0x00af9b000000ffff # __KERNEL_CS - .quad 0x00cf93000000ffff # __KERNEL_DS -tgdt_end: - #include "trampoline_common.S" diff --git a/arch/x86/realmode/rm/trampoline_common.S b/arch/x86/realmode/rm/trampoline_common.S index c3f951c468c5..cac444b942f8 100644 --- a/arch/x86/realmode/rm/trampoline_common.S +++ b/arch/x86/realmode/rm/trampoline_common.S @@ -1,5 +1,20 @@ .section ".rodata","a" +#ifdef CONFIG_X86_64 + # Duplicate the global descriptor table + # so the kernel can live anywhere + .balign 16 + .globl tr_gdt +tr_gdt: + .short tr_gdt_end - tr_gdt - 1 # gdt limit + .long pa_tr_gdt + .short 0 + .quad 0x00cf9b000000ffff # __KERNEL32_CS + .quad 0x00af9b000000ffff # __KERNEL_CS + .quad 0x00cf93000000ffff # __KERNEL_DS 
+tr_gdt_end: +#endif + .balign 4 tr_idt: .fill 1, 6, 0 @@ -8,12 +23,16 @@ tr_idt: .fill 1, 6, 0 .balign 4 GLOBAL(trampoline_status) .space 4 + .balign 8 GLOBAL(trampoline_header) #ifdef CONFIG_X86_32 tr_start: .space 4 + tr_gdt_pad: .space 2 tr_gdt: .space 6 #else tr_start: .space 8 + GLOBAL(tr_cr4) .space 4 + GLOBAL(tr_efer) .space 8 #endif END(trampoline_header) -- cgit v1.2.3 From 1f0459780c28491c480f7098f3ece79334ccae0a Mon Sep 17 00:00:00 2001 From: Philipp Hahn Date: Wed, 2 May 2012 18:09:35 +0200 Subject: atomic64_32.h: fix parameter naming mismatch The doc string doesn't match the parameter name, fix @p -> @v @ptr -> @v @n -> @i Signed-off-by: Philipp Hahn Signed-off-by: Jiri Kosina --- arch/x86/include/asm/atomic64_32.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 198119910da5..b154de75c90c 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -63,7 +63,7 @@ ATOMIC64_DECL(add_unless); /** * atomic64_cmpxchg - cmpxchg atomic64 variable - * @p: pointer to type atomic64_t + * @v: pointer to type atomic64_t * @o: expected value * @n: new value * @@ -98,7 +98,7 @@ static inline long long atomic64_xchg(atomic64_t *v, long long n) /** * atomic64_set - set atomic64 variable * @v: pointer to type atomic64_t - * @n: value to assign + * @i: value to assign * * Atomically sets the value of @v to @n. */ @@ -200,7 +200,7 @@ static inline long long atomic64_sub(long long i, atomic64_t *v) * atomic64_sub_and_test - subtract value from variable and test result * @i: integer value to subtract * @v: pointer to type atomic64_t - * + * * Atomically subtracts @i from @v and returns * true if the result is zero, or false for all * other cases. 
@@ -224,9 +224,9 @@ static inline void atomic64_inc(atomic64_t *v) /** * atomic64_dec - decrement atomic64 variable - * @ptr: pointer to type atomic64_t + * @v: pointer to type atomic64_t * - * Atomically decrements @ptr by 1. + * Atomically decrements @v by 1. */ static inline void atomic64_dec(atomic64_t *v) { -- cgit v1.2.3 From 57da8b960b9a25646a8ddb5a9c1d0b5978e69bec Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 9 May 2012 08:47:37 +0100 Subject: x86: Avoid double stack traces with show_regs() What was called show_registers() so far already showed a stack trace for kernel faults, and kernel_stack_pointer() isn't even valid to be used for faults from user mode, hence it was pointless for show_regs() to call show_trace() after show_registers(). Simply rename show_registers() to show_regs() and eliminate the old definition. Signed-off-by: Jan Beulich Cc: Linus Torvalds Cc: Andrew Morton Cc: Arjan van de Ven Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/4FAA3D3902000078000826E1@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kdebug.h | 1 - arch/x86/kernel/dumpstack.c | 2 +- arch/x86/kernel/dumpstack_32.c | 2 +- arch/x86/kernel/dumpstack_64.c | 2 +- arch/x86/kernel/kprobes.c | 4 ++-- arch/x86/kernel/nmi.c | 2 +- arch/x86/kernel/process.c | 6 ------ 7 files changed, 6 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index d73f1571bde7..2c37aadcbc35 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -24,7 +24,6 @@ enum die_val { extern void printk_address(unsigned long address, int reliable); extern void die(const char *, struct pt_regs *,long); extern int __must_check __die(const char *, struct pt_regs *, long); -extern void show_registers(struct pt_regs *regs); extern void show_trace(struct task_struct *t, struct pt_regs *regs, unsigned long *sp, unsigned long bp); extern void __show_regs(struct pt_regs *regs, int 
all); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 1b81839b6c88..40989da4bb22 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -271,7 +271,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) return 1; - show_registers(regs); + show_regs(regs); #ifdef CONFIG_X86_32 if (user_mode_vm(regs)) { sp = regs->sp; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 88ec9129271d..e0b1d783daab 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -82,7 +82,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, } -void show_registers(struct pt_regs *regs) +void show_regs(struct pt_regs *regs) { int i; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 17107bd6e1f0..791b76122aa8 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -245,7 +245,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, show_trace_log_lvl(task, regs, sp, bp, log_lvl); } -void show_registers(struct pt_regs *regs) +void show_regs(struct pt_regs *regs) { int i; unsigned long sp; diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index e213fc8408d2..e2f751efb7b1 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -1037,9 +1037,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) "current sp %p does not match saved sp %p\n", stack_addr(regs), kcb->jprobe_saved_sp); printk(KERN_ERR "Saved registers for jprobe %p\n", jp); - show_registers(saved_regs); + show_regs(saved_regs); printk(KERN_ERR "Current registers\n"); - show_registers(regs); + show_regs(regs); BUG(); } *regs = kcb->jprobe_saved_regs; diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 47acaf319165..03c134544966 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -244,7 
+244,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs) pr_emerg( "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n", reason, smp_processor_id()); - show_registers(regs); + show_regs(regs); if (panic_on_io_nmi) panic("NMI IOCK error: Not continuing"); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 1d92a5ab6e8b..856d5bcae5b2 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -105,12 +105,6 @@ void exit_thread(void) } } -void show_regs(struct pt_regs *regs) -{ - show_registers(regs); - show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0); -} - void show_regs_common(void) { const char *vendor, *product, *board; -- cgit v1.2.3 From 94c0dd3278dd3eae52eabf0fb77d472d0dd3e373 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 18 Apr 2012 19:04:17 +0200 Subject: x86/numa: Allow specifying node_distance() for numa=fake Allows emulating more interesting NUMA configurations like a quad socket AMD Magny-Cour: "numa=fake=8:10,16,16,22,16,22,16,22, 16,10,22,16,22,16,22,16, 16,22,10,16,16,22,16,22, 22,16,16,10,22,16,22,16, 16,22,16,22,10,16,16,22, 22,16,22,16,16,10,22,16, 16,22,16,22,16,22,10,16, 22,16,22,16,22,16,16,10" Which has a non-fully-connected topology. 
Signed-off-by: Peter Zijlstra Cc: Tejun Heo Cc: Yinghai Lu Cc: x86@kernel.org Link: http://lkml.kernel.org/n/tip-e1136ef7kdffj7yf9tjhydln@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_emulation.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 53489ff6bf82..871dd8868170 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -339,9 +339,11 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) } else { unsigned long n; - n = simple_strtoul(emu_cmdline, NULL, 0); + n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); } + if (*emu_cmdline == ':') + emu_cmdline++; if (ret < 0) goto no_emu; @@ -418,7 +420,9 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) int physj = emu_nid_to_phys[j]; int dist; - if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) + if (get_option(&emu_cmdline, &dist) == 2) + ; + else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) dist = physi == physj ? LOCAL_DISTANCE : REMOTE_DISTANCE; else -- cgit v1.2.3 From 0acbb440f06302058e1515861dd534594521e892 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 18 Apr 2012 19:04:09 +0200 Subject: x86/numa: Hard partition cpu topology masks on node boundaries When using numa=fake= you can get weird topologies where LLCs can span nodes and other such nonsense. Cure this by hard partitioning these masks on node boundaries. 
Signed-off-by: Peter Zijlstra Cc: Tejun Heo Cc: Yinghai Lu Cc: x86@kernel.org Link: http://lkml.kernel.org/n/tip-di5vwjm96q5vrb76opwuflwx@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6e1e406038c2..edfd03a9e390 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -337,6 +337,11 @@ void __cpuinit set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { struct cpuinfo_x86 *o = &cpu_data(i); +#ifdef CONFIG_NUMA_EMU + if (cpu_to_node(cpu) != cpu_to_node(i)) + continue; +#endif + if (cpu_has(c, X86_FEATURE_TOPOEXT)) { if (c->phys_proc_id == o->phys_proc_id && per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) && @@ -360,11 +365,17 @@ void __cpuinit set_cpu_sibling_map(int cpu) } for_each_cpu(i, cpu_sibling_setup_mask) { +#ifdef CONFIG_NUMA_EMU + if (cpu_to_node(cpu) != cpu_to_node(i)) + continue; +#endif + if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { cpumask_set_cpu(i, cpu_llc_shared_mask(cpu)); cpumask_set_cpu(cpu, cpu_llc_shared_mask(i)); } + if (c->phys_proc_id == cpu_data(i).phys_proc_id) { cpumask_set_cpu(i, cpu_core_mask(cpu)); cpumask_set_cpu(cpu, cpu_core_mask(i)); -- cgit v1.2.3 From ad7687dde8780a0d618a3e3b5a62bb383696fc22 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 May 2012 13:31:47 +0200 Subject: x86/numa: Check for nonsensical topologies on real hw as well Instead of only checking nonsensical topologies on numa-emu, do it on real hardware as well, and print a warning. 
Acked-by: Peter Zijlstra Cc: Tejun Heo Cc: Yinghai Lu Cc: x86@kernel.org Link: http://lkml.kernel.org/n/tip-re15l0jqjtpz709oxozt2zoh@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index edfd03a9e390..7c53d96d44ab 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -337,10 +337,10 @@ void __cpuinit set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { struct cpuinfo_x86 *o = &cpu_data(i); -#ifdef CONFIG_NUMA_EMU - if (cpu_to_node(cpu) != cpu_to_node(i)) + if (cpu_to_node(cpu) != cpu_to_node(i)) { + WARN_ONCE(1, "sched: CPU #%d's thread-sibling CPU #%d not on the same node! [node %d != %d]. Ignoring sibling dependency.\n", cpu, i, cpu_to_node(cpu), cpu_to_node(i)); continue; -#endif + } if (cpu_has(c, X86_FEATURE_TOPOEXT)) { if (c->phys_proc_id == o->phys_proc_id && @@ -365,10 +365,10 @@ void __cpuinit set_cpu_sibling_map(int cpu) } for_each_cpu(i, cpu_sibling_setup_mask) { -#ifdef CONFIG_NUMA_EMU - if (cpu_to_node(cpu) != cpu_to_node(i)) + if (cpu_to_node(cpu) != cpu_to_node(i)) { + WARN_ONCE(1, "sched: CPU #%d's core-sibling CPU #%d not on the same node! [node %d != %d]. Ignoring sibling dependency.\n", cpu, i, cpu_to_node(cpu), cpu_to_node(i)); continue; -#endif + } if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { -- cgit v1.2.3 From cb83b629bae0327cf9f44f096adc38d150ceb913 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Apr 2012 15:49:36 +0200 Subject: sched/numa: Rewrite the CONFIG_NUMA sched domain support The current code groups up to 16 nodes in a level and then puts an ALLNODES domain spanning the entire tree on top of that. This doesn't reflect the numa topology and esp for the smaller not-fully-connected machines out there today this might make a difference. 
Therefore, build a proper numa topology based on node_distance(). Since there are no fixed numa layers anymore, the static SD_NODE_INIT and SD_ALLNODES_INIT aren't usable anymore; the new code tries to construct something similar and scales some values based on the number of cpus in the domain and/or the node_distance() ratio. Signed-off-by: Peter Zijlstra Cc: Anton Blanchard Cc: Benjamin Herrenschmidt Cc: Chris Metcalf Cc: David Howells Cc: "David S. Miller" Cc: Fenghua Yu Cc: "H. Peter Anvin" Cc: Ivan Kokshaysky Cc: linux-alpha@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-mips@linux-mips.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-sh@vger.kernel.org Cc: Matt Turner Cc: Paul Mackerras Cc: Paul Mundt Cc: Ralf Baechle Cc: Richard Henderson Cc: sparclinux@vger.kernel.org Cc: Tony Luck Cc: x86@kernel.org Cc: Dimitri Sivanich Cc: Greg Pearson Cc: KAMEZAWA Hiroyuki Cc: bob.picco@oracle.com Cc: chris.mason@oracle.com Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-r74n3n8hhuc2ynbrnp3vt954@git.kernel.org Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/topology.h | 25 --- arch/mips/include/asm/mach-ip27/topology.h | 17 -- arch/powerpc/include/asm/topology.h | 36 ---- arch/sh/include/asm/topology.h | 25 --- arch/sparc/include/asm/topology_64.h | 19 -- arch/tile/include/asm/topology.h | 26 --- arch/x86/include/asm/topology.h | 38 ---- include/linux/topology.h | 37 ---- kernel/sched/core.c | 280 +++++++++++++++++++---------- 9 files changed, 185 insertions(+), 318 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 09f646753d1a..a2496e449b75 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -70,31 +70,6 @@ void build_cpu_to_node_map(void); .nr_balance_failed = 0, \ } -/* sched_domains SD_NODE_INIT for IA64 NUMA machines */ -#define SD_NODE_INIT (struct sched_domain) { \ - .parent = NULL, \ -
.child = NULL, \ - .groups = NULL, \ - .min_interval = 8, \ - .max_interval = 8*(min(num_online_cpus(), 32U)), \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 2, \ - .busy_idx = 3, \ - .idle_idx = 2, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_NEWIDLE \ - | SD_BALANCE_EXEC \ - | SD_BALANCE_FORK \ - | SD_SERIALIZE, \ - .last_balance = jiffies, \ - .balance_interval = 64, \ - .nr_balance_failed = 0, \ -} - #endif /* CONFIG_NUMA */ #ifdef CONFIG_SMP diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h index 1b1a7d1632b9..b2cf641f206f 100644 --- a/arch/mips/include/asm/mach-ip27/topology.h +++ b/arch/mips/include/asm/mach-ip27/topology.h @@ -36,23 +36,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES]; #define node_distance(from, to) (__node_distances[(from)][(to)]) -/* sched_domains SD_NODE_INIT for SGI IP27 machines */ -#define SD_NODE_INIT (struct sched_domain) { \ - .parent = NULL, \ - .child = NULL, \ - .groups = NULL, \ - .min_interval = 8, \ - .max_interval = 32, \ - .busy_factor = 32, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 1, \ - .flags = SD_LOAD_BALANCE | \ - SD_BALANCE_EXEC, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .nr_balance_failed = 0, \ -} - #include #endif /* _ASM_MACH_TOPOLOGY_H */ diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index c97185885c6d..852ed1b384f6 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -18,12 +18,6 @@ struct device_node; */ #define RECLAIM_DISTANCE 10 -/* - * Avoid creating an extra level of balancing (SD_ALLNODES) on the largest - * POWER7 boxes which have a maximum of 32 nodes. 
- */ -#define SD_NODES_PER_DOMAIN 32 - #include static inline int cpu_to_node(int cpu) @@ -51,36 +45,6 @@ static inline int pcibus_to_node(struct pci_bus *bus) cpu_all_mask : \ cpumask_of_node(pcibus_to_node(bus))) -/* sched_domains SD_NODE_INIT for PPC64 machines */ -#define SD_NODE_INIT (struct sched_domain) { \ - .min_interval = 8, \ - .max_interval = 32, \ - .busy_factor = 32, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 1, \ - .busy_idx = 3, \ - .idle_idx = 1, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - \ - .flags = 1*SD_LOAD_BALANCE \ - | 0*SD_BALANCE_NEWIDLE \ - | 1*SD_BALANCE_EXEC \ - | 1*SD_BALANCE_FORK \ - | 0*SD_BALANCE_WAKE \ - | 1*SD_WAKE_AFFINE \ - | 0*SD_PREFER_LOCAL \ - | 0*SD_SHARE_CPUPOWER \ - | 0*SD_POWERSAVINGS_BALANCE \ - | 0*SD_SHARE_PKG_RESOURCES \ - | 1*SD_SERIALIZE \ - | 0*SD_PREFER_SIBLING \ - , \ - .last_balance = jiffies, \ - .balance_interval = 1, \ -} - extern int __node_distance(int, int); #define node_distance(a, b) __node_distance(a, b) diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h index 88e734069fa6..b0a282d65f6a 100644 --- a/arch/sh/include/asm/topology.h +++ b/arch/sh/include/asm/topology.h @@ -3,31 +3,6 @@ #ifdef CONFIG_NUMA -/* sched_domains SD_NODE_INIT for sh machines */ -#define SD_NODE_INIT (struct sched_domain) { \ - .parent = NULL, \ - .child = NULL, \ - .groups = NULL, \ - .min_interval = 8, \ - .max_interval = 32, \ - .busy_factor = 32, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 2, \ - .busy_idx = 3, \ - .idle_idx = 2, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_FORK \ - | SD_BALANCE_EXEC \ - | SD_BALANCE_NEWIDLE \ - | SD_SERIALIZE, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .nr_balance_failed = 0, \ -} - #define cpu_to_node(cpu) ((void)(cpu),0) #define parent_node(node) ((void)(node),0) diff --git a/arch/sparc/include/asm/topology_64.h 
b/arch/sparc/include/asm/topology_64.h index 8b9c556d630b..1754390a426f 100644 --- a/arch/sparc/include/asm/topology_64.h +++ b/arch/sparc/include/asm/topology_64.h @@ -31,25 +31,6 @@ static inline int pcibus_to_node(struct pci_bus *pbus) cpu_all_mask : \ cpumask_of_node(pcibus_to_node(bus))) -#define SD_NODE_INIT (struct sched_domain) { \ - .min_interval = 8, \ - .max_interval = 32, \ - .busy_factor = 32, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 2, \ - .busy_idx = 3, \ - .idle_idx = 2, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_FORK \ - | SD_BALANCE_EXEC \ - | SD_SERIALIZE, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ -} - #else /* CONFIG_NUMA */ #include diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h index 6fdd0c860193..7a7ce390534f 100644 --- a/arch/tile/include/asm/topology.h +++ b/arch/tile/include/asm/topology.h @@ -78,32 +78,6 @@ static inline const struct cpumask *cpumask_of_node(int node) .balance_interval = 32, \ } -/* sched_domains SD_NODE_INIT for TILE architecture */ -#define SD_NODE_INIT (struct sched_domain) { \ - .min_interval = 16, \ - .max_interval = 512, \ - .busy_factor = 32, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 1, \ - .busy_idx = 3, \ - .idle_idx = 1, \ - .newidle_idx = 2, \ - .wake_idx = 1, \ - .flags = 1*SD_LOAD_BALANCE \ - | 1*SD_BALANCE_NEWIDLE \ - | 1*SD_BALANCE_EXEC \ - | 1*SD_BALANCE_FORK \ - | 0*SD_BALANCE_WAKE \ - | 0*SD_WAKE_AFFINE \ - | 0*SD_PREFER_LOCAL \ - | 0*SD_SHARE_CPUPOWER \ - | 0*SD_SHARE_PKG_RESOURCES \ - | 1*SD_SERIALIZE \ - , \ - .last_balance = jiffies, \ - .balance_interval = 128, \ -} - /* By definition, we create nodes based on online memory. 
*/ #define node_has_online_mem(nid) 1 diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index b9676ae37ada..095b21507b6a 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -92,44 +92,6 @@ extern void setup_node_to_cpumask_map(void); #define pcibus_to_node(bus) __pcibus_to_node(bus) -#ifdef CONFIG_X86_32 -# define SD_CACHE_NICE_TRIES 1 -# define SD_IDLE_IDX 1 -#else -# define SD_CACHE_NICE_TRIES 2 -# define SD_IDLE_IDX 2 -#endif - -/* sched_domains SD_NODE_INIT for NUMA machines */ -#define SD_NODE_INIT (struct sched_domain) { \ - .min_interval = 8, \ - .max_interval = 32, \ - .busy_factor = 32, \ - .imbalance_pct = 125, \ - .cache_nice_tries = SD_CACHE_NICE_TRIES, \ - .busy_idx = 3, \ - .idle_idx = SD_IDLE_IDX, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ - \ - .flags = 1*SD_LOAD_BALANCE \ - | 1*SD_BALANCE_NEWIDLE \ - | 1*SD_BALANCE_EXEC \ - | 1*SD_BALANCE_FORK \ - | 0*SD_BALANCE_WAKE \ - | 1*SD_WAKE_AFFINE \ - | 0*SD_PREFER_LOCAL \ - | 0*SD_SHARE_CPUPOWER \ - | 0*SD_POWERSAVINGS_BALANCE \ - | 0*SD_SHARE_PKG_RESOURCES \ - | 1*SD_SERIALIZE \ - | 0*SD_PREFER_SIBLING \ - , \ - .last_balance = jiffies, \ - .balance_interval = 1, \ -} - extern int __node_distance(int, int); #define node_distance(a, b) __node_distance(a, b) diff --git a/include/linux/topology.h b/include/linux/topology.h index e26db031303b..4f59bf36f0af 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -70,7 +70,6 @@ int arch_update_cpu_topology(void); * Below are the 3 major initializers used in building sched_domains: * SD_SIBLING_INIT, for SMT domains * SD_CPU_INIT, for SMP domains - * SD_NODE_INIT, for NUMA domains * * Any architecture that cares to do any tuning to these values should do so * by defining their own arch-specific initializer in include/asm/topology.h. 
@@ -176,48 +175,12 @@ int arch_update_cpu_topology(void); } #endif -/* sched_domains SD_ALLNODES_INIT for NUMA machines */ -#define SD_ALLNODES_INIT (struct sched_domain) { \ - .min_interval = 64, \ - .max_interval = 64*num_online_cpus(), \ - .busy_factor = 128, \ - .imbalance_pct = 133, \ - .cache_nice_tries = 1, \ - .busy_idx = 3, \ - .idle_idx = 3, \ - .flags = 1*SD_LOAD_BALANCE \ - | 1*SD_BALANCE_NEWIDLE \ - | 0*SD_BALANCE_EXEC \ - | 0*SD_BALANCE_FORK \ - | 0*SD_BALANCE_WAKE \ - | 0*SD_WAKE_AFFINE \ - | 0*SD_SHARE_CPUPOWER \ - | 0*SD_POWERSAVINGS_BALANCE \ - | 0*SD_SHARE_PKG_RESOURCES \ - | 1*SD_SERIALIZE \ - | 0*SD_PREFER_SIBLING \ - , \ - .last_balance = jiffies, \ - .balance_interval = 64, \ -} - -#ifndef SD_NODES_PER_DOMAIN -#define SD_NODES_PER_DOMAIN 16 -#endif - #ifdef CONFIG_SCHED_BOOK #ifndef SD_BOOK_INIT #error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!! #endif #endif /* CONFIG_SCHED_BOOK */ -#ifdef CONFIG_NUMA -#ifndef SD_NODE_INIT -#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!! 
-#endif - -#endif /* CONFIG_NUMA */ - #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DECLARE_PER_CPU(int, numa_node); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6001e5c3b4e4..b4f2096980a3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5560,7 +5560,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (cpumask_intersects(groupmask, sched_group_cpus(group))) { + if (!(sd->flags & SD_OVERLAP) && + cpumask_intersects(groupmask, sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; @@ -5898,92 +5899,6 @@ static int __init isolated_cpu_setup(char *str) __setup("isolcpus=", isolated_cpu_setup); -#ifdef CONFIG_NUMA - -/** - * find_next_best_node - find the next node to include in a sched_domain - * @node: node whose sched_domain we're building - * @used_nodes: nodes already in the sched_domain - * - * Find the next node to include in a given scheduling domain. Simply - * finds the closest node not already in the @used_nodes map. - * - * Should use nodemask_t. - */ -static int find_next_best_node(int node, nodemask_t *used_nodes) -{ - int i, n, val, min_val, best_node = -1; - - min_val = INT_MAX; - - for (i = 0; i < nr_node_ids; i++) { - /* Start at @node */ - n = (node + i) % nr_node_ids; - - if (!nr_cpus_node(n)) - continue; - - /* Skip already used nodes */ - if (node_isset(n, *used_nodes)) - continue; - - /* Simple min distance search */ - val = node_distance(node, n); - - if (val < min_val) { - min_val = val; - best_node = n; - } - } - - if (best_node != -1) - node_set(best_node, *used_nodes); - return best_node; -} - -/** - * sched_domain_node_span - get a cpumask for a node's sched_domain - * @node: node whose cpumask we're constructing - * @span: resulting cpumask - * - * Given a node, construct a good cpumask for its sched_domain to span. It - * should be one that prevents unnecessary balancing, but also spreads tasks - * out optimally. 
- */ -static void sched_domain_node_span(int node, struct cpumask *span) -{ - nodemask_t used_nodes; - int i; - - cpumask_clear(span); - nodes_clear(used_nodes); - - cpumask_or(span, span, cpumask_of_node(node)); - node_set(node, used_nodes); - - for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { - int next_node = find_next_best_node(node, &used_nodes); - if (next_node < 0) - break; - cpumask_or(span, span, cpumask_of_node(next_node)); - } -} - -static const struct cpumask *cpu_node_mask(int cpu) -{ - lockdep_assert_held(&sched_domains_mutex); - - sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); - - return sched_domains_tmpmask; -} - -static const struct cpumask *cpu_allnodes_mask(int cpu) -{ - return cpu_possible_mask; -} -#endif /* CONFIG_NUMA */ - static const struct cpumask *cpu_cpu_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); @@ -6020,6 +5935,7 @@ struct sched_domain_topology_level { sched_domain_init_f init; sched_domain_mask_f mask; int flags; + int numa_level; struct sd_data data; }; @@ -6213,10 +6129,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ } SD_INIT_FUNC(CPU) -#ifdef CONFIG_NUMA - SD_INIT_FUNC(ALLNODES) - SD_INIT_FUNC(NODE) -#endif #ifdef CONFIG_SCHED_SMT SD_INIT_FUNC(SIBLING) #endif @@ -6338,15 +6250,191 @@ static struct sched_domain_topology_level default_topology[] = { { sd_init_BOOK, cpu_book_mask, }, #endif { sd_init_CPU, cpu_cpu_mask, }, -#ifdef CONFIG_NUMA - { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, - { sd_init_ALLNODES, cpu_allnodes_mask, }, -#endif { NULL, }, }; static struct sched_domain_topology_level *sched_domain_topology = default_topology; +#ifdef CONFIG_NUMA + +static int sched_domains_numa_levels; +static int sched_domains_numa_scale; +static int *sched_domains_numa_distance; +static struct cpumask ***sched_domains_numa_masks; +static int sched_domains_curr_level; + +static inline unsigned long numa_scale(unsigned long x, int level) +{ + return x * 
sched_domains_numa_distance[level] / sched_domains_numa_scale; +} + +static inline int sd_local_flags(int level) +{ + if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) + return 0; + + return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; +} + +static struct sched_domain * +sd_numa_init(struct sched_domain_topology_level *tl, int cpu) +{ + struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); + int level = tl->numa_level; + int sd_weight = cpumask_weight( + sched_domains_numa_masks[level][cpu_to_node(cpu)]); + + *sd = (struct sched_domain){ + .min_interval = sd_weight, + .max_interval = 2*sd_weight, + .busy_factor = 32, + .imbalance_pct = 100 + numa_scale(25, level), + .cache_nice_tries = 2, + .busy_idx = 3, + .idle_idx = 2, + .newidle_idx = 0, + .wake_idx = 0, + .forkexec_idx = 0, + + .flags = 1*SD_LOAD_BALANCE + | 1*SD_BALANCE_NEWIDLE + | 0*SD_BALANCE_EXEC + | 0*SD_BALANCE_FORK + | 0*SD_BALANCE_WAKE + | 0*SD_WAKE_AFFINE + | 0*SD_PREFER_LOCAL + | 0*SD_SHARE_CPUPOWER + | 0*SD_POWERSAVINGS_BALANCE + | 0*SD_SHARE_PKG_RESOURCES + | 1*SD_SERIALIZE + | 0*SD_PREFER_SIBLING + | sd_local_flags(level) + , + .last_balance = jiffies, + .balance_interval = sd_weight, + }; + SD_INIT_NAME(sd, NUMA); + sd->private = &tl->data; + + /* + * Ugly hack to pass state to sd_numa_mask()... + */ + sched_domains_curr_level = tl->numa_level; + + return sd; +} + +static const struct cpumask *sd_numa_mask(int cpu) +{ + return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; +} + +static void sched_init_numa(void) +{ + int next_distance, curr_distance = node_distance(0, 0); + struct sched_domain_topology_level *tl; + int level = 0; + int i, j, k; + + sched_domains_numa_scale = curr_distance; + sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); + if (!sched_domains_numa_distance) + return; + + /* + * O(nr_nodes^2) deduplicating selection sort -- in order to find the + * unique distances in the node_distance() table. 
+ * + * Assumes node_distance(0,j) includes all distances in + * node_distance(i,j) in order to avoid cubic time. + * + * XXX: could be optimized to O(n log n) by using sort() + */ + next_distance = curr_distance; + for (i = 0; i < nr_node_ids; i++) { + for (j = 0; j < nr_node_ids; j++) { + int distance = node_distance(0, j); + if (distance > curr_distance && + (distance < next_distance || + next_distance == curr_distance)) + next_distance = distance; + } + if (next_distance != curr_distance) { + sched_domains_numa_distance[level++] = next_distance; + sched_domains_numa_levels = level; + curr_distance = next_distance; + } else break; + } + /* + * 'level' contains the number of unique distances, excluding the + * identity distance node_distance(i,i). + * + * The sched_domains_numa_distance[] array includes the actual distance + * numbers. + */ + + sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); + if (!sched_domains_numa_masks) + return; + + /* + * Now for each level, construct a mask per node which contains all + * cpus of nodes that are that many hops away from us. + */ + for (i = 0; i < level; i++) { + sched_domains_numa_masks[i] = + kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); + if (!sched_domains_numa_masks[i]) + return; + + for (j = 0; j < nr_node_ids; j++) { + struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); + if (!mask) + return; + + sched_domains_numa_masks[i][j] = mask; + + for (k = 0; k < nr_node_ids; k++) { + if (node_distance(cpu_to_node(j), k) > + sched_domains_numa_distance[i]) + continue; + + cpumask_or(mask, mask, cpumask_of_node(k)); + } + } + } + + tl = kzalloc((ARRAY_SIZE(default_topology) + level) * + sizeof(struct sched_domain_topology_level), GFP_KERNEL); + if (!tl) + return; + + /* + * Copy the default topology bits.. + */ + for (i = 0; default_topology[i].init; i++) + tl[i] = default_topology[i]; + + /* + * .. and append 'j' levels of NUMA goodness.
+ */ + for (j = 0; j < level; i++, j++) { + tl[i] = (struct sched_domain_topology_level){ + .init = sd_numa_init, + .mask = sd_numa_mask, + .flags = SDTL_OVERLAP, + .numa_level = j, + }; + } + + sched_domain_topology = tl; +} +#else +static inline void sched_init_numa(void) +{ +} +#endif /* CONFIG_NUMA */ + static int __sdt_alloc(const struct cpumask *cpu_map) { struct sched_domain_topology_level *tl; @@ -6840,6 +6928,8 @@ void __init sched_init_smp(void) alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); + sched_init_numa(); + get_online_cpus(); mutex_lock(&sched_domains_mutex); init_sched_domains(cpu_active_mask); -- cgit v1.2.3 From c75841a398d667d9968245b9519d93cedbfb4780 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Apr 2012 20:19:07 +0200 Subject: perf/x86-ibs: Fix update of period The last sw period was not correctly updated on overflow and thus led to wrong distribution of events. We always need to properly initialize data.period in struct perf_sample_data. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333390758-10893-2-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 8ff74d439041..c8f69bea6624 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -386,7 +386,21 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) if (!(*buf++ & perf_ibs->valid_mask)) return 0; + /* + * Emulate IbsOpCurCnt in MSRC001_1033 (IbsOpCtl), not + * supported in all cpus. As this triggered an interrupt, we + * set the current count to the max count. 
+ */ + config = ibs_data.regs[0]; + if (perf_ibs == &perf_ibs_op && !(ibs_caps & IBS_CAPS_RDWROPCNT)) { + config &= ~IBS_OP_CUR_CNT; + config |= (config & IBS_OP_MAX_CNT) << 36; + } + + perf_ibs_event_update(perf_ibs, event, config); perf_sample_data_init(&data, 0); + data.period = event->hw.last_period; + if (event->attr.sample_type & PERF_SAMPLE_RAW) { ibs_data.caps = ibs_caps; size = 1; @@ -405,19 +419,6 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) regs = *iregs; /* XXX: update ip from ibs sample */ - /* - * Emulate IbsOpCurCnt in MSRC001_1033 (IbsOpCtl), not - * supported in all cpus. As this triggered an interrupt, we - * set the current count to the max count. - */ - config = ibs_data.regs[0]; - if (perf_ibs == &perf_ibs_op && !(ibs_caps & IBS_CAPS_RDWROPCNT)) { - config &= ~IBS_OP_CUR_CNT; - config |= (config & IBS_OP_MAX_CNT) << 36; - } - - perf_ibs_event_update(perf_ibs, event, config); - overflow = perf_ibs_set_period(perf_ibs, hwc, &config); reenable = !(overflow && perf_event_overflow(event, &data, &regs)); config = (config >> 4) | (reenable ? perf_ibs->enable_mask : 0); -- cgit v1.2.3 From fd0d000b2c34aa43d4e92dcf0dfaeda7e123008a Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Apr 2012 20:19:08 +0200 Subject: perf: Pass last sampling period to perf_sample_data_init() We always need to pass the last sample period to perf_sample_data_init(), otherwise the event distribution will be wrong. Thus, modifying the function interface with the required period as argument. So basically a pattern like this: perf_sample_data_init(&data, ~0ULL); data.period = event->hw.last_period; will now be like that: perf_sample_data_init(&data, ~0ULL, event->hw.last_period); Avoids uninitialized data.period and simplifies code.
Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333390758-10893-3-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/alpha/kernel/perf_event.c | 3 +-- arch/arm/kernel/perf_event_v6.c | 4 +--- arch/arm/kernel/perf_event_v7.c | 4 +--- arch/arm/kernel/perf_event_xscale.c | 8 ++------ arch/mips/kernel/perf_event_mipsxx.c | 2 +- arch/powerpc/perf/core-book3s.c | 3 +-- arch/powerpc/perf/core-fsl-emb.c | 3 +-- arch/sparc/kernel/perf_event.c | 4 +--- arch/x86/kernel/cpu/perf_event.c | 4 +--- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 3 +-- arch/x86/kernel/cpu/perf_event_intel.c | 4 +--- arch/x86/kernel/cpu/perf_event_intel_ds.c | 6 ++---- arch/x86/kernel/cpu/perf_event_p4.c | 6 +++--- include/linux/perf_event.h | 5 ++++- kernel/events/core.c | 9 ++++----- 15 files changed, 25 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c index 0dae252f7a33..d821b17047e0 100644 --- a/arch/alpha/kernel/perf_event.c +++ b/arch/alpha/kernel/perf_event.c @@ -824,7 +824,6 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr, idx = la_ptr; - perf_sample_data_init(&data, 0); for (j = 0; j < cpuc->n_events; j++) { if (cpuc->current_idx[j] == idx) break; @@ -848,7 +847,7 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr, hwc = &event->hw; alpha_perf_event_update(event, hwc, idx, alpha_pmu->pmc_max_period[idx]+1); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, hwc->last_period); if (alpha_perf_event_set_period(event, hwc, idx)) { if (perf_event_overflow(event, &data, regs)) { diff --git a/arch/arm/kernel/perf_event_v6.c b/arch/arm/kernel/perf_event_v6.c index b78af0cc6ef3..ab627a740fa3 100644 --- a/arch/arm/kernel/perf_event_v6.c +++ b/arch/arm/kernel/perf_event_v6.c @@ -489,8 +489,6 @@ armv6pmu_handle_irq(int irq_num, */ armv6_pmcr_write(pmcr); - perf_sample_data_init(&data, 0); - cpuc = 
&__get_cpu_var(cpu_hw_events); for (idx = 0; idx < cpu_pmu->num_events; ++idx) { struct perf_event *event = cpuc->events[idx]; @@ -509,7 +507,7 @@ armv6pmu_handle_irq(int irq_num, hwc = &event->hw; armpmu_event_update(event, hwc, idx); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, hwc->last_period); if (!armpmu_event_set_period(event, hwc, idx)) continue; diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c index 00755d82e2f2..d3c536068162 100644 --- a/arch/arm/kernel/perf_event_v7.c +++ b/arch/arm/kernel/perf_event_v7.c @@ -1077,8 +1077,6 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev) */ regs = get_irq_regs(); - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx < cpu_pmu->num_events; ++idx) { struct perf_event *event = cpuc->events[idx]; @@ -1097,7 +1095,7 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev) hwc = &event->hw; armpmu_event_update(event, hwc, idx); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, hwc->last_period); if (!armpmu_event_set_period(event, hwc, idx)) continue; diff --git a/arch/arm/kernel/perf_event_xscale.c b/arch/arm/kernel/perf_event_xscale.c index 71a21e6712f5..e34e7254e652 100644 --- a/arch/arm/kernel/perf_event_xscale.c +++ b/arch/arm/kernel/perf_event_xscale.c @@ -248,8 +248,6 @@ xscale1pmu_handle_irq(int irq_num, void *dev) regs = get_irq_regs(); - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx < cpu_pmu->num_events; ++idx) { struct perf_event *event = cpuc->events[idx]; @@ -263,7 +261,7 @@ xscale1pmu_handle_irq(int irq_num, void *dev) hwc = &event->hw; armpmu_event_update(event, hwc, idx); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, hwc->last_period); if (!armpmu_event_set_period(event, hwc, idx)) continue; @@ -588,8 +586,6 @@ xscale2pmu_handle_irq(int irq_num, void *dev) regs = get_irq_regs(); - 
perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx < cpu_pmu->num_events; ++idx) { struct perf_event *event = cpuc->events[idx]; @@ -603,7 +599,7 @@ xscale2pmu_handle_irq(int irq_num, void *dev) hwc = &event->hw; armpmu_event_update(event, hwc, idx); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, hwc->last_period); if (!armpmu_event_set_period(event, hwc, idx)) continue; diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c index 811084f4e422..ab73fa2fb9b5 100644 --- a/arch/mips/kernel/perf_event_mipsxx.c +++ b/arch/mips/kernel/perf_event_mipsxx.c @@ -1325,7 +1325,7 @@ static int mipsxx_pmu_handle_shared_irq(void) regs = get_irq_regs(); - perf_sample_data_init(&data, 0); + perf_sample_data_init(&data, 0, 0); switch (counters) { #define HANDLE_COUNTER(n) \ diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 02aee03e713c..8f84bcba18da 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -1299,8 +1299,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, if (record) { struct perf_sample_data data; - perf_sample_data_init(&data, ~0ULL); - data.period = event->hw.last_period; + perf_sample_data_init(&data, ~0ULL, event->hw.last_period); if (event->attr.sample_type & PERF_SAMPLE_ADDR) perf_get_data_addr(regs, &data.addr); diff --git a/arch/powerpc/perf/core-fsl-emb.c b/arch/powerpc/perf/core-fsl-emb.c index 0a6d2a9d569c..106c53354675 100644 --- a/arch/powerpc/perf/core-fsl-emb.c +++ b/arch/powerpc/perf/core-fsl-emb.c @@ -613,8 +613,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, if (record) { struct perf_sample_data data; - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); if (perf_event_overflow(event, &data, regs)) fsl_emb_pmu_stop(event, 0); diff --git 
a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 28559ce5eeb5..5713957dcb8a 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1296,8 +1296,6 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self, regs = args->regs; - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); /* If the PMU has the TOE IRQ enable bits, we need to do a @@ -1321,7 +1319,7 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self, if (val & (1ULL << 31)) continue; - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, hwc->last_period); if (!sparc_perf_event_set_period(event, hwc, idx)) continue; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e33e9cf160eb..e049d6da0183 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1183,8 +1183,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); /* @@ -1219,7 +1217,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) * event overflow */ handled++; - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); if (!x86_perf_event_set_period(event)) continue; diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index c8f69bea6624..2317228b5299 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -398,8 +398,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) } perf_ibs_event_update(perf_ibs, event, config); - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, hwc->last_period); if (event->attr.sample_type & PERF_SAMPLE_RAW) { ibs_data.caps = ibs_caps; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c 
b/arch/x86/kernel/cpu/perf_event_intel.c index 26b3e2fef104..166546ec6aef 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1027,8 +1027,6 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) u64 status; int handled; - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); /* @@ -1082,7 +1080,7 @@ again: if (!intel_pmu_save_and_restart(event)) continue; - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); if (has_branch_stack(event)) data.br_stack = &cpuc->lbr_stack; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 7f64df19e7dd..5a3edc27f6e5 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -316,8 +316,7 @@ int intel_pmu_drain_bts_buffer(void) ds->bts_index = ds->bts_buffer_base; - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); regs.ip = 0; /* @@ -564,8 +563,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, if (!intel_pmu_save_and_restart(event)) return; - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); /* * We use the interrupt regs as a base because the PEBS record diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index a2dfacfd7103..47124a73dd73 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -1005,8 +1005,6 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx < x86_pmu.num_counters; idx++) { @@ -1034,10 +1032,12 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) handled += overflow; /* event overflow for sure */ - data.period = event->hw.last_period; + 
perf_sample_data_init(&data, 0, hwc->last_period); if (!x86_perf_event_set_period(event)) continue; + + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ddbb6a901f65..f32578634d9d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1132,11 +1132,14 @@ struct perf_sample_data { struct perf_branch_stack *br_stack; }; -static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr) +static inline void perf_sample_data_init(struct perf_sample_data *data, + u64 addr, u64 period) { + /* remaining struct members initialized in perf_prepare_sample() */ data->addr = addr; data->raw = NULL; data->br_stack = NULL; + data->period = period; } extern void perf_output_sample(struct perf_output_handle *handle, diff --git a/kernel/events/core.c b/kernel/events/core.c index 9789a56b7d54..00c58df9f4e2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4957,7 +4957,7 @@ void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) if (rctx < 0) return; - perf_sample_data_init(&data, addr); + perf_sample_data_init(&data, addr, 0); do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); @@ -5215,7 +5215,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, .data = record, }; - perf_sample_data_init(&data, addr); + perf_sample_data_init(&data, addr, 0); data.raw = &raw; hlist_for_each_entry_rcu(event, node, head, hlist_entry) { @@ -5318,7 +5318,7 @@ void perf_bp_event(struct perf_event *bp, void *data) struct perf_sample_data sample; struct pt_regs *regs = data; - perf_sample_data_init(&sample, bp->attr.bp_addr); + perf_sample_data_init(&sample, bp->attr.bp_addr, 0); if (!bp->hw.state && !perf_exclude_event(bp, regs)) perf_swevent_event(bp, 1, &sample, regs); @@ -5344,8 +5344,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) event->pmu->read(event); - 
perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); regs = get_irq_regs(); if (regs && !perf_exclude_event(event, regs)) { -- cgit v1.2.3 From 7bf352384fda3f678a283928c6c5b2cd9da877e4 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Apr 2012 20:19:09 +0200 Subject: perf/x86-ibs: Enable ibs op micro-ops counting mode Allow enabling ibs op micro-ops counting mode. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333390758-10893-4-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 2317228b5299..ebf169fe40ef 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -468,6 +468,8 @@ static __init int perf_event_ibs_init(void) return -ENODEV; /* ibs not supported by the cpu */ perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); + if (ibs_caps & IBS_CAPS_OPCNT) + perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); -- cgit v1.2.3 From 6accb9cf76080422d400a641d9068b6b2a2c216f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Apr 2012 20:19:10 +0200 Subject: perf/x86-ibs: Fix frequency profiling Fixing profiling at a fixed frequency, in this case the freq value and sample period was setup incorrectly. Since sampling periods are adjusted we also allow periods that have lower 4 bits set. Another fix is the setup of the hw counter: If we modify hwc->sample_period, we also need to update hwc->last_period and hwc->period_left. 
Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333390758-10893-5-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index ebf169fe40ef..bc401bd9f14a 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -162,9 +162,16 @@ static int perf_ibs_init(struct perf_event *event) if (config & perf_ibs->cnt_mask) /* raw max_cnt may not be set */ return -EINVAL; - if (hwc->sample_period & 0x0f) - /* lower 4 bits can not be set in ibs max cnt */ + if (!event->attr.sample_freq && hwc->sample_period & 0x0f) + /* + * lower 4 bits can not be set in ibs max cnt, + * but allowing it in case we adjust the + * sample period to set a frequency. + */ return -EINVAL; + hwc->sample_period &= ~0x0FULL; + if (!hwc->sample_period) + hwc->sample_period = 0x10; } else { max_cnt = config & perf_ibs->cnt_mask; config &= ~perf_ibs->cnt_mask; @@ -175,6 +182,13 @@ static int perf_ibs_init(struct perf_event *event) if (!hwc->sample_period) return -EINVAL; + /* + * If we modify hwc->sample_period, we also need to update + * hwc->last_period and hwc->period_left. + */ + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + hwc->config_base = perf_ibs->msr; hwc->config = config; -- cgit v1.2.3 From d47e8238cd76f1ffa7c8cd30e08b8e9074fd597e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Apr 2012 20:19:11 +0200 Subject: perf/x86-ibs: Take instruction pointer from ibs sample Each IBS sample contains a linear address of the instruction that caused the sample to trigger. This address is more precise than the rip that was taken from the interrupt handler's stack. Update the rip with that address. 
We use this in the next patch to implement precise-event sampling on AMD systems using IBS. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333390758-10893-6-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 6 ++-- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 48 +++++++++++++++++++++----------- 2 files changed, 35 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 8a3c75d824b7..4e40a64315c9 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -158,6 +158,7 @@ struct x86_pmu_capability { #define IBS_CAPS_OPCNT (1U<<4) #define IBS_CAPS_BRNTRGT (1U<<5) #define IBS_CAPS_OPCNTEXT (1U<<6) +#define IBS_CAPS_RIPINVALIDCHK (1U<<7) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ @@ -170,14 +171,14 @@ struct x86_pmu_capability { #define IBSCTL_LVT_OFFSET_VALID (1ULL<<8) #define IBSCTL_LVT_OFFSET_MASK 0x0F -/* IbsFetchCtl bits/masks */ +/* ibs fetch bits/masks */ #define IBS_FETCH_RAND_EN (1ULL<<57) #define IBS_FETCH_VAL (1ULL<<49) #define IBS_FETCH_ENABLE (1ULL<<48) #define IBS_FETCH_CNT 0xFFFF0000ULL #define IBS_FETCH_MAX_CNT 0x0000FFFFULL -/* IbsOpCtl bits */ +/* ibs op bits/masks */ /* lower 4 bits of the current count are ignored: */ #define IBS_OP_CUR_CNT (0xFFFF0ULL<<32) #define IBS_OP_CNT_CTL (1ULL<<19) @@ -185,6 +186,7 @@ struct x86_pmu_capability { #define IBS_OP_ENABLE (1ULL<<17) #define IBS_OP_MAX_CNT 0x0000FFFFULL #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ +#define IBS_RIP_INVALID (1ULL<<38) extern u32 get_ibs_caps(void); diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index bc401bd9f14a..cc1f3293d6c2 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -9,6 +9,7 @@ #include #include #include 
+#include #include @@ -382,7 +383,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) struct perf_raw_record raw; struct pt_regs regs; struct perf_ibs_data ibs_data; - int offset, size, overflow, reenable; + int offset, size, check_rip, offset_max, throttle = 0; unsigned int msr; u64 *buf, config; @@ -413,28 +414,41 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) perf_ibs_event_update(perf_ibs, event, config); perf_sample_data_init(&data, 0, hwc->last_period); + if (!perf_ibs_set_period(perf_ibs, hwc, &config)) + goto out; /* no sw counter overflow */ + + ibs_data.caps = ibs_caps; + size = 1; + offset = 1; + check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); + if (event->attr.sample_type & PERF_SAMPLE_RAW) + offset_max = perf_ibs->offset_max; + else if (check_rip) + offset_max = 2; + else + offset_max = 1; + do { + rdmsrl(msr + offset, *buf++); + size++; + offset = find_next_bit(perf_ibs->offset_mask, + perf_ibs->offset_max, + offset + 1); + } while (offset < offset_max); + ibs_data.size = sizeof(u64) * size; + + regs = *iregs; + if (!check_rip || !(ibs_data.regs[2] & IBS_RIP_INVALID)) + instruction_pointer_set(&regs, ibs_data.regs[1]); if (event->attr.sample_type & PERF_SAMPLE_RAW) { - ibs_data.caps = ibs_caps; - size = 1; - offset = 1; - do { - rdmsrl(msr + offset, *buf++); - size++; - offset = find_next_bit(perf_ibs->offset_mask, - perf_ibs->offset_max, - offset + 1); - } while (offset < perf_ibs->offset_max); - raw.size = sizeof(u32) + sizeof(u64) * size; + raw.size = sizeof(u32) + ibs_data.size; raw.data = ibs_data.data; data.raw = &raw; } - regs = *iregs; /* XXX: update ip from ibs sample */ - - overflow = perf_ibs_set_period(perf_ibs, hwc, &config); - reenable = !(overflow && perf_event_overflow(event, &data, &regs)); - config = (config >> 4) | (reenable ? 
perf_ibs->enable_mask : 0); + throttle = perf_event_overflow(event, &data, &regs); +out: + config = (config >> 4) | (throttle ? 0 : perf_ibs->enable_mask); perf_ibs_enable_event(hwc, config); perf_event_update_userpage(event); -- cgit v1.2.3 From 450bbd493d436f9eadd1b7828158f37559f26674 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 12 Mar 2012 12:54:32 +0100 Subject: perf/x86-ibs: Precise event sampling with IBS for AMD CPUs This patch adds support for precise event sampling with IBS. There are two counting modes to count either cycles or micro-ops. If the corresponding performance counter events (hw events) are set up with the precise flag set, the request is redirected to the ibs pmu: perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count perf record -a -e r076:p ... # same as -e cpu-cycles:p perf record -a -e r0C1:p ... # use ibs op counting micro-ops Each ibs sample contains a linear address that points to the instruction that was causing the sample to trigger. With ibs we have skid 0. Thus, ibs supports precise levels 1 and 2. Samples are marked with the PERF_EFLAGS_EXACT flag set. In rare cases the rip is invalid when IBS was not able to record the rip correctly. Then the PERF_EFLAGS_EXACT flag is cleared and the rip is taken from pt_regs. 
V2: * don't drop samples in precise level 2 if rip is invalid, instead support the PERF_EFLAGS_EXACT flag Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120502103309.GP18810@erda.amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd.c | 7 ++- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 73 ++++++++++++++++++++++++++++++-- 2 files changed, 76 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 589286f28877..65652265fffd 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -134,8 +134,13 @@ static u64 amd_pmu_event_map(int hw_event) static int amd_pmu_hw_config(struct perf_event *event) { - int ret = x86_pmu_hw_config(event); + int ret; + /* pass precise event sampling to ibs: */ + if (event->attr.precise_ip && get_ibs_caps()) + return -ENOENT; + + ret = x86_pmu_hw_config(event); if (ret) return ret; diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index cc1f3293d6c2..34dfa853f6df 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -145,17 +145,80 @@ static struct perf_ibs *get_ibs_pmu(int type) return NULL; } +/* + * Use IBS for precise event sampling: + * + * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count + * perf record -a -e r076:p ... # same as -e cpu-cycles:p + * perf record -a -e r0C1:p ... # use ibs op counting micro-ops + * + * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl, + * MSRC001_1033) is used to select either cycle or micro-ops counting + * mode. + * + * The rip of IBS samples has skid 0. Thus, IBS supports precise + * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the + * rip is invalid when IBS was not able to record the rip correctly. 
+ * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then. + * + */ +static int perf_ibs_precise_event(struct perf_event *event, u64 *config) +{ + switch (event->attr.precise_ip) { + case 0: + return -ENOENT; + case 1: + case 2: + break; + default: + return -EOPNOTSUPP; + } + + switch (event->attr.type) { + case PERF_TYPE_HARDWARE: + switch (event->attr.config) { + case PERF_COUNT_HW_CPU_CYCLES: + *config = 0; + return 0; + } + break; + case PERF_TYPE_RAW: + switch (event->attr.config) { + case 0x0076: + *config = 0; + return 0; + case 0x00C1: + *config = IBS_OP_CNT_CTL; + return 0; + } + break; + default: + return -ENOENT; + } + + return -EOPNOTSUPP; +} + static int perf_ibs_init(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; struct perf_ibs *perf_ibs; u64 max_cnt, config; + int ret; perf_ibs = get_ibs_pmu(event->attr.type); - if (!perf_ibs) + if (perf_ibs) { + config = event->attr.config; + } else { + perf_ibs = &perf_ibs_op; + ret = perf_ibs_precise_event(event, &config); + if (ret) + return ret; + } + + if (event->pmu != &perf_ibs->pmu) return -ENOENT; - config = event->attr.config; if (config & ~perf_ibs->config_mask) return -EINVAL; @@ -437,8 +500,12 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) ibs_data.size = sizeof(u64) * size; regs = *iregs; - if (!check_rip || !(ibs_data.regs[2] & IBS_RIP_INVALID)) + if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) { + regs.flags &= ~PERF_EFLAGS_EXACT; + } else { instruction_pointer_set(&regs, ibs_data.regs[1]); + regs.flags |= PERF_EFLAGS_EXACT; + } if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw.size = sizeof(u32) + ibs_data.size; -- cgit v1.2.3 From 98112d2e957e0d348f06d8a40f2f720204a70b55 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Apr 2012 20:19:13 +0200 Subject: perf/x86-ibs: Rename some variables Simple patch that just renames some variables for better understanding. 
Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333390758-10893-8-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 34dfa853f6df..29a1bffe1dfb 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -62,7 +62,7 @@ struct perf_ibs_data { }; static int -perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *count) +perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period) { s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; @@ -91,7 +91,7 @@ perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *count) if (left > max) left = max; - *count = (u64)left; + *hw_period = (u64)left; return overflow; } @@ -262,13 +262,13 @@ static int perf_ibs_init(struct perf_event *event) static int perf_ibs_set_period(struct perf_ibs *perf_ibs, struct hw_perf_event *hwc, u64 *period) { - int ret; + int overflow; /* ignore lower 4 bits in min count: */ - ret = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); + overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); local64_set(&hwc->prev_count, 0); - return ret; + return overflow; } static u64 get_ibs_fetch_count(u64 config) -- cgit v1.2.3 From fc006cf7cc7471e1bdf34e40111971e03622af6c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Apr 2012 20:19:14 +0200 Subject: perf/x86-ibs: Trigger overflow if remaining period is too small There are cases where the remaining period is smaller than the minimal possible value. In this case the counter is restarted with the minimal period. This is of no use as the interrupt handler will trigger immediately again and most likely hits itself. 
This biases the results. So, if the remaining period is within the min range, we better do not restart the counter and instead trigger the overflow. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333390758-10893-9-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 29a1bffe1dfb..3e32908292a7 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -78,16 +78,13 @@ perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_perio overflow = 1; } - if (unlikely(left <= 0)) { + if (unlikely(left < (s64)min)) { left += period; local64_set(&hwc->period_left, left); hwc->last_period = period; overflow = 1; } - if (unlikely(left < min)) - left = min; - if (left > max) left = max; -- cgit v1.2.3 From 7caaf4d8241feecafb87919402b0a6dbb1b71d9e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Apr 2012 20:19:15 +0200 Subject: perf/x86-ibs: Extend hw period that triggers overflow If the last hw period is too short we might hit the irq handler which biases the results. Thus try to have a max last period that triggers the sw overflow. 
Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333390758-10893-10-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 3e32908292a7..cb51a3e55870 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -85,8 +85,19 @@ perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_perio overflow = 1; } - if (left > max) - left = max; + /* + * If the hw period that triggers the sw overflow is too short + * we might hit the irq handler. This biases the results. + * Thus we shorten the next-to-last period and set the last + * period to the max period. + */ + if (left > max) { + left -= max; + if (left > max) + left = max; + else if (left < min) + left = min; + } *hw_period = (u64)left; -- cgit v1.2.3 From c9574fe0bdb9ac9a2698e02a712088ce8431e9f8 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Apr 2012 20:19:16 +0200 Subject: perf/x86-ibs: Implement workaround for IBS erratum #420 When disabling ibs there might be the case where hardware continuously generates interrupts. This is described in erratum #420 (Instruction- Based Sampling Engine May Generate Interrupt that Cannot Be Cleared). To avoid this we must clear the counter mask first and then clear the enable bit. This patch implements this. See Revision Guide for AMD Family 10h Processors, Publication #41322. Note: We now keep track of the last read ibs config value which is then used to disable ibs. To update the config value we pass now a pointer to the functions reading it. 
Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333390758-10893-11-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 62 ++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index cb51a3e55870..b14e71127c82 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -291,20 +291,36 @@ static u64 get_ibs_op_count(u64 config) static void perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, - u64 config) + u64 *config) { - u64 count = perf_ibs->get_count(config); + u64 count = perf_ibs->get_count(*config); while (!perf_event_try_update(event, count, 20)) { - rdmsrl(event->hw.config_base, config); - count = perf_ibs->get_count(config); + rdmsrl(event->hw.config_base, *config); + count = perf_ibs->get_count(*config); } } -/* Note: The enable mask must be encoded in the config argument. */ -static inline void perf_ibs_enable_event(struct hw_perf_event *hwc, u64 config) +static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 config) { - wrmsrl(hwc->config_base, hwc->config | config); + wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask); +} + +/* + * Erratum #420 Instruction-Based Sampling Engine May Generate + * Interrupt that Cannot Be Cleared: + * + * Must clear counter mask first, then clear the enable bit. See + * Revision Guide for AMD Family 10h Processors, Publication #41322. 
+ */ +static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 config) +{ + config &= ~perf_ibs->cnt_mask; + wrmsrl(hwc->config_base, config); + config &= ~perf_ibs->enable_mask; + wrmsrl(hwc->config_base, config); } /* @@ -318,7 +334,7 @@ static void perf_ibs_start(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - u64 config; + u64 period; if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) return; @@ -326,10 +342,9 @@ static void perf_ibs_start(struct perf_event *event, int flags) WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); hwc->state = 0; - perf_ibs_set_period(perf_ibs, hwc, &config); - config = (config >> 4) | perf_ibs->enable_mask; + perf_ibs_set_period(perf_ibs, hwc, &period); set_bit(IBS_STARTED, pcpu->state); - perf_ibs_enable_event(hwc, config); + perf_ibs_enable_event(perf_ibs, hwc, period >> 4); perf_event_update_userpage(event); } @@ -339,7 +354,7 @@ static void perf_ibs_stop(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - u64 val; + u64 config; int stopping; stopping = test_and_clear_bit(IBS_STARTED, pcpu->state); @@ -347,12 +362,11 @@ static void perf_ibs_stop(struct perf_event *event, int flags) if (!stopping && (hwc->state & PERF_HES_UPTODATE)) return; - rdmsrl(hwc->config_base, val); + rdmsrl(hwc->config_base, config); if (stopping) { set_bit(IBS_STOPPING, pcpu->state); - val &= ~perf_ibs->enable_mask; - wrmsrl(hwc->config_base, val); + perf_ibs_disable_event(perf_ibs, hwc, config); WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; } @@ -360,7 +374,7 @@ static void perf_ibs_stop(struct perf_event *event, int flags) if (hwc->state & PERF_HES_UPTODATE) 
return; - perf_ibs_event_update(perf_ibs, event, val); + perf_ibs_event_update(perf_ibs, event, &config); hwc->state |= PERF_HES_UPTODATE; } @@ -456,7 +470,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) struct perf_ibs_data ibs_data; int offset, size, check_rip, offset_max, throttle = 0; unsigned int msr; - u64 *buf, config; + u64 *buf, *config, period; if (!test_bit(IBS_STARTED, pcpu->state)) { /* Catch spurious interrupts after stopping IBS: */ @@ -477,15 +491,15 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) * supported in all cpus. As this triggered an interrupt, we * set the current count to the max count. */ - config = ibs_data.regs[0]; + config = &ibs_data.regs[0]; if (perf_ibs == &perf_ibs_op && !(ibs_caps & IBS_CAPS_RDWROPCNT)) { - config &= ~IBS_OP_CUR_CNT; - config |= (config & IBS_OP_MAX_CNT) << 36; + *config &= ~IBS_OP_CUR_CNT; + *config |= (*config & IBS_OP_MAX_CNT) << 36; } perf_ibs_event_update(perf_ibs, event, config); perf_sample_data_init(&data, 0, hwc->last_period); - if (!perf_ibs_set_period(perf_ibs, hwc, &config)) + if (!perf_ibs_set_period(perf_ibs, hwc, &period)) goto out; /* no sw counter overflow */ ibs_data.caps = ibs_caps; @@ -523,8 +537,10 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) throttle = perf_event_overflow(event, &data, &regs); out: - config = (config >> 4) | (throttle ? 0 : perf_ibs->enable_mask); - perf_ibs_enable_event(hwc, config); + if (throttle) + perf_ibs_disable_event(perf_ibs, hwc, *config); + else + perf_ibs_enable_event(perf_ibs, hwc, period >> 4); perf_event_update_userpage(event); -- cgit v1.2.3 From fc5fb2b5e1874e5894e2ac503bfb744220db89a1 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Apr 2012 20:19:17 +0200 Subject: perf/x86-ibs: Catch spurious interrupts after stopping IBS After disabling IBS there could still be incoming NMIs with samples that even have the valid bit cleared. 
Mark all these NMIs as handled to avoid spurious interrupt messages. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333390758-10893-12-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index b14e71127c82..5a9f95b5cc26 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -473,11 +473,13 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) u64 *buf, *config, period; if (!test_bit(IBS_STARTED, pcpu->state)) { - /* Catch spurious interrupts after stopping IBS: */ - if (!test_and_clear_bit(IBS_STOPPING, pcpu->state)) - return 0; - rdmsrl(perf_ibs->msr, *ibs_data.regs); - return (*ibs_data.regs & perf_ibs->valid_mask) ? 1 : 0; + /* + * Catch spurious interrupts after stopping IBS: After + * disabling IBS there could still be incoming NMIs + * with samples that even have the valid bit cleared. + * Mark all these NMIs as handled. + */ + return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0; } msr = hwc->config_base; -- cgit v1.2.3 From 8b1e13638d465863572c8207a5cfceeef0cf0441 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 2 Apr 2012 20:19:18 +0200 Subject: perf/x86-ibs: Fix usage of IBS op current count The value of IbsOpCurCnt rolls over when it reaches IbsOpMaxCnt. Thus, it is reset to zero by hardware. To get the correct count we need to add the max count to it in case we received an ibs sample (valid bit set). 
Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1333390758-10893-13-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 33 ++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 5a9f95b5cc26..da9bcdcd9856 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -286,7 +286,15 @@ static u64 get_ibs_fetch_count(u64 config) static u64 get_ibs_op_count(u64 config) { - return (config & IBS_OP_CUR_CNT) >> 32; + u64 count = 0; + + if (config & IBS_OP_VAL) + count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */ + + if (ibs_caps & IBS_CAPS_RDWROPCNT) + count += (config & IBS_OP_CUR_CNT) >> 32; + + return count; } static void @@ -295,7 +303,12 @@ perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, { u64 count = perf_ibs->get_count(*config); - while (!perf_event_try_update(event, count, 20)) { + /* + * Set width to 64 since we do not overflow on max width but + * instead on max count. In perf_ibs_set_period() we clear + * prev count manually on overflow. + */ + while (!perf_event_try_update(event, count, 64)) { rdmsrl(event->hw.config_base, *config); count = perf_ibs->get_count(*config); } @@ -374,6 +387,12 @@ static void perf_ibs_stop(struct perf_event *event, int flags) if (hwc->state & PERF_HES_UPTODATE) return; + /* + * Clear valid bit to not count rollovers on update, rollovers + * are only updated in the irq handler. 
+ */ + config &= ~perf_ibs->valid_mask; + perf_ibs_event_update(perf_ibs, event, &config); hwc->state |= PERF_HES_UPTODATE; } @@ -488,17 +507,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) if (!(*buf++ & perf_ibs->valid_mask)) return 0; - /* - * Emulate IbsOpCurCnt in MSRC001_1033 (IbsOpCtl), not - * supported in all cpus. As this triggered an interrupt, we - * set the current count to the max count. - */ config = &ibs_data.regs[0]; - if (perf_ibs == &perf_ibs_op && !(ibs_caps & IBS_CAPS_RDWROPCNT)) { - *config &= ~IBS_OP_CUR_CNT; - *config |= (*config & IBS_OP_MAX_CNT) << 36; - } - perf_ibs_event_update(perf_ibs, event, config); perf_sample_data_init(&data, 0, hwc->last_period); if (!perf_ibs_set_period(perf_ibs, hwc, &period)) -- cgit v1.2.3 From 35bdd29095ad614c5fb4a934bfd4f57a94dfd395 Mon Sep 17 00:00:00 2001 From: Alessandro Rubini Date: Thu, 12 Apr 2012 10:48:44 +0200 Subject: mfd: Add driver for STA2X11 MFD block This also introduces to export a function that is in the base sta2x11 support patches. The header will increase with other prototypes and constants over time. 
Signed-off-by: Alessandro Rubini Acked-by: Giancarlo Asnaghi Cc: Alan Cox Signed-off-by: Samuel Ortiz --- arch/x86/include/asm/sta2x11.h | 12 ++ drivers/mfd/Kconfig | 5 + drivers/mfd/Makefile | 1 + drivers/mfd/sta2x11-mfd.c | 467 ++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/sta2x11-mfd.h | 324 ++++++++++++++++++++++++++++ 5 files changed, 809 insertions(+) create mode 100644 arch/x86/include/asm/sta2x11.h create mode 100644 drivers/mfd/sta2x11-mfd.c create mode 100644 include/linux/mfd/sta2x11-mfd.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/sta2x11.h b/arch/x86/include/asm/sta2x11.h new file mode 100644 index 000000000000..e9d32df89ccc --- /dev/null +++ b/arch/x86/include/asm/sta2x11.h @@ -0,0 +1,12 @@ +/* + * Header file for STMicroelectronics ConneXt (STA2X11) IOHub + */ +#ifndef __ASM_STA2X11_H +#define __ASM_STA2X11_H + +#include + +/* This needs to be called from the MFD to configure its sub-devices */ +struct sta2x11_instance *sta2x11_get_instance(struct pci_dev *pdev); + +#endif /* __ASM_STA2X11_H */ diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index ef86a741b7e2..48eed22c65a5 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -906,6 +906,11 @@ config MFD_RC5T583 Additional drivers must be enabled in order to use the different functionality of the device. 
+config MFD_STA2X11 + bool "STA2X11 multi function device support" + depends on STA2X11 + select MFD_CORE + config MFD_ANATOP bool "Support for Freescale i.MX on-chip ANATOP controller" depends on SOC_IMX6Q diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 5dd6be7aa350..0dc55cbefa09 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_MFD_DAVINCI_VOICECODEC) += davinci_voicecodec.o obj-$(CONFIG_MFD_DM355EVM_MSP) += dm355evm_msp.o obj-$(CONFIG_MFD_TI_SSP) += ti-ssp.o +obj-$(CONFIG_MFD_STA2X11) += sta2x11-mfd.o obj-$(CONFIG_MFD_STMPE) += stmpe.o obj-$(CONFIG_STMPE_I2C) += stmpe-i2c.o obj-$(CONFIG_STMPE_SPI) += stmpe-spi.o diff --git a/drivers/mfd/sta2x11-mfd.c b/drivers/mfd/sta2x11-mfd.c new file mode 100644 index 000000000000..d31fed07aefb --- /dev/null +++ b/drivers/mfd/sta2x11-mfd.c @@ -0,0 +1,467 @@ +/* + * Copyright (c) 2009-2011 Wind River Systems, Inc. + * Copyright (c) 2011 ST Microelectronics (Alessandro Rubini) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* This describes STA2X11 MFD chip for us, we may have several */ +struct sta2x11_mfd { + struct sta2x11_instance *instance; + spinlock_t lock; + struct list_head list; + void __iomem *sctl_regs; + void __iomem *apbreg_regs; +}; + +static LIST_HEAD(sta2x11_mfd_list); + +/* Three functions to act on the list */ +static struct sta2x11_mfd *sta2x11_mfd_find(struct pci_dev *pdev) +{ + struct sta2x11_instance *instance; + struct sta2x11_mfd *mfd; + + if (!pdev && !list_empty(&sta2x11_mfd_list)) { + pr_warning("%s: Unspecified device, " + "using first instance\n", __func__); + return list_entry(sta2x11_mfd_list.next, + struct sta2x11_mfd, list); + } + + instance = sta2x11_get_instance(pdev); + if (!instance) + return NULL; + list_for_each_entry(mfd, &sta2x11_mfd_list, list) { + if (mfd->instance == instance) + return mfd; + } + return NULL; +} + +static int __devinit sta2x11_mfd_add(struct pci_dev *pdev, gfp_t flags) +{ + struct sta2x11_mfd *mfd = sta2x11_mfd_find(pdev); + struct sta2x11_instance *instance; + + if (mfd) + return -EBUSY; + instance = sta2x11_get_instance(pdev); + if (!instance) + return -EINVAL; + mfd = kzalloc(sizeof(*mfd), flags); + if (!mfd) + return -ENOMEM; + INIT_LIST_HEAD(&mfd->list); + spin_lock_init(&mfd->lock); + mfd->instance = instance; + list_add(&mfd->list, &sta2x11_mfd_list); + return 0; +} + +static int __devexit mfd_remove(struct pci_dev *pdev) +{ + struct sta2x11_mfd *mfd = sta2x11_mfd_find(pdev); + + if (!mfd) + return -ENODEV; + list_del(&mfd->list); + kfree(mfd); + return 0; +} + +/* These two functions are exported and are not expected to fail */ +u32 
sta2x11_sctl_mask(struct pci_dev *pdev, u32 reg, u32 mask, u32 val) +{ + struct sta2x11_mfd *mfd = sta2x11_mfd_find(pdev); + u32 r; + unsigned long flags; + + if (!mfd) { + dev_warn(&pdev->dev, ": can't access sctl regs\n"); + return 0; + } + if (!mfd->sctl_regs) { + dev_warn(&pdev->dev, ": system ctl not initialized\n"); + return 0; + } + spin_lock_irqsave(&mfd->lock, flags); + r = readl(mfd->sctl_regs + reg); + r &= ~mask; + r |= val; + if (mask) + writel(r, mfd->sctl_regs + reg); + spin_unlock_irqrestore(&mfd->lock, flags); + return r; +} +EXPORT_SYMBOL(sta2x11_sctl_mask); + +u32 sta2x11_apbreg_mask(struct pci_dev *pdev, u32 reg, u32 mask, u32 val) +{ + struct sta2x11_mfd *mfd = sta2x11_mfd_find(pdev); + u32 r; + unsigned long flags; + + if (!mfd) { + dev_warn(&pdev->dev, ": can't access apb regs\n"); + return 0; + } + if (!mfd->apbreg_regs) { + dev_warn(&pdev->dev, ": apb bridge not initialized\n"); + return 0; + } + spin_lock_irqsave(&mfd->lock, flags); + r = readl(mfd->apbreg_regs + reg); + r &= ~mask; + r |= val; + if (mask) + writel(r, mfd->apbreg_regs + reg); + spin_unlock_irqrestore(&mfd->lock, flags); + return r; +} +EXPORT_SYMBOL(sta2x11_apbreg_mask); + +/* Two debugfs files, for our registers (FIXME: one instance only) */ +#define REG(regname) {.name = #regname, .offset = SCTL_ ## regname} +static struct debugfs_reg32 sta2x11_sctl_regs[] = { + REG(SCCTL), REG(ARMCFG), REG(SCPLLCTL), REG(SCPLLFCTRL), + REG(SCRESFRACT), REG(SCRESCTRL1), REG(SCRESXTRL2), REG(SCPEREN0), + REG(SCPEREN1), REG(SCPEREN2), REG(SCGRST), REG(SCPCIPMCR1), + REG(SCPCIPMCR2), REG(SCPCIPMSR1), REG(SCPCIPMSR2), REG(SCPCIPMSR3), + REG(SCINTREN), REG(SCRISR), REG(SCCLKSTAT0), REG(SCCLKSTAT1), + REG(SCCLKSTAT2), REG(SCRSTSTA), +}; +#undef REG + +static struct debugfs_regset32 sctl_regset = { + .regs = sta2x11_sctl_regs, + .nregs = ARRAY_SIZE(sta2x11_sctl_regs), +}; + +#define REG(regname) {.name = #regname, .offset = regname} +static struct debugfs_reg32 sta2x11_apbreg_regs[] = { + 
REG(APBREG_BSR), REG(APBREG_PAER), REG(APBREG_PWAC), REG(APBREG_PRAC), + REG(APBREG_PCG), REG(APBREG_PUR), REG(APBREG_EMU_PCG), +}; +#undef REG + +static struct debugfs_regset32 apbreg_regset = { + .regs = sta2x11_apbreg_regs, + .nregs = ARRAY_SIZE(sta2x11_apbreg_regs), +}; + +static struct dentry *sta2x11_sctl_debugfs; +static struct dentry *sta2x11_apbreg_debugfs; + +/* Probe for the two platform devices */ +static int sta2x11_sctl_probe(struct platform_device *dev) +{ + struct pci_dev **pdev; + struct sta2x11_mfd *mfd; + struct resource *res; + + pdev = dev->dev.platform_data; + mfd = sta2x11_mfd_find(*pdev); + if (!mfd) + return -ENODEV; + + res = platform_get_resource(dev, IORESOURCE_MEM, 0); + if (!res) + return -ENOMEM; + + if (!request_mem_region(res->start, resource_size(res), + "sta2x11-sctl")) + return -EBUSY; + + mfd->sctl_regs = ioremap(res->start, resource_size(res)); + if (!mfd->sctl_regs) { + release_mem_region(res->start, resource_size(res)); + return -ENOMEM; + } + sctl_regset.base = mfd->sctl_regs; + sta2x11_sctl_debugfs = debugfs_create_regset32("sta2x11-sctl", + S_IFREG | S_IRUGO, + NULL, &sctl_regset); + return 0; +} + +static int sta2x11_apbreg_probe(struct platform_device *dev) +{ + struct pci_dev **pdev; + struct sta2x11_mfd *mfd; + struct resource *res; + + pdev = dev->dev.platform_data; + dev_dbg(&dev->dev, "%s: pdata is %p\n", __func__, pdev); + dev_dbg(&dev->dev, "%s: *pdata is %p\n", __func__, *pdev); + + mfd = sta2x11_mfd_find(*pdev); + if (!mfd) + return -ENODEV; + + res = platform_get_resource(dev, IORESOURCE_MEM, 0); + if (!res) + return -ENOMEM; + + if (!request_mem_region(res->start, resource_size(res), + "sta2x11-apbreg")) + return -EBUSY; + + mfd->apbreg_regs = ioremap(res->start, resource_size(res)); + if (!mfd->apbreg_regs) { + release_mem_region(res->start, resource_size(res)); + return -ENOMEM; + } + dev_dbg(&dev->dev, "%s: regbase %p\n", __func__, mfd->apbreg_regs); + + apbreg_regset.base = mfd->apbreg_regs; + 
sta2x11_apbreg_debugfs = debugfs_create_regset32("sta2x11-apbreg", + S_IFREG | S_IRUGO, + NULL, &apbreg_regset); + return 0; +} + +/* The two platform drivers */ +static struct platform_driver sta2x11_sctl_platform_driver = { + .driver = { + .name = "sta2x11-sctl", + .owner = THIS_MODULE, + }, + .probe = sta2x11_sctl_probe, +}; + +static int __init sta2x11_sctl_init(void) +{ + pr_info("%s\n", __func__); + return platform_driver_register(&sta2x11_sctl_platform_driver); +} + +static struct platform_driver sta2x11_platform_driver = { + .driver = { + .name = "sta2x11-apbreg", + .owner = THIS_MODULE, + }, + .probe = sta2x11_apbreg_probe, +}; + +static int __init sta2x11_apbreg_init(void) +{ + pr_info("%s\n", __func__); + return platform_driver_register(&sta2x11_platform_driver); +} + +/* + * What follows is the PCI device that hosts the above two pdevs. + * Each logic block is 4kB and they are all consecutive: we use this info. + */ + +/* Bar 0 */ +enum bar0_cells { + STA2X11_GPIO_0 = 0, + STA2X11_GPIO_1, + STA2X11_GPIO_2, + STA2X11_GPIO_3, + STA2X11_SCTL, + STA2X11_SCR, + STA2X11_TIME, +}; +/* Bar 1 */ +enum bar1_cells { + STA2X11_APBREG = 0, +}; +#define CELL_4K(_name, _cell) { \ + .name = _name, \ + .start = _cell * 4096, .end = _cell * 4096 + 4095, \ + .flags = IORESOURCE_MEM, \ + } + +static const __devinitconst struct resource gpio_resources[] = { + { + .name = "sta2x11_gpio", /* 4 consecutive cells, 1 driver */ + .start = 0, + .end = (4 * 4096) - 1, + .flags = IORESOURCE_MEM, + } +}; +static const __devinitconst struct resource sctl_resources[] = { + CELL_4K("sta2x11-sctl", STA2X11_SCTL), +}; +static const __devinitconst struct resource scr_resources[] = { + CELL_4K("sta2x11-scr", STA2X11_SCR), +}; +static const __devinitconst struct resource time_resources[] = { + CELL_4K("sta2x11-time", STA2X11_TIME), +}; + +static const __devinitconst struct resource apbreg_resources[] = { + CELL_4K("sta2x11-apbreg", STA2X11_APBREG), +}; + +#define DEV(_name, _r) \ + { .name = 
_name, .num_resources = ARRAY_SIZE(_r), .resources = _r, } + +static __devinitdata struct mfd_cell sta2x11_mfd_bar0[] = { + DEV("sta2x11-gpio", gpio_resources), /* offset 0: we add pdata later */ + DEV("sta2x11-sctl", sctl_resources), + DEV("sta2x11-scr", scr_resources), + DEV("sta2x11-time", time_resources), +}; + +static __devinitdata struct mfd_cell sta2x11_mfd_bar1[] = { + DEV("sta2x11-apbreg", apbreg_resources), +}; + +static int sta2x11_mfd_suspend(struct pci_dev *pdev, pm_message_t state) +{ + pci_save_state(pdev); + pci_disable_device(pdev); + pci_set_power_state(pdev, pci_choose_state(pdev, state)); + + return 0; +} + +static int sta2x11_mfd_resume(struct pci_dev *pdev) +{ + int err; + + pci_set_power_state(pdev, 0); + err = pci_enable_device(pdev); + if (err) + return err; + pci_restore_state(pdev); + + return 0; +} + +static int __devinit sta2x11_mfd_probe(struct pci_dev *pdev, + const struct pci_device_id *pci_id) +{ + int err, i; + struct sta2x11_gpio_pdata *gpio_data; + + dev_info(&pdev->dev, "%s\n", __func__); + + err = pci_enable_device(pdev); + if (err) { + dev_err(&pdev->dev, "Can't enable device.\n"); + return err; + } + + err = pci_enable_msi(pdev); + if (err) + dev_info(&pdev->dev, "Enable msi failed\n"); + + /* Read gpio config data as pci device's platform data */ + gpio_data = dev_get_platdata(&pdev->dev); + if (!gpio_data) + dev_warn(&pdev->dev, "no gpio configuration\n"); + + dev_dbg(&pdev->dev, "%s, gpio_data = %p (%p)\n", __func__, + gpio_data, &gpio_data); + dev_dbg(&pdev->dev, "%s, pdev = %p (%p)\n", __func__, + pdev, &pdev); + + /* platform data is the pci device for all of them */ + for (i = 0; i < ARRAY_SIZE(sta2x11_mfd_bar0); i++) { + sta2x11_mfd_bar0[i].pdata_size = sizeof(pdev); + sta2x11_mfd_bar0[i].platform_data = &pdev; + } + sta2x11_mfd_bar1[0].pdata_size = sizeof(pdev); + sta2x11_mfd_bar1[0].platform_data = &pdev; + + /* Record this pdev before mfd_add_devices: their probe looks for it */ + sta2x11_mfd_add(pdev, GFP_ATOMIC); 
+ + + err = mfd_add_devices(&pdev->dev, -1, + sta2x11_mfd_bar0, + ARRAY_SIZE(sta2x11_mfd_bar0), + &pdev->resource[0], + 0); + if (err) { + dev_err(&pdev->dev, "mfd_add_devices[0] failed: %d\n", err); + goto err_disable; + } + + err = mfd_add_devices(&pdev->dev, -1, + sta2x11_mfd_bar1, + ARRAY_SIZE(sta2x11_mfd_bar1), + &pdev->resource[1], + 0); + if (err) { + dev_err(&pdev->dev, "mfd_add_devices[1] failed: %d\n", err); + goto err_disable; + } + + return 0; + +err_disable: + mfd_remove_devices(&pdev->dev); + pci_disable_device(pdev); + pci_disable_msi(pdev); + return err; +} + +static DEFINE_PCI_DEVICE_TABLE(sta2x11_mfd_tbl) = { + {PCI_DEVICE(PCI_VENDOR_ID_STMICRO, PCI_DEVICE_ID_STMICRO_GPIO)}, + {0,}, +}; + +static struct pci_driver sta2x11_mfd_driver = { + .name = "sta2x11-mfd", + .id_table = sta2x11_mfd_tbl, + .probe = sta2x11_mfd_probe, + .suspend = sta2x11_mfd_suspend, + .resume = sta2x11_mfd_resume, +}; + +static int __init sta2x11_mfd_init(void) +{ + pr_info("%s\n", __func__); + return pci_register_driver(&sta2x11_mfd_driver); +} + +/* + * All of this must be ready before "normal" devices like MMCI appear. + * But MFD (the pci device) can't be too early. The following choice + * prepares platform drivers very early and probe the PCI device later, + * but before other PCI devices. + */ +subsys_initcall(sta2x11_apbreg_init); +subsys_initcall(sta2x11_sctl_init); +rootfs_initcall(sta2x11_mfd_init); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Wind River"); +MODULE_DESCRIPTION("STA2x11 mfd for GPIO, SCTL and APBREG"); +MODULE_DEVICE_TABLE(pci, sta2x11_mfd_tbl); diff --git a/include/linux/mfd/sta2x11-mfd.h b/include/linux/mfd/sta2x11-mfd.h new file mode 100644 index 000000000000..d179227e866f --- /dev/null +++ b/include/linux/mfd/sta2x11-mfd.h @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2009-2011 Wind River Systems, Inc. 
+ * Copyright (c) 2011 ST Microelectronics (Alessandro Rubini) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * The STMicroelectronics ConneXt (STA2X11) chip has several unrelated + * functions in one PCI endpoint function. This driver simply + * registers the platform devices in this I/O memory region and exports a few + * functions to access common registers. + */ + +#ifndef __STA2X11_MFD_H +#define __STA2X11_MFD_H +#include +#include + +/* + * The MFD PCI block includes the GPIO peripherals and other register blocks. + * For GPIO, we have 32*4 bits (I use "gsta" for "gpio sta2x11".) 
+ */ +#define GSTA_GPIO_PER_BLOCK 32 +#define GSTA_NR_BLOCKS 4 +#define GSTA_NR_GPIO (GSTA_GPIO_PER_BLOCK * GSTA_NR_BLOCKS) + +/* Pinconfig is set by the board definition: altfunc, pull-up, pull-down */ +struct sta2x11_gpio_pdata { + unsigned pinconfig[GSTA_NR_GPIO]; +}; + +/* Macros below lifted from sh_pfc.h, with minor differences */ +#define PINMUX_TYPE_NONE 0 +#define PINMUX_TYPE_FUNCTION 1 +#define PINMUX_TYPE_OUTPUT_LOW 2 +#define PINMUX_TYPE_OUTPUT_HIGH 3 +#define PINMUX_TYPE_INPUT 4 +#define PINMUX_TYPE_INPUT_PULLUP 5 +#define PINMUX_TYPE_INPUT_PULLDOWN 6 + +/* Give names to GPIO pins, like PXA does, taken from the manual */ +#define STA2X11_GPIO0 0 +#define STA2X11_GPIO1 1 +#define STA2X11_GPIO2 2 +#define STA2X11_GPIO3 3 +#define STA2X11_GPIO4 4 +#define STA2X11_GPIO5 5 +#define STA2X11_GPIO6 6 +#define STA2X11_GPIO7 7 +#define STA2X11_GPIO8_RGBOUT_RED7 8 +#define STA2X11_GPIO9_RGBOUT_RED6 9 +#define STA2X11_GPIO10_RGBOUT_RED5 10 +#define STA2X11_GPIO11_RGBOUT_RED4 11 +#define STA2X11_GPIO12_RGBOUT_RED3 12 +#define STA2X11_GPIO13_RGBOUT_RED2 13 +#define STA2X11_GPIO14_RGBOUT_RED1 14 +#define STA2X11_GPIO15_RGBOUT_RED0 15 +#define STA2X11_GPIO16_RGBOUT_GREEN7 16 +#define STA2X11_GPIO17_RGBOUT_GREEN6 17 +#define STA2X11_GPIO18_RGBOUT_GREEN5 18 +#define STA2X11_GPIO19_RGBOUT_GREEN4 19 +#define STA2X11_GPIO20_RGBOUT_GREEN3 20 +#define STA2X11_GPIO21_RGBOUT_GREEN2 21 +#define STA2X11_GPIO22_RGBOUT_GREEN1 22 +#define STA2X11_GPIO23_RGBOUT_GREEN0 23 +#define STA2X11_GPIO24_RGBOUT_BLUE7 24 +#define STA2X11_GPIO25_RGBOUT_BLUE6 25 +#define STA2X11_GPIO26_RGBOUT_BLUE5 26 +#define STA2X11_GPIO27_RGBOUT_BLUE4 27 +#define STA2X11_GPIO28_RGBOUT_BLUE3 28 +#define STA2X11_GPIO29_RGBOUT_BLUE2 29 +#define STA2X11_GPIO30_RGBOUT_BLUE1 30 +#define STA2X11_GPIO31_RGBOUT_BLUE0 31 +#define STA2X11_GPIO32_RGBOUT_VSYNCH 32 +#define STA2X11_GPIO33_RGBOUT_HSYNCH 33 +#define STA2X11_GPIO34_RGBOUT_DEN 34 +#define STA2X11_GPIO35_ETH_CRS_DV 35 +#define STA2X11_GPIO36_ETH_TXD1 36 
+#define STA2X11_GPIO37_ETH_TXD0 37 +#define STA2X11_GPIO38_ETH_TX_EN 38 +#define STA2X11_GPIO39_MDIO 39 +#define STA2X11_GPIO40_ETH_REF_CLK 40 +#define STA2X11_GPIO41_ETH_RXD1 41 +#define STA2X11_GPIO42_ETH_RXD0 42 +#define STA2X11_GPIO43_MDC 43 +#define STA2X11_GPIO44_CAN_TX 44 +#define STA2X11_GPIO45_CAN_RX 45 +#define STA2X11_GPIO46_MLB_DAT 46 +#define STA2X11_GPIO47_MLB_SIG 47 +#define STA2X11_GPIO48_SPI0_CLK 48 +#define STA2X11_GPIO49_SPI0_TXD 49 +#define STA2X11_GPIO50_SPI0_RXD 50 +#define STA2X11_GPIO51_SPI0_FRM 51 +#define STA2X11_GPIO52_SPI1_CLK 52 +#define STA2X11_GPIO53_SPI1_TXD 53 +#define STA2X11_GPIO54_SPI1_RXD 54 +#define STA2X11_GPIO55_SPI1_FRM 55 +#define STA2X11_GPIO56_SPI2_CLK 56 +#define STA2X11_GPIO57_SPI2_TXD 57 +#define STA2X11_GPIO58_SPI2_RXD 58 +#define STA2X11_GPIO59_SPI2_FRM 59 +#define STA2X11_GPIO60_I2C0_SCL 60 +#define STA2X11_GPIO61_I2C0_SDA 61 +#define STA2X11_GPIO62_I2C1_SCL 62 +#define STA2X11_GPIO63_I2C1_SDA 63 +#define STA2X11_GPIO64_I2C2_SCL 64 +#define STA2X11_GPIO65_I2C2_SDA 65 +#define STA2X11_GPIO66_I2C3_SCL 66 +#define STA2X11_GPIO67_I2C3_SDA 67 +#define STA2X11_GPIO68_MSP0_RCK 68 +#define STA2X11_GPIO69_MSP0_RXD 69 +#define STA2X11_GPIO70_MSP0_RFS 70 +#define STA2X11_GPIO71_MSP0_TCK 71 +#define STA2X11_GPIO72_MSP0_TXD 72 +#define STA2X11_GPIO73_MSP0_TFS 73 +#define STA2X11_GPIO74_MSP0_SCK 74 +#define STA2X11_GPIO75_MSP1_CK 75 +#define STA2X11_GPIO76_MSP1_RXD 76 +#define STA2X11_GPIO77_MSP1_FS 77 +#define STA2X11_GPIO78_MSP1_TXD 78 +#define STA2X11_GPIO79_MSP2_CK 79 +#define STA2X11_GPIO80_MSP2_RXD 80 +#define STA2X11_GPIO81_MSP2_FS 81 +#define STA2X11_GPIO82_MSP2_TXD 82 +#define STA2X11_GPIO83_MSP3_CK 83 +#define STA2X11_GPIO84_MSP3_RXD 84 +#define STA2X11_GPIO85_MSP3_FS 85 +#define STA2X11_GPIO86_MSP3_TXD 86 +#define STA2X11_GPIO87_MSP4_CK 87 +#define STA2X11_GPIO88_MSP4_RXD 88 +#define STA2X11_GPIO89_MSP4_FS 89 +#define STA2X11_GPIO90_MSP4_TXD 90 +#define STA2X11_GPIO91_MSP5_CK 91 +#define STA2X11_GPIO92_MSP5_RXD 92 
+#define STA2X11_GPIO93_MSP5_FS 93 +#define STA2X11_GPIO94_MSP5_TXD 94 +#define STA2X11_GPIO95_SDIO3_DAT3 95 +#define STA2X11_GPIO96_SDIO3_DAT2 96 +#define STA2X11_GPIO97_SDIO3_DAT1 97 +#define STA2X11_GPIO98_SDIO3_DAT0 98 +#define STA2X11_GPIO99_SDIO3_CLK 99 +#define STA2X11_GPIO100_SDIO3_CMD 100 +#define STA2X11_GPIO101 101 +#define STA2X11_GPIO102 102 +#define STA2X11_GPIO103 103 +#define STA2X11_GPIO104 104 +#define STA2X11_GPIO105_SDIO2_DAT3 105 +#define STA2X11_GPIO106_SDIO2_DAT2 106 +#define STA2X11_GPIO107_SDIO2_DAT1 107 +#define STA2X11_GPIO108_SDIO2_DAT0 108 +#define STA2X11_GPIO109_SDIO2_CLK 109 +#define STA2X11_GPIO110_SDIO2_CMD 110 +#define STA2X11_GPIO111 111 +#define STA2X11_GPIO112 112 +#define STA2X11_GPIO113 113 +#define STA2X11_GPIO114 114 +#define STA2X11_GPIO115_SDIO1_DAT3 115 +#define STA2X11_GPIO116_SDIO1_DAT2 116 +#define STA2X11_GPIO117_SDIO1_DAT1 117 +#define STA2X11_GPIO118_SDIO1_DAT0 118 +#define STA2X11_GPIO119_SDIO1_CLK 119 +#define STA2X11_GPIO120_SDIO1_CMD 120 +#define STA2X11_GPIO121 121 +#define STA2X11_GPIO122 122 +#define STA2X11_GPIO123 123 +#define STA2X11_GPIO124 124 +#define STA2X11_GPIO125_UART2_TXD 125 +#define STA2X11_GPIO126_UART2_RXD 126 +#define STA2X11_GPIO127_UART3_TXD 127 + +/* + * The APB bridge has its own registers, needed by our users as well. + * They are accessed with the following read/mask/write function. 
+ */ +u32 sta2x11_apbreg_mask(struct pci_dev *pdev, u32 reg, u32 mask, u32 val); + +/* CAN and MLB */ +#define APBREG_BSR 0x00 /* Bridge Status Reg */ +#define APBREG_PAER 0x08 /* Peripherals Address Error Reg */ +#define APBREG_PWAC 0x20 /* Peripheral Write Access Control reg */ +#define APBREG_PRAC 0x40 /* Peripheral Read Access Control reg */ +#define APBREG_PCG 0x60 /* Peripheral Clock Gating Reg */ +#define APBREG_PUR 0x80 /* Peripheral Under Reset Reg */ +#define APBREG_EMU_PCG 0xA0 /* Emulator Peripheral Clock Gating Reg */ + +#define APBREG_CAN (1 << 1) +#define APBREG_MLB (1 << 3) + +/* SARAC */ +#define APBREG_BSR_SARAC 0x100 /* Bridge Status Reg */ +#define APBREG_PAER_SARAC 0x108 /* Peripherals Address Error Reg */ +#define APBREG_PWAC_SARAC 0x120 /* Peripheral Write Access Control reg */ +#define APBREG_PRAC_SARAC 0x140 /* Peripheral Read Access Control reg */ +#define APBREG_PCG_SARAC 0x160 /* Peripheral Clock Gating Reg */ +#define APBREG_PUR_SARAC 0x180 /* Peripheral Under Reset Reg */ +#define APBREG_EMU_PCG_SARAC 0x1A0 /* Emulator Peripheral Clock Gating Reg */ + +#define APBREG_SARAC (1 << 2) + +/* + * The system controller has its own registers. 
Some of these are accessed + * by our users as well, using the following read/mask/write function + */ +u32 sta2x11_sctl_mask(struct pci_dev *pdev, u32 reg, u32 mask, u32 val); + +#define SCTL_SCCTL 0x00 /* System controller control register */ +#define SCTL_ARMCFG 0x04 /* ARM configuration register */ +#define SCTL_SCPLLCTL 0x08 /* PLL control status register */ +#define SCTL_SCPLLFCTRL 0x0c /* PLL frequency control register */ +#define SCTL_SCRESFRACT 0x10 /* PLL fractional input register */ +#define SCTL_SCRESCTRL1 0x14 /* Peripheral reset control 1 */ +#define SCTL_SCRESXTRL2 0x18 /* Peripheral reset control 2 */ +#define SCTL_SCPEREN0 0x1c /* Peripheral clock enable register 0 */ +#define SCTL_SCPEREN1 0x20 /* Peripheral clock enable register 1 */ +#define SCTL_SCPEREN2 0x24 /* Peripheral clock enable register 2 */ +#define SCTL_SCGRST 0x28 /* Peripheral global reset */ +#define SCTL_SCPCIPMCR1 0x30 /* PCI power management control 1 */ +#define SCTL_SCPCIPMCR2 0x34 /* PCI power management control 2 */ +#define SCTL_SCPCIPMSR1 0x38 /* PCI power management status 1 */ +#define SCTL_SCPCIPMSR2 0x3c /* PCI power management status 2 */ +#define SCTL_SCPCIPMSR3 0x40 /* PCI power management status 3 */ +#define SCTL_SCINTREN 0x44 /* Interrupt enable */ +#define SCTL_SCRISR 0x48 /* RAW interrupt status */ +#define SCTL_SCCLKSTAT0 0x4c /* Peripheral clocks status 0 */ +#define SCTL_SCCLKSTAT1 0x50 /* Peripheral clocks status 1 */ +#define SCTL_SCCLKSTAT2 0x54 /* Peripheral clocks status 2 */ +#define SCTL_SCRSTSTA 0x58 /* Reset status register */ + +#define SCTL_SCRESCTRL1_USB_PHY_POR (1 << 0) +#define SCTL_SCRESCTRL1_USB_OTG (1 << 1) +#define SCTL_SCRESCTRL1_USB_HRST (1 << 2) +#define SCTL_SCRESCTRL1_USB_PHY_HOST (1 << 3) +#define SCTL_SCRESCTRL1_SATAII (1 << 4) +#define SCTL_SCRESCTRL1_VIP (1 << 5) +#define SCTL_SCRESCTRL1_PER_MMC0 (1 << 6) +#define SCTL_SCRESCTRL1_PER_MMC1 (1 << 7) +#define SCTL_SCRESCTRL1_PER_GPIO0 (1 << 8) +#define SCTL_SCRESCTRL1_PER_GPIO1 (1 << 
9) +#define SCTL_SCRESCTRL1_PER_GPIO2 (1 << 10) +#define SCTL_SCRESCTRL1_PER_GPIO3 (1 << 11) +#define SCTL_SCRESCTRL1_PER_MTU0 (1 << 12) +#define SCTL_SCRESCTRL1_KER_SPI0 (1 << 13) +#define SCTL_SCRESCTRL1_KER_SPI1 (1 << 14) +#define SCTL_SCRESCTRL1_KER_SPI2 (1 << 15) +#define SCTL_SCRESCTRL1_KER_MCI0 (1 << 16) +#define SCTL_SCRESCTRL1_KER_MCI1 (1 << 17) +#define SCTL_SCRESCTRL1_PRE_HSI2C0 (1 << 18) +#define SCTL_SCRESCTRL1_PER_HSI2C1 (1 << 19) +#define SCTL_SCRESCTRL1_PER_HSI2C2 (1 << 20) +#define SCTL_SCRESCTRL1_PER_HSI2C3 (1 << 21) +#define SCTL_SCRESCTRL1_PER_MSP0 (1 << 22) +#define SCTL_SCRESCTRL1_PER_MSP1 (1 << 23) +#define SCTL_SCRESCTRL1_PER_MSP2 (1 << 24) +#define SCTL_SCRESCTRL1_PER_MSP3 (1 << 25) +#define SCTL_SCRESCTRL1_PER_MSP4 (1 << 26) +#define SCTL_SCRESCTRL1_PER_MSP5 (1 << 27) +#define SCTL_SCRESCTRL1_PER_MMC (1 << 28) +#define SCTL_SCRESCTRL1_KER_MSP0 (1 << 29) +#define SCTL_SCRESCTRL1_KER_MSP1 (1 << 30) +#define SCTL_SCRESCTRL1_KER_MSP2 (1 << 31) + +#define SCTL_SCPEREN0_UART0 (1 << 0) +#define SCTL_SCPEREN0_UART1 (1 << 1) +#define SCTL_SCPEREN0_UART2 (1 << 2) +#define SCTL_SCPEREN0_UART3 (1 << 3) +#define SCTL_SCPEREN0_MSP0 (1 << 4) +#define SCTL_SCPEREN0_MSP1 (1 << 5) +#define SCTL_SCPEREN0_MSP2 (1 << 6) +#define SCTL_SCPEREN0_MSP3 (1 << 7) +#define SCTL_SCPEREN0_MSP4 (1 << 8) +#define SCTL_SCPEREN0_MSP5 (1 << 9) +#define SCTL_SCPEREN0_SPI0 (1 << 10) +#define SCTL_SCPEREN0_SPI1 (1 << 11) +#define SCTL_SCPEREN0_SPI2 (1 << 12) +#define SCTL_SCPEREN0_I2C0 (1 << 13) +#define SCTL_SCPEREN0_I2C1 (1 << 14) +#define SCTL_SCPEREN0_I2C2 (1 << 15) +#define SCTL_SCPEREN0_I2C3 (1 << 16) +#define SCTL_SCPEREN0_SVDO_LVDS (1 << 17) +#define SCTL_SCPEREN0_USB_HOST (1 << 18) +#define SCTL_SCPEREN0_USB_OTG (1 << 19) +#define SCTL_SCPEREN0_MCI0 (1 << 20) +#define SCTL_SCPEREN0_MCI1 (1 << 21) +#define SCTL_SCPEREN0_MCI2 (1 << 22) +#define SCTL_SCPEREN0_MCI3 (1 << 23) +#define SCTL_SCPEREN0_SATA (1 << 24) +#define SCTL_SCPEREN0_ETHERNET (1 << 25) +#define 
SCTL_SCPEREN0_VIC (1 << 26) +#define SCTL_SCPEREN0_DMA_AUDIO (1 << 27) +#define SCTL_SCPEREN0_DMA_SOC (1 << 28) +#define SCTL_SCPEREN0_RAM (1 << 29) +#define SCTL_SCPEREN0_VIP (1 << 30) +#define SCTL_SCPEREN0_ARM (1 << 31) + +#define SCTL_SCPEREN1_UART0 (1 << 0) +#define SCTL_SCPEREN1_UART1 (1 << 1) +#define SCTL_SCPEREN1_UART2 (1 << 2) +#define SCTL_SCPEREN1_UART3 (1 << 3) +#define SCTL_SCPEREN1_MSP0 (1 << 4) +#define SCTL_SCPEREN1_MSP1 (1 << 5) +#define SCTL_SCPEREN1_MSP2 (1 << 6) +#define SCTL_SCPEREN1_MSP3 (1 << 7) +#define SCTL_SCPEREN1_MSP4 (1 << 8) +#define SCTL_SCPEREN1_MSP5 (1 << 9) +#define SCTL_SCPEREN1_SPI0 (1 << 10) +#define SCTL_SCPEREN1_SPI1 (1 << 11) +#define SCTL_SCPEREN1_SPI2 (1 << 12) +#define SCTL_SCPEREN1_I2C0 (1 << 13) +#define SCTL_SCPEREN1_I2C1 (1 << 14) +#define SCTL_SCPEREN1_I2C2 (1 << 15) +#define SCTL_SCPEREN1_I2C3 (1 << 16) +#define SCTL_SCPEREN1_USB_PHY (1 << 17) + +#endif /* __STA2X11_MFD_H */ -- cgit v1.2.3 From c5403aed044e23f8d1ecdf05d0ff120314186527 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Wed, 9 May 2012 23:25:06 +0300 Subject: x86, realmode: build fix: remove duplicate build Real-mode binary was built twice. This patch fixes the issue by making realmode.relocs as target for realmode.bin. [ hpa: removed the direct dependency on realmode.relocs in arch/x86/realmode/Makefile ] Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336595106-21135-1-git-send-email-jarkko.sakkinen@intel.com Cc: Sam Ravnborg Cc: Michal Marek Signed-off-by: H. 
Peter Anvin --- arch/x86/realmode/Makefile | 5 +---- arch/x86/realmode/rm/Makefile | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile index f22a4f8d99d6..a05b3aca64ad 100644 --- a/arch/x86/realmode/Makefile +++ b/arch/x86/realmode/Makefile @@ -11,10 +11,7 @@ subdir- := rm obj-y += rmpiggy.o -$(obj)/rmpiggy.o: $(obj)/rm/realmode.relocs $(obj)/rm/realmode.bin +$(obj)/rmpiggy.o: $(obj)/rm/realmode.bin $(obj)/rm/realmode.bin: FORCE $(Q)$(MAKE) $(build)=$(obj)/rm $@ - -$(obj)/rm/realmode.relocs: FORCE - $(Q)$(MAKE) $(build)=$(obj)/rm $@ diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index de40bc44b92f..1c1d3d3bbee4 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -48,7 +48,7 @@ $(obj)/realmode.elf: $(obj)/realmode.lds $(REALMODE_OBJS) FORCE OBJCOPYFLAGS_realmode.bin := -O binary -$(obj)/realmode.bin: $(obj)/realmode.elf +$(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs $(call if_changed,objcopy) quiet_cmd_relocs = RELOCS $@ -- cgit v1.2.3 From 0f6f11eb00830fa691c16084048f53d83c5c3a5d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 9 May 2012 14:53:01 -0700 Subject: x86, realmode: Make sure all generated files are listed in targets Kbuild expects all generated files to be listed in the targets variable. If it isn't, weird things happen. Cc: Sam Ravnborg Cc: Michal Marek Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/r/1336595106-21135-1-git-send-email-jarkko.sakkinen@intel.com --- arch/x86/realmode/rm/Makefile | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 1c1d3d3bbee4..5b84a2d30888 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -7,13 +7,7 @@ # # -always := realmode.bin - -realmode-y += header.o -realmode-y += trampoline_$(BITS).o -realmode-y += stack.o -realmode-$(CONFIG_X86_32) += reboot_32.o -realmode-$(CONFIG_ACPI_SLEEP) += $(wakeup-objs) +always := realmode.bin realmode.relocs wakeup-objs := wakeup_asm.o wakemain.o video-mode.o wakeup-objs += copy.o bioscall.o regs.o @@ -25,6 +19,12 @@ wakeup-objs += video-vga.o wakeup-objs += video-vesa.o wakeup-objs += video-bios.o +realmode-y += header.o +realmode-y += trampoline_$(BITS).o +realmode-y += stack.o +realmode-$(CONFIG_X86_32) += reboot_32.o +realmode-$(CONFIG_ACPI_SLEEP) += $(wakeup-objs) + targets += $(realmode-y) REALMODE_OBJS = $(addprefix $(obj)/,$(realmode-y)) @@ -35,24 +35,30 @@ quiet_cmd_pasyms = PASYMS $@ cmd_pasyms = $(NM) $(filter-out FORCE,$^) | \ sed $(sed-pasyms) | sort | uniq > $@ +targets += pasyms.h $(obj)/pasyms.h: $(REALMODE_OBJS) FORCE $(call if_changed,pasyms) +targets += realmode.lds $(obj)/realmode.lds: $(obj)/pasyms.h LDFLAGS_realmode.elf := --emit-relocs -T CPPFLAGS_realmode.lds += -P -C -I$(obj) +targets += realmode.elf $(obj)/realmode.elf: $(obj)/realmode.lds $(REALMODE_OBJS) FORCE $(call if_changed,ld) OBJCOPYFLAGS_realmode.bin := -O binary +targets += realmode.bin $(obj)/realmode.bin: $(obj)/realmode.elf $(obj)/realmode.relocs $(call if_changed,objcopy) quiet_cmd_relocs = RELOCS $@ cmd_relocs = arch/x86/tools/relocs --realmode $< > $@ + +targets += realmode.relocs $(obj)/realmode.relocs: $(obj)/realmode.elf FORCE $(call if_changed,relocs) -- cgit v1.2.3 From 
34d0b02e08470c56a411ba6da1f377bc6da02826 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Thu, 10 May 2012 10:11:38 +0300 Subject: x86, realmode: Fix no cache bits test in reboot_32.S Before the new real-mode code infrastructure %edx was used for testing CD and NW bits with andl in order to decide whether to flush the processor caches or not. The value of cr0 was also stored in %eax, which was later used to set cr0 after masking out lower byte (except TS bit) in order to enter real-mode. In the new real-mode code infrastructure we wanted to keep input parameter in %eax so we are using %edx for both cr0 cases. This has caused regression since andl overwrites the value of %edx. This patch fixes the issue by replacing andl with testl, which is essentially andl without writing result to the register. Special thanks to Paolo Bonzini for noting this and proposing a fix. Reported-and-tested-by: Paolo Bonzini Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1336633898-23743-1-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/realmode/rm/reboot_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/reboot_32.S b/arch/x86/realmode/rm/reboot_32.S index 8d9bfd13a93e..114044876b3d 100644 --- a/arch/x86/realmode/rm/reboot_32.S +++ b/arch/x86/realmode/rm/reboot_32.S @@ -76,7 +76,7 @@ machine_real_restart_asm16: movl %edx, %cr0 movl %ecx, %cr3 movl %cr0, %edx - andl $0x60000000, %edx /* If no cache bits -> no wbinvd */ + testl $0x60000000, %edx /* If no cache bits -> no wbinvd */ jz 2f wbinvd 2: -- cgit v1.2.3 From 7563bbf89d065a2c3f05059ecbcc805645edcc62 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Sun, 15 Apr 2012 10:52:54 +0100 Subject: gpiolib/arches: Centralise boilerplate asm/gpio.h Rather than requiring architectures that use gpiolib but don't have any need to define anything custom to copy an asm/gpio.h, provide a Kconfig symbol which architectures must select in order to include gpio.h and for other architectures just provide the trivial implementation directly. This makes it much easier to do gpiolib updates and is also a step towards making gpiolib APIs available on every architecture. For architectures with existing boilerplate code leave a stub header in place which warns on direct inclusion of asm/gpio.h and includes linux/gpio.h to catch code that's doing this. Direct inclusion of asm/gpio.h has long been deprecated. 
Signed-off-by: Mark Brown Acked-by: Jonas Bonn Acked-by: Tony Luck Acked-by: Linus Walleij Signed-off-by: Grant Likely --- arch/alpha/include/asm/gpio.h | 59 +++----------------------------- arch/arm/Kconfig | 1 + arch/avr32/Kconfig | 1 + arch/blackfin/Kconfig | 1 + arch/ia64/include/asm/gpio.h | 59 +++----------------------------- arch/m68k/Kconfig.cpu | 1 + arch/microblaze/include/asm/gpio.h | 57 +++---------------------------- arch/mips/Kconfig | 1 + arch/openrisc/include/asm/gpio.h | 69 +++----------------------------------- arch/powerpc/include/asm/gpio.h | 57 +++---------------------------- arch/sh/Kconfig | 1 + arch/sparc/include/asm/gpio.h | 40 +++------------------- arch/unicore32/Kconfig | 1 + arch/x86/include/asm/gpio.h | 57 +++---------------------------- arch/xtensa/include/asm/gpio.h | 60 +++------------------------------ drivers/gpio/Kconfig | 8 +++++ include/linux/gpio.h | 34 +++++++++++++++++++ 17 files changed, 81 insertions(+), 426 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/gpio.h b/arch/alpha/include/asm/gpio.h index 7dc6a6343c06..b3799d88ffcf 100644 --- a/arch/alpha/include/asm/gpio.h +++ b/arch/alpha/include/asm/gpio.h @@ -1,55 +1,4 @@ -/* - * Generic GPIO API implementation for Alpha. - * - * A stright copy of that for PowerPC which was: - * - * Copyright (c) 2007-2008 MontaVista Software, Inc. - * - * Author: Anton Vorontsov - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#ifndef _ASM_ALPHA_GPIO_H -#define _ASM_ALPHA_GPIO_H - -#include -#include - -#ifdef CONFIG_GPIOLIB - -/* - * We don't (yet) implement inlined/rapid versions for on-chip gpios. - * Just call gpiolib. 
- */ -static inline int gpio_get_value(unsigned int gpio) -{ - return __gpio_get_value(gpio); -} - -static inline void gpio_set_value(unsigned int gpio, int value) -{ - __gpio_set_value(gpio, value); -} - -static inline int gpio_cansleep(unsigned int gpio) -{ - return __gpio_cansleep(gpio); -} - -static inline int gpio_to_irq(unsigned int gpio) -{ - return __gpio_to_irq(gpio); -} - -static inline int irq_to_gpio(unsigned int irq) -{ - return -EINVAL; -} - -#endif /* CONFIG_GPIOLIB */ - -#endif /* _ASM_ALPHA_GPIO_H */ +#ifndef __LINUX_GPIO_H +#warning Include linux/gpio.h instead of asm/gpio.h +#include +#endif diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 36586dba6fa6..777025e773d9 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1,6 +1,7 @@ config ARM bool default y + select ARCH_HAVE_CUSTOM_GPIO_H select HAVE_AOUT select HAVE_DMA_API_DEBUG select HAVE_IDE if PCI || ISA || PCMCIA diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index 3dea7231f637..859b2de4a624 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -11,6 +11,7 @@ config AVR32 select GENERIC_ATOMIC64 select HARDIRQS_SW_RESEND select GENERIC_IRQ_SHOW + select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAVE_NMI_SAFE_CMPXCHG help AVR32 is a high-performance 32-bit RISC microprocessor core, diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index 373a6902d8fa..bf3d80f9738b 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -31,6 +31,7 @@ config BLACKFIN select HAVE_KERNEL_LZO if RAMKERNEL select HAVE_OPROFILE select HAVE_PERF_EVENTS + select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_WANT_OPTIONAL_GPIOLIB select HAVE_GENERIC_HARDIRQS select GENERIC_ATOMIC64 diff --git a/arch/ia64/include/asm/gpio.h b/arch/ia64/include/asm/gpio.h index 590a20debc4e..b3799d88ffcf 100644 --- a/arch/ia64/include/asm/gpio.h +++ b/arch/ia64/include/asm/gpio.h @@ -1,55 +1,4 @@ -/* - * Generic GPIO API implementation for IA-64. 
- * - * A stright copy of that for PowerPC which was: - * - * Copyright (c) 2007-2008 MontaVista Software, Inc. - * - * Author: Anton Vorontsov - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#ifndef _ASM_IA64_GPIO_H -#define _ASM_IA64_GPIO_H - -#include -#include - -#ifdef CONFIG_GPIOLIB - -/* - * We don't (yet) implement inlined/rapid versions for on-chip gpios. - * Just call gpiolib. - */ -static inline int gpio_get_value(unsigned int gpio) -{ - return __gpio_get_value(gpio); -} - -static inline void gpio_set_value(unsigned int gpio, int value) -{ - __gpio_set_value(gpio, value); -} - -static inline int gpio_cansleep(unsigned int gpio) -{ - return __gpio_cansleep(gpio); -} - -static inline int gpio_to_irq(unsigned int gpio) -{ - return __gpio_to_irq(gpio); -} - -static inline int irq_to_gpio(unsigned int irq) -{ - return -EINVAL; -} - -#endif /* CONFIG_GPIOLIB */ - -#endif /* _ASM_IA64_GPIO_H */ +#ifndef __LINUX_GPIO_H +#warning Include linux/gpio.h instead of asm/gpio.h +#include +#endif diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu index 8a9c767125a4..8941af1d3ad2 100644 --- a/arch/m68k/Kconfig.cpu +++ b/arch/m68k/Kconfig.cpu @@ -24,6 +24,7 @@ config COLDFIRE bool "Coldfire CPU family support" select GENERIC_GPIO select ARCH_REQUIRE_GPIOLIB + select ARCH_HAVE_CUSTOM_GPIO_H select CPU_HAS_NO_BITFIELDS select CPU_HAS_NO_MULDIV64 select GENERIC_CSUM diff --git a/arch/microblaze/include/asm/gpio.h b/arch/microblaze/include/asm/gpio.h index 2b2c18be71c6..b3799d88ffcf 100644 --- a/arch/microblaze/include/asm/gpio.h +++ b/arch/microblaze/include/asm/gpio.h @@ -1,53 +1,4 @@ -/* - * Generic GPIO API implementation for PowerPC. - * - * Copyright (c) 2007-2008 MontaVista Software, Inc. 
- * - * Author: Anton Vorontsov - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#ifndef _ASM_MICROBLAZE_GPIO_H -#define _ASM_MICROBLAZE_GPIO_H - -#include -#include - -#ifdef CONFIG_GPIOLIB - -/* - * We don't (yet) implement inlined/rapid versions for on-chip gpios. - * Just call gpiolib. - */ -static inline int gpio_get_value(unsigned int gpio) -{ - return __gpio_get_value(gpio); -} - -static inline void gpio_set_value(unsigned int gpio, int value) -{ - __gpio_set_value(gpio, value); -} - -static inline int gpio_cansleep(unsigned int gpio) -{ - return __gpio_cansleep(gpio); -} - -static inline int gpio_to_irq(unsigned int gpio) -{ - return __gpio_to_irq(gpio); -} - -static inline int irq_to_gpio(unsigned int irq) -{ - return -EINVAL; -} - -#endif /* CONFIG_GPIOLIB */ - -#endif /* _ASM_MICROBLAZE_GPIO_H */ +#ifndef __LINUX_GPIO_H +#warning Include linux/gpio.h instead of asm/gpio.h +#include +#endif diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index ce30e2f91d77..63321b283fe4 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -8,6 +8,7 @@ config MIPS select HAVE_PERF_EVENTS select PERF_USE_VMALLOC select HAVE_ARCH_KGDB + select ARCH_HAVE_CUSTOM_GPIO_H select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_DYNAMIC_FTRACE diff --git a/arch/openrisc/include/asm/gpio.h b/arch/openrisc/include/asm/gpio.h index 0b0d174f47cd..b3799d88ffcf 100644 --- a/arch/openrisc/include/asm/gpio.h +++ b/arch/openrisc/include/asm/gpio.h @@ -1,65 +1,4 @@ -/* - * OpenRISC Linux - * - * Linux architectural port borrowing liberally from similar works of - * others. All original copyrights apply as per the original source - * declaration. 
- * - * OpenRISC implementation: - * Copyright (C) 2003 Matjaz Breskvar - * Copyright (C) 2010-2011 Jonas Bonn - * et al. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#ifndef __ASM_OPENRISC_GPIO_H -#define __ASM_OPENRISC_GPIO_H - -#include -#include - -#ifdef CONFIG_GPIOLIB - -/* - * OpenRISC (or1k) does not have on-chip GPIO's so there is not really - * any standardized implementation that makes sense here. If passing - * through gpiolib becomes a bottleneck then it may make sense, on a - * case-by-case basis, to implement these inlined/rapid versions. - * - * Just call gpiolib. - */ -static inline int gpio_get_value(unsigned int gpio) -{ - return __gpio_get_value(gpio); -} - -static inline void gpio_set_value(unsigned int gpio, int value) -{ - __gpio_set_value(gpio, value); -} - -static inline int gpio_cansleep(unsigned int gpio) -{ - return __gpio_cansleep(gpio); -} - -/* - * Not implemented, yet. - */ -static inline int gpio_to_irq(unsigned int gpio) -{ - return -ENOSYS; -} - -static inline int irq_to_gpio(unsigned int irq) -{ - return -EINVAL; -} - -#endif /* CONFIG_GPIOLIB */ - -#endif /* __ASM_OPENRISC_GPIO_H */ +#ifndef __LINUX_GPIO_H +#warning Include linux/gpio.h instead of asm/gpio.h +#include +#endif diff --git a/arch/powerpc/include/asm/gpio.h b/arch/powerpc/include/asm/gpio.h index 38762edb5e58..b3799d88ffcf 100644 --- a/arch/powerpc/include/asm/gpio.h +++ b/arch/powerpc/include/asm/gpio.h @@ -1,53 +1,4 @@ -/* - * Generic GPIO API implementation for PowerPC. - * - * Copyright (c) 2007-2008 MontaVista Software, Inc. 
- * - * Author: Anton Vorontsov - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#ifndef __ASM_POWERPC_GPIO_H -#define __ASM_POWERPC_GPIO_H - -#include -#include - -#ifdef CONFIG_GPIOLIB - -/* - * We don't (yet) implement inlined/rapid versions for on-chip gpios. - * Just call gpiolib. - */ -static inline int gpio_get_value(unsigned int gpio) -{ - return __gpio_get_value(gpio); -} - -static inline void gpio_set_value(unsigned int gpio, int value) -{ - __gpio_set_value(gpio, value); -} - -static inline int gpio_cansleep(unsigned int gpio) -{ - return __gpio_cansleep(gpio); -} - -static inline int gpio_to_irq(unsigned int gpio) -{ - return __gpio_to_irq(gpio); -} - -static inline int irq_to_gpio(unsigned int irq) -{ - return -EINVAL; -} - -#endif /* CONFIG_GPIOLIB */ - -#endif /* __ASM_POWERPC_GPIO_H */ +#ifndef __LINUX_GPIO_H +#warning Include linux/gpio.h instead of asm/gpio.h +#include +#endif diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index ff9e033ce626..c40b29ac3644 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -13,6 +13,7 @@ config SUPERH select HAVE_DMA_ATTRS select HAVE_IRQ_WORK select HAVE_PERF_EVENTS + select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) select PERF_USE_VMALLOC select HAVE_KERNEL_GZIP diff --git a/arch/sparc/include/asm/gpio.h b/arch/sparc/include/asm/gpio.h index a0e3ac0af599..b3799d88ffcf 100644 --- a/arch/sparc/include/asm/gpio.h +++ b/arch/sparc/include/asm/gpio.h @@ -1,36 +1,4 @@ -#ifndef __ASM_SPARC_GPIO_H -#define __ASM_SPARC_GPIO_H - -#include -#include - -#ifdef CONFIG_GPIOLIB - -static inline int gpio_get_value(unsigned int gpio) -{ - return __gpio_get_value(gpio); -} - -static inline void gpio_set_value(unsigned int gpio, int value) -{ - __gpio_set_value(gpio, 
value); -} - -static inline int gpio_cansleep(unsigned int gpio) -{ - return __gpio_cansleep(gpio); -} - -static inline int gpio_to_irq(unsigned int gpio) -{ - return -ENOSYS; -} - -static inline int irq_to_gpio(unsigned int irq) -{ - return -EINVAL; -} - -#endif /* CONFIG_GPIOLIB */ - -#endif /* __ASM_SPARC_GPIO_H */ +#ifndef __LINUX_GPIO_H +#warning Include linux/gpio.h instead of asm/gpio.h +#include +#endif diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index eeb8054c7cd8..7ff6d10c0be2 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -8,6 +8,7 @@ config UNICORE32 select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZO select HAVE_KERNEL_LZMA + select ARCH_HAVE_CUSTOM_GPIO_H select GENERIC_FIND_FIRST_BIT select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW diff --git a/arch/x86/include/asm/gpio.h b/arch/x86/include/asm/gpio.h index 91d915a65259..b3799d88ffcf 100644 --- a/arch/x86/include/asm/gpio.h +++ b/arch/x86/include/asm/gpio.h @@ -1,53 +1,4 @@ -/* - * Generic GPIO API implementation for x86. - * - * Derived from the generic GPIO API for powerpc: - * - * Copyright (c) 2007-2008 MontaVista Software, Inc. - * - * Author: Anton Vorontsov - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#ifndef _ASM_X86_GPIO_H -#define _ASM_X86_GPIO_H - -#include - -#ifdef CONFIG_GPIOLIB - -/* - * Just call gpiolib. 
- */ -static inline int gpio_get_value(unsigned int gpio) -{ - return __gpio_get_value(gpio); -} - -static inline void gpio_set_value(unsigned int gpio, int value) -{ - __gpio_set_value(gpio, value); -} - -static inline int gpio_cansleep(unsigned int gpio) -{ - return __gpio_cansleep(gpio); -} - -static inline int gpio_to_irq(unsigned int gpio) -{ - return __gpio_to_irq(gpio); -} - -static inline int irq_to_gpio(unsigned int irq) -{ - return -EINVAL; -} - -#endif /* CONFIG_GPIOLIB */ - -#endif /* _ASM_X86_GPIO_H */ +#ifndef __LINUX_GPIO_H +#warning Include linux/gpio.h instead of asm/gpio.h +#include +#endif diff --git a/arch/xtensa/include/asm/gpio.h b/arch/xtensa/include/asm/gpio.h index a8c9fc46c790..b3799d88ffcf 100644 --- a/arch/xtensa/include/asm/gpio.h +++ b/arch/xtensa/include/asm/gpio.h @@ -1,56 +1,4 @@ -/* - * Generic GPIO API implementation for xtensa. - * - * Stolen from x86, which is derived from the generic GPIO API for powerpc: - * - * Copyright (c) 2007-2008 MontaVista Software, Inc. - * - * Author: Anton Vorontsov - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#ifndef _ASM_XTENSA_GPIO_H -#define _ASM_XTENSA_GPIO_H - -#include - -#ifdef CONFIG_GPIOLIB - -/* - * Just call gpiolib. - */ -static inline int gpio_get_value(unsigned int gpio) -{ - return __gpio_get_value(gpio); -} - -static inline void gpio_set_value(unsigned int gpio, int value) -{ - __gpio_set_value(gpio, value); -} - -static inline int gpio_cansleep(unsigned int gpio) -{ - return __gpio_cansleep(gpio); -} - -static inline int gpio_to_irq(unsigned int gpio) -{ - return __gpio_to_irq(gpio); -} - -/* - * Not implemented, yet. 
- */ -static inline int irq_to_gpio(unsigned int irq) -{ - return -EINVAL; -} - -#endif /* CONFIG_GPIOLIB */ - -#endif /* _ASM_XTENSA_GPIO_H */ +#ifndef __LINUX_GPIO_H +#warning Include linux/gpio.h instead of asm/gpio.h +#include +#endif diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 5169a99e9f61..25535ebf4f90 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -2,6 +2,14 @@ # GPIO infrastructure and drivers # +config ARCH_HAVE_CUSTOM_GPIO_H + bool + help + Selecting this config option from the architecture Kconfig allows + the architecture to provide a custom asm/gpio.h implementation + overriding the default implementations. New uses of this are + strongly discouraged. + config ARCH_WANT_OPTIONAL_GPIOLIB bool help diff --git a/include/linux/gpio.h b/include/linux/gpio.h index d1890d46b6ce..7a8816a1a0d8 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -1,6 +1,8 @@ #ifndef __LINUX_GPIO_H #define __LINUX_GPIO_H +#include + /* see Documentation/gpio.txt */ /* make these flag values available regardless of GPIO kconfig options */ @@ -38,7 +40,39 @@ struct gpio { }; #ifdef CONFIG_GENERIC_GPIO + +#ifdef CONFIG_ARCH_HAVE_CUSTOM_GPIO_H #include +#else + +#include + +static inline int gpio_get_value(unsigned int gpio) +{ + return __gpio_get_value(gpio); +} + +static inline void gpio_set_value(unsigned int gpio, int value) +{ + __gpio_set_value(gpio, value); +} + +static inline int gpio_cansleep(unsigned int gpio) +{ + return __gpio_cansleep(gpio); +} + +static inline int gpio_to_irq(unsigned int gpio) +{ + return __gpio_to_irq(gpio); +} + +static inline int irq_to_gpio(unsigned int irq) +{ + return -EINVAL; +} + +#endif #else -- cgit v1.2.3 From 5f3fbc342f408199e5cbb4b3dc220569147a99a7 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 14 May 2012 14:58:58 +0800 Subject: KVM: VMX: unlike vmcs on fail path fix: [ 1529.577273] Call Trace: [ 1529.577289] [] kvm_arch_hardware_disable+0x13/0x30 [kvm] [ 1529.577302] [] 
hardware_disable_nolock+0x35/0x39 [kvm] [ 1529.577311] [] ? cpumask_clear_cpu.constprop.31+0x13/0x13 [kvm] [ 1529.577315] [] on_each_cpu+0x44/0x84 [ 1529.577326] [] hardware_disable_all_nolock+0x34/0x36 [kvm] [ 1529.577335] [] hardware_disable_all+0x2b/0x39 [kvm] [ 1529.577349] [] kvm_put_kvm+0xed/0x10f [kvm] [ 1529.577358] [] kvm_vm_release+0x22/0x28 [kvm] Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 61ebdb6390ee..3062ea95266e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6350,7 +6350,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) return &vmx->vcpu; free_vmcs: - free_vmcs(vmx->loaded_vmcs->vmcs); + free_loaded_vmcs(vmx->loaded_vmcs); free_msrs: kfree(vmx->guest_msrs); uninit_vcpu: -- cgit v1.2.3 From d54e4237bcbb400fda11c902fd538aa0b4805720 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 7 May 2012 12:12:25 +0200 Subject: KVM: x86 emulator: convert bsf/bsr instructions to emulate_2op_SrcV_nobyte() The instruction emulation for bsrw is broken in KVM because the code always uses bsr with 32 or 64 bit operand size for emulation. Fix that by using emulate_2op_SrcV_nobyte() macro to use guest operand size for emulation. 
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7fd25763b0e0..f95d242ee9f7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3133,35 +3133,13 @@ static int em_btc(struct x86_emulate_ctxt *ctxt) static int em_bsf(struct x86_emulate_ctxt *ctxt) { - u8 zf; - - __asm__ ("bsf %2, %0; setz %1" - : "=r"(ctxt->dst.val), "=q"(zf) - : "r"(ctxt->src.val)); - - ctxt->eflags &= ~X86_EFLAGS_ZF; - if (zf) { - ctxt->eflags |= X86_EFLAGS_ZF; - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - } + emulate_2op_SrcV_nobyte(ctxt, "bsf"); return X86EMUL_CONTINUE; } static int em_bsr(struct x86_emulate_ctxt *ctxt) { - u8 zf; - - __asm__ ("bsr %2, %0; setz %1" - : "=r"(ctxt->dst.val), "=q"(zf) - : "r"(ctxt->src.val)); - - ctxt->eflags &= ~X86_EFLAGS_ZF; - if (zf) { - ctxt->eflags |= X86_EFLAGS_ZF; - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - } + emulate_2op_SrcV_nobyte(ctxt, "bsr"); return X86EMUL_CONTINUE; } -- cgit v1.2.3 From 5d2b86d90f7cc4a41316cef3d41560da6141f45c Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 11 May 2012 14:41:13 -0400 Subject: Revert "x86, reboot: Use NMI instead of REBOOT_VECTOR to stop cpus" This reverts commit 3603a2512f9e69dc87914ba922eb4a0812b21cd6. Originally I wanted a better hammer to shut down cpus during panic. However, this really steps on the toes of various spinlocks in the panic path. Sometimes it is easier to wait for the IRQ to become re-enabled to indicate the cpu left the critical region and then shut down the cpu. The next patch moves the NMI addition after the IRQ part. To make it easier to see the logic of everything, revert this patch and apply the next simpler patch. 
Signed-off-by: Don Zickus Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1336761675-24296-2-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smp.c | 59 ++------------------------------------------------- 1 file changed, 2 insertions(+), 57 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 66c74f481cab..6d20f523bc4e 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -29,7 +29,6 @@ #include #include #include -#include /* * Some notes on x86 processor bugs affecting SMP operation: * @@ -149,60 +148,6 @@ void native_send_call_func_ipi(const struct cpumask *mask) free_cpumask_var(allbutself); } -static atomic_t stopping_cpu = ATOMIC_INIT(-1); - -static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) -{ - /* We are registered on stopping cpu too, avoid spurious NMI */ - if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) - return NMI_HANDLED; - - stop_this_cpu(NULL); - - return NMI_HANDLED; -} - -static void native_nmi_stop_other_cpus(int wait) -{ - unsigned long flags; - unsigned long timeout; - - if (reboot_force) - return; - - /* - * Use an own vector here because smp_call_function - * does lots of things not suitable in a panic situation. - */ - if (num_online_cpus() > 1) { - /* did someone beat us here? */ - if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1) - return; - - if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, - NMI_FLAG_FIRST, "smp_stop")) - /* Note: we ignore failures here */ - return; - - /* sync above data before sending NMI */ - wmb(); - - apic->send_IPI_allbutself(NMI_VECTOR); - - /* - * Don't wait longer than a second if the caller - * didn't ask us to wait. 
- */ - timeout = USEC_PER_SEC; - while (num_online_cpus() > 1 && (wait || timeout--)) - udelay(1); - } - - local_irq_save(flags); - disable_local_APIC(); - local_irq_restore(flags); -} - /* * this function calls the 'stop' function on all other CPUs in the system. */ @@ -215,7 +160,7 @@ asmlinkage void smp_reboot_interrupt(void) irq_exit(); } -static void native_irq_stop_other_cpus(int wait) +static void native_stop_other_cpus(int wait) { unsigned long flags; unsigned long timeout; @@ -298,7 +243,7 @@ struct smp_ops smp_ops = { .smp_prepare_cpus = native_smp_prepare_cpus, .smp_cpus_done = native_smp_cpus_done, - .stop_other_cpus = native_nmi_stop_other_cpus, + .stop_other_cpus = native_stop_other_cpus, .smp_send_reschedule = native_smp_send_reschedule, .cpu_up = native_cpu_up, -- cgit v1.2.3 From 7d007d21e539dbecb6942c5734e6649f720982cf Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 11 May 2012 14:41:14 -0400 Subject: x86/reboot: Use NMI to assist in shutting down if IRQ fails For v3.3, I added code to use the NMI to stop other cpus in the panic case. The idea was to make sure all cpus on the system were definitely halted to help serialize the panic path to execute the rest of the code on a single cpu. The main problem it was trying to solve was how to stop a cpu that was spinning with its irqs disabled. A IPI irq would be stuck and couldn't get in there, but an NMI could. Things were great until we had another conversation about some pstore changes. Because some of the backend pstore still uses spinlocks to protect the device access, things could get ugly if a panic happened and we were stuck spinning on a lock. Now with the NMI shutting down cpus, we could assume no other cpus were running and just bust the spin lock and proceed. The counter argument was, well if you do that the backend could be in a screwed up state and you might not be able to save anything as a result. 
If we could have just given the cpu a little more time to finish things, we could have grabbed the spin lock cleanly and everything would have been fine. Well, how do you give a cpu a 'little more time' in the panic case? For the most part you can't without spinning on the lock and even in that case, how long do you spin for? So instead of making it ugly in the pstore code, just mimic the idea that stop_machine had, which is to block on an IRQ IPI until the remote cpu has re-enabled interrupts and left the critical region. Which is what happens now using REBOOT_IRQ. Then leave the NMI case for those cpus that are truly stuck after a short time. This leaves the current behaviour alone and just handles a corner case. Most systems should never have to enter the NMI code and if they do, print out a message in case the NMI itself causes another issue. Signed-off-by: Don Zickus Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1336761675-24296-3-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smp.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 6d20f523bc4e..228e7405511a 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -29,6 +29,7 @@ #include #include #include +#include /* * Some notes on x86 processor bugs affecting SMP operation: * @@ -108,6 +109,8 @@ * about nothing of note with C stepping upwards. */ +static atomic_t stopping_cpu = ATOMIC_INIT(-1); + /* * this function sends a 'reschedule' IPI to another CPU. 
* it goes straight through and wastes no time serializing @@ -148,6 +151,17 @@ void native_send_call_func_ipi(const struct cpumask *mask) free_cpumask_var(allbutself); } +static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) +{ + /* We are registered on stopping cpu too, avoid spurious NMI */ + if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) + return NMI_HANDLED; + + stop_this_cpu(NULL); + + return NMI_HANDLED; +} + /* * this function calls the 'stop' function on all other CPUs in the system. */ @@ -171,13 +185,25 @@ static void native_stop_other_cpus(int wait) /* * Use an own vector here because smp_call_function * does lots of things not suitable in a panic situation. - * On most systems we could also use an NMI here, - * but there are a few systems around where NMI - * is problematic so stay with an non NMI for now - * (this implies we cannot stop CPUs spinning with irq off - * currently) + */ + + /* + * We start by using the REBOOT_VECTOR irq. + * The irq is treated as a sync point to allow critical + * regions of code on other cpus to release their spin locks + * and re-enable irqs. Jumping straight to an NMI might + * accidentally cause deadlocks with further shutdown/panic + * code. By syncing, we give the cpus up to one second to + * finish their work before we force them off with the NMI. */ if (num_online_cpus() > 1) { + /* did someone beat us here? 
*/ + if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1) + return; + + /* sync above data before sending IRQ */ + wmb(); + apic->send_IPI_allbutself(REBOOT_VECTOR); /* @@ -188,7 +214,32 @@ static void native_stop_other_cpus(int wait) while (num_online_cpus() > 1 && (wait || timeout--)) udelay(1); } + + /* if the REBOOT_VECTOR didn't work, try with the NMI */ + if ((num_online_cpus() > 1)) { + if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, + NMI_FLAG_FIRST, "smp_stop")) + /* Note: we ignore failures here */ + /* Hope the REBOOT_IRQ is good enough */ + goto finish; + + /* sync above data before sending IRQ */ + wmb(); + + pr_emerg("Shutting down cpus with NMI\n"); + + apic->send_IPI_allbutself(NMI_VECTOR); + + /* + * Don't wait longer than a 10 ms if the caller + * didn't ask us to wait. + */ + timeout = USEC_PER_MSEC * 10; + while (num_online_cpus() > 1 && (wait || timeout--)) + udelay(1); + } +finish: local_irq_save(flags); disable_local_APIC(); local_irq_restore(flags); -- cgit v1.2.3 From 3aac27aba79b7c52e709ef6de0f7d8139caedc01 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 11 May 2012 14:41:15 -0400 Subject: x86/reboot: Update nonmi_ipi parameter Update the nonmi_ipi parameter to reflect the simple change instead of the previous complicated one. There should be less of a need to use it but there may still be corner cases on older hardware that stumble into NMI issues. 
Signed-off-by: Don Zickus Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1336761675-24296-4-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smp.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 228e7405511a..48d2b7ded422 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -110,6 +110,7 @@ */ static atomic_t stopping_cpu = ATOMIC_INIT(-1); +static bool smp_no_nmi_ipi = false; /* * this function sends a 'reschedule' IPI to another CPU. @@ -216,7 +217,7 @@ static void native_stop_other_cpus(int wait) } /* if the REBOOT_VECTOR didn't work, try with the NMI */ - if ((num_online_cpus() > 1)) { + if ((num_online_cpus() > 1) && (!smp_no_nmi_ipi)) { if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, NMI_FLAG_FIRST, "smp_stop")) /* Note: we ignore failures here */ @@ -245,11 +246,6 @@ finish: local_irq_restore(flags); } -static void native_smp_disable_nmi_ipi(void) -{ - smp_ops.stop_other_cpus = native_irq_stop_other_cpus; -} - /* * Reschedule call back. 
*/ @@ -283,8 +279,8 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) static int __init nonmi_ipi_setup(char *str) { - native_smp_disable_nmi_ipi(); - return 1; + smp_no_nmi_ipi = true; + return 1; } __setup("nonmi_ipi", nonmi_ipi_setup); -- cgit v1.2.3 From 978da300c7a65494692b329a6a4cbf364afc37c5 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 11 May 2012 11:44:59 +0200 Subject: perf/x86/ibs: Fix undefined reference to `get_ibs_caps' Fixing i386 allnoconfig built errors: arch/x86/built-in.o: In function `amd_pmu_hw_config': perf_event_amd.c:(.text+0xc3e1): undefined reference to `get_ibs_caps' Reported-by: Andrew Morton Signed-off-by: Robert Richter Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 4e40a64315c9..588f52ea810e 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -188,7 +188,11 @@ struct x86_pmu_capability { #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ #define IBS_RIP_INVALID (1ULL<<38) +#ifdef CONFIG_X86_LOCAL_APIC extern u32 get_ibs_caps(void); +#else +static inline u32 get_ibs_caps(void) { return 0; } +#endif #ifdef CONFIG_PERF_EVENTS extern void perf_events_lapic_init(void); -- cgit v1.2.3 From ead91d4b8c3b1fb08a73aaa4a191230ecf717ee0 Mon Sep 17 00:00:00 2001 From: Shai Fultheim Date: Mon, 16 Apr 2012 10:39:35 +0300 Subject: x86/vsmp: Fix number of CPUs when vsmp is disabled In case CONFIG_X86_VSMP is not set, limit the number of CPUs to the number of CPUs of the first board. Also make CONFIG_X86_VSMP depend on CONFIG_SMP, as there's little point in having a vsmp machine with a single CPU. 
Signed-off-by: Shai Fultheim [ido@wizery.com: rebased, fixed minor coding-style issues] Signed-off-by: Ido Yariv Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/kernel/vsmp_64.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f9ed801abaf9..d2599a0ea208 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -376,6 +376,7 @@ config X86_VSMP select PARAVIRT depends on X86_64 && PCI depends on X86_EXTENDED_PLATFORM + depends on SMP ---help--- Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is supposed to run on these EM64T-based machines. Only choose this option diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index a1d804bcd483..8eeb55a551b4 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -22,6 +23,8 @@ #include #include +#define TOPOLOGY_REGISTER_OFFSET 0x10 + #if defined CONFIG_PCI && defined CONFIG_PARAVIRT /* * Interrupt control on vSMPowered systems: @@ -149,12 +152,49 @@ int is_vsmp_box(void) return 0; } #endif + +static void __init vsmp_cap_cpus(void) +{ +#if !defined(CONFIG_X86_VSMP) && defined(CONFIG_SMP) + void __iomem *address; + unsigned int cfg, topology, node_shift, maxcpus; + + /* + * CONFIG_X86_VSMP is not configured, so limit the number CPUs to the + * ones present in the first board, unless explicitly overridden by + * setup_max_cpus + */ + if (setup_max_cpus != NR_CPUS) + return; + + /* Read the vSMP Foundation topology register */ + cfg = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0); + address = early_ioremap(cfg + TOPOLOGY_REGISTER_OFFSET, 4); + if (WARN_ON(!address)) + return; + + topology = readl(address); + node_shift = (topology >> 16) & 0x7; + if (!node_shift) + /* The value 0 should be decoded as 8 */ + node_shift = 8; + maxcpus = (topology & ((1 << node_shift) - 1)) + 1; + + 
pr_info("vSMP CTL: Capping CPUs to %d (CONFIG_X86_VSMP is unset)\n", + maxcpus); + setup_max_cpus = maxcpus; + early_iounmap(address, 4); +#endif +} + void __init vsmp_init(void) { detect_vsmp_box(); if (!is_vsmp_box()) return; + vsmp_cap_cpus(); + set_vsmp_pv_ops(); return; } -- cgit v1.2.3 From 316ad248307fba13be40f01e92a22b89457c32bc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 May 2012 13:05:59 +0200 Subject: sched/x86: Rewrite set_cpu_sibling_map() Commit ad7687dde ("x86/numa: Check for nonsensical topologies on real hw as well") is broken in that the condition can trigger for valid setups but only changes the end result for invalid setups with no real means of discerning between those. Rewrite set_cpu_sibling_map() to make the code clearer and make sure to only warn when the check changes the end result. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-klcwahu3gx467uhfiqjyhdcs@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 112 +++++++++++++++++++++++++++------------------- 1 file changed, 66 insertions(+), 46 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7c53d96d44ab..e84c1bbea339 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -315,70 +315,90 @@ void __cpuinit smp_store_cpu_info(int id) identify_secondary_cpu(c); } -static void __cpuinit link_thread_siblings(int cpu1, int cpu2) +static bool __cpuinit +topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name) { - cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); - cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1)); - cpumask_set_cpu(cpu1, cpu_core_mask(cpu2)); - cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); - cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2)); - cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1)); + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2), + "sched: CPU #%d's %s-sibling CPU 
#%d is not on the same node! " + "[node: %d != %d]. Ignoring dependency.\n", + cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2)); } +#define link_mask(_m, c1, c2) \ +do { \ + cpumask_set_cpu((c1), cpu_##_m##_mask(c2)); \ + cpumask_set_cpu((c2), cpu_##_m##_mask(c1)); \ +} while (0) -void __cpuinit set_cpu_sibling_map(int cpu) +static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { - int i; - struct cpuinfo_x86 *c = &cpu_data(cpu); + if (cpu_has(c, X86_FEATURE_TOPOEXT)) { + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; - cpumask_set_cpu(cpu, cpu_sibling_setup_mask); + if (c->phys_proc_id == o->phys_proc_id && + per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) && + c->compute_unit_id == o->compute_unit_id) + return topology_sane(c, o, "smt"); - if (smp_num_siblings > 1) { - for_each_cpu(i, cpu_sibling_setup_mask) { - struct cpuinfo_x86 *o = &cpu_data(i); + } else if (c->phys_proc_id == o->phys_proc_id && + c->cpu_core_id == o->cpu_core_id) { + return topology_sane(c, o, "smt"); + } - if (cpu_to_node(cpu) != cpu_to_node(i)) { - WARN_ONCE(1, "sched: CPU #%d's thread-sibling CPU #%d not on the same node! [node %d != %d]. 
Ignoring sibling dependency.\n", cpu, i, cpu_to_node(cpu), cpu_to_node(i)); - continue; - } + return false; +} - if (cpu_has(c, X86_FEATURE_TOPOEXT)) { - if (c->phys_proc_id == o->phys_proc_id && - per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) && - c->compute_unit_id == o->compute_unit_id) - link_thread_siblings(cpu, i); - } else if (c->phys_proc_id == o->phys_proc_id && - c->cpu_core_id == o->cpu_core_id) { - link_thread_siblings(cpu, i); - } - } - } else { - cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); - } +static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID && + per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) + return topology_sane(c, o, "llc"); + + return false; +} + +static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (c->phys_proc_id == o->phys_proc_id) + return topology_sane(c, o, "mc"); + + return false; +} + +void __cpuinit set_cpu_sibling_map(int cpu) +{ + bool has_mc = boot_cpu_data.x86_max_cores > 1; + bool has_smt = smp_num_siblings > 1; + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct cpuinfo_x86 *o; + int i; - cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); + cpumask_set_cpu(cpu, cpu_sibling_setup_mask); - if (__this_cpu_read(cpu_info.x86_max_cores) == 1) { - cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); + if (!has_smt && !has_mc) { + cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); + cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); + cpumask_set_cpu(cpu, cpu_core_mask(cpu)); c->booted_cores = 1; return; } for_each_cpu(i, cpu_sibling_setup_mask) { - if (cpu_to_node(cpu) != cpu_to_node(i)) { - WARN_ONCE(1, "sched: CPU #%d's core-sibling CPU #%d not on the same node! [node %d != %d]. 
Ignoring sibling dependency.\n", cpu, i, cpu_to_node(cpu), cpu_to_node(i)); - continue; - } + o = &cpu_data(i); - if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && - per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { - cpumask_set_cpu(i, cpu_llc_shared_mask(cpu)); - cpumask_set_cpu(cpu, cpu_llc_shared_mask(i)); - } + if ((i == cpu) || (has_smt && match_smt(c, o))) + link_mask(sibling, cpu, i); + + if ((i == cpu) || (has_mc && match_llc(c, o))) + link_mask(llc_shared, cpu, i); + + if ((i == cpu) || (has_mc && match_mc(c, o))) { + link_mask(core, cpu, i); - if (c->phys_proc_id == cpu_data(i).phys_proc_id) { - cpumask_set_cpu(i, cpu_core_mask(cpu)); - cpumask_set_cpu(cpu, cpu_core_mask(i)); /* * Does this new cpu bringup a new core? */ -- cgit v1.2.3 From c6ae41e7d469f00d9c92a2b2887c7235d121c009 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 11 May 2012 15:35:27 +0800 Subject: x86: replace percpu_xxx funcs with this_cpu_xxx The percpu_xxx() family of functions duplicates this_cpu_xxx(). Remove the percpu_xxx() definitions and replace their uses with this_cpu_xxx() in code. There is no functional change in this patch; it is just preparation for the later removal of the percpu_xxx() family. On x86 the this_cpu_xxx() functions are the same as __this_cpu_xxx(), without any unnecessary preempt enable/disable. Thanks to Stephen Rothwell, who found and fixed an i386 build error in the patch. Also thanks to Andrew Morton, who kept updating the patchset in Linus' tree. Signed-off-by: Alex Shi Acked-by: Christoph Lameter Acked-by: Tejun Heo Acked-by: "H.
Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Tejun Heo --- arch/x86/include/asm/compat.h | 2 +- arch/x86/include/asm/current.h | 2 +- arch/x86/include/asm/desc.h | 1 + arch/x86/include/asm/fpu-internal.h | 6 +++--- arch/x86/include/asm/hardirq.h | 9 +++++---- arch/x86/include/asm/irq_regs.h | 4 ++-- arch/x86/include/asm/mmu_context.h | 12 ++++++------ arch/x86/include/asm/percpu.h | 8 ++++---- arch/x86/include/asm/smp.h | 4 ++-- arch/x86/include/asm/stackprotector.h | 4 ++-- arch/x86/include/asm/thread_info.h | 2 +- arch/x86/include/asm/tlbflush.h | 4 ++-- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- arch/x86/kernel/i387.c | 2 +- arch/x86/kernel/nmi_selftest.c | 1 + arch/x86/kernel/paravirt.c | 12 ++++++------ arch/x86/kernel/process.c | 2 +- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 10 +++++----- arch/x86/mm/tlb.c | 10 +++++----- include/linux/topology.h | 4 ++-- 22 files changed, 55 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index d6805798d6fc..fedf32b73e65 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -229,7 +229,7 @@ static inline void __user *arch_compat_alloc_user_space(long len) sp = task_pt_regs(current)->sp; } else { /* -128 for the x32 ABI redzone */ - sp = percpu_read(old_rsp) - 128; + sp = this_cpu_read(old_rsp) - 128; } return (void __user *)round_down(sp - len, 16); diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index 4d447b732d82..9476c04ee635 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -11,7 +11,7 @@ DECLARE_PER_CPU(struct task_struct *, current_task); static __always_inline struct task_struct *get_current(void) { - return percpu_read_stable(current_task); + return this_cpu_read_stable(current_task); } #define current 
get_current() diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index e95822d683f4..8bf1c06070d5 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -6,6 +6,7 @@ #include #include +#include static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info) { diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 4fa88154e4de..75f4c6d6a331 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -290,14 +290,14 @@ static inline int __thread_has_fpu(struct task_struct *tsk) static inline void __thread_clear_has_fpu(struct task_struct *tsk) { tsk->thread.fpu.has_fpu = 0; - percpu_write(fpu_owner_task, NULL); + this_cpu_write(fpu_owner_task, NULL); } /* Must be paired with a 'clts' before! */ static inline void __thread_set_has_fpu(struct task_struct *tsk) { tsk->thread.fpu.has_fpu = 1; - percpu_write(fpu_owner_task, tsk); + this_cpu_write(fpu_owner_task, tsk); } /* @@ -344,7 +344,7 @@ typedef struct { int preload; } fpu_switch_t; */ static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) { - return new == percpu_read_stable(fpu_owner_task) && + return new == this_cpu_read_stable(fpu_owner_task) && cpu == new->thread.fpu.last_cpu; } diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 382f75d735f3..d3895dbf4ddb 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -35,14 +35,15 @@ DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); #define __ARCH_IRQ_STAT -#define inc_irq_stat(member) percpu_inc(irq_stat.member) +#define inc_irq_stat(member) this_cpu_inc(irq_stat.member) -#define local_softirq_pending() percpu_read(irq_stat.__softirq_pending) +#define local_softirq_pending() this_cpu_read(irq_stat.__softirq_pending) #define __ARCH_SET_SOFTIRQ_PENDING -#define set_softirq_pending(x) percpu_write(irq_stat.__softirq_pending, (x)) -#define 
or_softirq_pending(x) percpu_or(irq_stat.__softirq_pending, (x)) +#define set_softirq_pending(x) \ + this_cpu_write(irq_stat.__softirq_pending, (x)) +#define or_softirq_pending(x) this_cpu_or(irq_stat.__softirq_pending, (x)) extern void ack_bad_irq(unsigned int irq); diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h index 77843225b7ea..d82250b1debb 100644 --- a/arch/x86/include/asm/irq_regs.h +++ b/arch/x86/include/asm/irq_regs.h @@ -15,7 +15,7 @@ DECLARE_PER_CPU(struct pt_regs *, irq_regs); static inline struct pt_regs *get_irq_regs(void) { - return percpu_read(irq_regs); + return this_cpu_read(irq_regs); } static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs) @@ -23,7 +23,7 @@ static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs) struct pt_regs *old_regs; old_regs = get_irq_regs(); - percpu_write(irq_regs, new_regs); + this_cpu_write(irq_regs, new_regs); return old_regs; } diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 69021528b43c..cdbf36776106 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -25,8 +25,8 @@ void destroy_context(struct mm_struct *mm); static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { #ifdef CONFIG_SMP - if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) - percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); + if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) + this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); #endif } @@ -37,8 +37,8 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, if (likely(prev != next)) { #ifdef CONFIG_SMP - percpu_write(cpu_tlbstate.state, TLBSTATE_OK); - percpu_write(cpu_tlbstate.active_mm, next); + this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + this_cpu_write(cpu_tlbstate.active_mm, next); #endif cpumask_set_cpu(cpu, mm_cpumask(next)); @@ -56,8 +56,8 @@ static inline void switch_mm(struct mm_struct *prev, struct 
mm_struct *next, } #ifdef CONFIG_SMP else { - percpu_write(cpu_tlbstate.state, TLBSTATE_OK); - BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next); + this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) { /* We were in lazy tlb mode and leave_mm disabled diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 7a11910a63c4..967ee3be5c0a 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -46,7 +46,7 @@ #ifdef CONFIG_SMP #define __percpu_prefix "%%"__stringify(__percpu_seg)":" -#define __my_cpu_offset percpu_read(this_cpu_off) +#define __my_cpu_offset this_cpu_read(this_cpu_off) /* * Compared to the generic __my_cpu_offset version, the following @@ -352,15 +352,15 @@ do { \ /* * percpu_read() makes gcc load the percpu variable every time it is - * accessed while percpu_read_stable() allows the value to be cached. - * percpu_read_stable() is more efficient and can be used if its value + * accessed while this_cpu_read_stable() allows the value to be cached. + * this_cpu_read_stable() is more efficient and can be used if its value * is guaranteed to be valid across cpus. The current users include * get_current() and get_thread_info() both of which are actually * per-thread variables implemented as per-cpu variables and thus * stable for the duration of the respective task. 
*/ #define percpu_read(var) percpu_from_op("mov", var, "m" (var)) -#define percpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var))) +#define this_cpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var))) #define percpu_write(var, val) percpu_to_op("mov", var, val) #define percpu_add(var, val) percpu_add_op(var, val) #define percpu_sub(var, val) percpu_add_op(var, -(val)) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 0434c400287c..e276f6bb6524 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -188,11 +188,11 @@ extern unsigned disabled_cpus __cpuinitdata; * from the initial startup. We map APIC_BASE very early in page_setup(), * so this is correct in the x86 case. */ -#define raw_smp_processor_id() (percpu_read(cpu_number)) +#define raw_smp_processor_id() (this_cpu_read(cpu_number)) extern int safe_smp_processor_id(void); #elif defined(CONFIG_X86_64_SMP) -#define raw_smp_processor_id() (percpu_read(cpu_number)) +#define raw_smp_processor_id() (this_cpu_read(cpu_number)) #define stack_smp_processor_id() \ ({ \ diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index b5d9533d2c38..6a998598f172 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -75,9 +75,9 @@ static __always_inline void boot_init_stack_canary(void) current->stack_canary = canary; #ifdef CONFIG_X86_64 - percpu_write(irq_stack_union.stack_canary, canary); + this_cpu_write(irq_stack_union.stack_canary, canary); #else - percpu_write(stack_canary.canary, canary); + this_cpu_write(stack_canary.canary, canary); #endif } diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index ad6df8ccd715..f67fd89c874b 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -222,7 +222,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack); static inline struct thread_info *current_thread_info(void) { struct 
thread_info *ti; - ti = (void *)(percpu_read_stable(kernel_stack) + + ti = (void *)(this_cpu_read_stable(kernel_stack) + KERNEL_STACK_OFFSET - THREAD_SIZE); return ti; } diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index c0e108e08079..1620d23f14d7 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -156,8 +156,8 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); static inline void reset_lazy_tlbstate(void) { - percpu_write(cpu_tlbstate.state, 0); - percpu_write(cpu_tlbstate.active_mm, &init_mm); + this_cpu_write(cpu_tlbstate.state, 0); + this_cpu_write(cpu_tlbstate.active_mm, &init_mm); } #endif /* SMP */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cf79302198a6..82f29e70d058 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1185,7 +1185,7 @@ void __cpuinit cpu_init(void) oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA - if (cpu != 0 && percpu_read(numa_node) == 0 && + if (cpu != 0 && this_cpu_read(numa_node) == 0 && early_cpu_to_node(cpu) != NUMA_NO_NODE) set_numa_node(early_cpu_to_node(cpu)); #endif diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d086a09c087d..c0276d5d9bd4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -583,7 +583,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) struct mce m; int i; - percpu_inc(mce_poll_count); + this_cpu_inc(mce_poll_count); mce_gather_info(&m, NULL); @@ -1015,7 +1015,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) atomic_inc(&mce_entry); - percpu_inc(mce_exception_count); + this_cpu_inc(mce_exception_count); if (!banks) goto out; diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 2d6e6498c176..f250431fb505 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -88,7 +88,7 @@ void kernel_fpu_begin(void) __thread_clear_has_fpu(me); /* We 
do 'stts()' in kernel_fpu_end() */ } else { - percpu_write(fpu_owner_task, NULL); + this_cpu_write(fpu_owner_task, NULL); clts(); } } diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 2c39dcd510fa..ff3698625081 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index ab137605e694..9ce885996fd7 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -241,16 +241,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA static inline void enter_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); + BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); - percpu_write(paravirt_lazy_mode, mode); + this_cpu_write(paravirt_lazy_mode, mode); } static void leave_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(percpu_read(paravirt_lazy_mode) != mode); + BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode); - percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); + this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); } void paravirt_enter_lazy_mmu(void) @@ -267,7 +267,7 @@ void paravirt_start_context_switch(struct task_struct *prev) { BUG_ON(preemptible()); - if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { + if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { arch_leave_lazy_mmu_mode(); set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); } @@ -289,7 +289,7 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) if (in_interrupt()) return PARAVIRT_LAZY_NONE; - return percpu_read(paravirt_lazy_mode); + return this_cpu_read(paravirt_lazy_mode); } void arch_flush_lazy_mmu_mode(void) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 1d92a5ab6e8b..857adffb7080 100644 --- a/arch/x86/kernel/process.c +++ 
b/arch/x86/kernel/process.c @@ -377,7 +377,7 @@ static inline void play_dead(void) #ifdef CONFIG_X86_64 void enter_idle(void) { - percpu_write(is_idle, 1); + this_cpu_write(is_idle, 1); atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index ae6847303e26..01d8d40ccaf6 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -302,7 +302,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) switch_fpu_finish(next_p, fpu); - percpu_write(current_task, next_p); + this_cpu_write(current_task, next_p); return prev_p; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 43d8b48b23e6..28e810255a0a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -237,7 +237,7 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, current->thread.usersp = new_sp; regs->ip = new_ip; regs->sp = new_sp; - percpu_write(old_rsp, new_sp); + this_cpu_write(old_rsp, new_sp); regs->cs = _cs; regs->ss = _ss; regs->flags = X86_EFLAGS_IF; @@ -359,11 +359,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch the PDA and FPU contexts. 
*/ - prev->usersp = percpu_read(old_rsp); - percpu_write(old_rsp, next->usersp); - percpu_write(current_task, next_p); + prev->usersp = this_cpu_read(old_rsp); + this_cpu_write(old_rsp, next->usersp); + this_cpu_write(current_task, next_p); - percpu_write(kernel_stack, + this_cpu_write(kernel_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE - KERNEL_STACK_OFFSET); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index d6c0418c3e47..3804471db104 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -61,10 +61,10 @@ static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset); */ void leave_mm(int cpu) { - if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) + if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) BUG(); cpumask_clear_cpu(cpu, - mm_cpumask(percpu_read(cpu_tlbstate.active_mm))); + mm_cpumask(this_cpu_read(cpu_tlbstate.active_mm))); load_cr3(swapper_pg_dir); } EXPORT_SYMBOL_GPL(leave_mm); @@ -152,8 +152,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs) * BUG(); */ - if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) { - if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { + if (f->flush_mm == this_cpu_read(cpu_tlbstate.active_mm)) { + if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { if (f->flush_va == TLB_FLUSH_ALL) local_flush_tlb(); else @@ -322,7 +322,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) static void do_flush_tlb_all(void *info) { __flush_tlb_all(); - if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) + if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(smp_processor_id()); } diff --git a/include/linux/topology.h b/include/linux/topology.h index e26db031303b..9dc427cdb6ff 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -239,7 +239,7 @@ static inline int cpu_to_node(int cpu) #ifndef set_numa_node static inline void set_numa_node(int node) { - percpu_write(numa_node, node); + this_cpu_write(numa_node, node); } #endif @@ -274,7 +274,7 @@ 
DECLARE_PER_CPU(int, _numa_mem_); #ifndef set_numa_mem static inline void set_numa_mem(int node) { - percpu_write(_numa_mem_, node); + this_cpu_write(_numa_mem_, node); } #endif -- cgit v1.2.3 From 641b695c2f11397bd307ea689d4d3f128360ce49 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Mon, 14 May 2012 14:15:32 -0700 Subject: percpu: remove percpu_xxx() functions Remove percpu_xxx serial functions, all of them were replaced by this_cpu_xxx or __this_cpu_xxx serial functions Signed-off-by: Alex Shi Acked-by: Christoph Lameter Acked-by: Tejun Heo Acked-by: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Tejun Heo --- arch/x86/include/asm/percpu.h | 16 +++++-------- include/linux/percpu.h | 54 ------------------------------------------- 2 files changed, 6 insertions(+), 64 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 967ee3be5c0a..d9b8e3f7f42a 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -351,7 +351,7 @@ do { \ }) /* - * percpu_read() makes gcc load the percpu variable every time it is + * this_cpu_read() makes gcc load the percpu variable every time it is * accessed while this_cpu_read_stable() allows the value to be cached. * this_cpu_read_stable() is more efficient and can be used if its value * is guaranteed to be valid across cpus. The current users include @@ -359,15 +359,7 @@ do { \ * per-thread variables implemented as per-cpu variables and thus * stable for the duration of the respective task. 
*/ -#define percpu_read(var) percpu_from_op("mov", var, "m" (var)) #define this_cpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var))) -#define percpu_write(var, val) percpu_to_op("mov", var, val) -#define percpu_add(var, val) percpu_add_op(var, val) -#define percpu_sub(var, val) percpu_add_op(var, -(val)) -#define percpu_and(var, val) percpu_to_op("and", var, val) -#define percpu_or(var, val) percpu_to_op("or", var, val) -#define percpu_xor(var, val) percpu_to_op("xor", var, val) -#define percpu_inc(var) percpu_unary_op("inc", var) #define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) #define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) @@ -512,7 +504,11 @@ static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr, { unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG; - return ((1UL << (nr % BITS_PER_LONG)) & percpu_read(*a)) != 0; +#ifdef CONFIG_X86_64 + return ((1UL << (nr % BITS_PER_LONG)) & __this_cpu_read_8(*a)) != 0; +#else + return ((1UL << (nr % BITS_PER_LONG)) & __this_cpu_read_4(*a)) != 0; +#endif } static inline int x86_this_cpu_variable_test_bit(int nr, diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 21638ae14e07..2b9f82c037c9 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -165,60 +165,6 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr); #define alloc_percpu(type) \ (typeof(type) __percpu *)__alloc_percpu(sizeof(type), __alignof__(type)) -/* - * Optional methods for optimized non-lvalue per-cpu variable access. - * - * @var can be a percpu variable or a field of it and its size should - * equal char, int or long. percpu_read() evaluates to a lvalue and - * all others to void. - * - * These operations are guaranteed to be atomic. - * The generic versions disable interrupts. Archs are - * encouraged to implement single-instruction alternatives which don't - * require protection. 
- */ -#ifndef percpu_read -# define percpu_read(var) \ - ({ \ - typeof(var) *pr_ptr__ = &(var); \ - typeof(var) pr_ret__; \ - pr_ret__ = get_cpu_var(*pr_ptr__); \ - put_cpu_var(*pr_ptr__); \ - pr_ret__; \ - }) -#endif - -#define __percpu_generic_to_op(var, val, op) \ -do { \ - typeof(var) *pgto_ptr__ = &(var); \ - get_cpu_var(*pgto_ptr__) op val; \ - put_cpu_var(*pgto_ptr__); \ -} while (0) - -#ifndef percpu_write -# define percpu_write(var, val) __percpu_generic_to_op(var, (val), =) -#endif - -#ifndef percpu_add -# define percpu_add(var, val) __percpu_generic_to_op(var, (val), +=) -#endif - -#ifndef percpu_sub -# define percpu_sub(var, val) __percpu_generic_to_op(var, (val), -=) -#endif - -#ifndef percpu_and -# define percpu_and(var, val) __percpu_generic_to_op(var, (val), &=) -#endif - -#ifndef percpu_or -# define percpu_or(var, val) __percpu_generic_to_op(var, (val), |=) -#endif - -#ifndef percpu_xor -# define percpu_xor(var, val) __percpu_generic_to_op(var, (val), ^=) -#endif - /* * Branching function to split up a function into a set of functions that * are called for different scalar sizes of the objects handled. -- cgit v1.2.3 From fa46ccb8eb960c62c1e5e3237085d4007788a345 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Fri, 11 May 2012 16:00:48 +0300 Subject: crypto: aesni-intel - use crypto_[un]register_algs Combine all crypto_alg to be registered and use new crypto_[un]register_algs functions. Simplifies init/exit code and reduce object size. 
Cc: Huang Ying Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 727 ++++++++++++++++--------------------- 1 file changed, 305 insertions(+), 422 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index c799352e24fc..20c622016629 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -222,27 +222,6 @@ static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) } } -static struct crypto_alg aesni_alg = { - .cra_name = "aes", - .cra_driver_name = "aes-aesni", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1, - .cra_alignmask = 0, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(aesni_alg.cra_list), - .cra_u = { - .cipher = { - .cia_min_keysize = AES_MIN_KEY_SIZE, - .cia_max_keysize = AES_MAX_KEY_SIZE, - .cia_setkey = aes_set_key, - .cia_encrypt = aes_encrypt, - .cia_decrypt = aes_decrypt - } - } -}; - static void __aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); @@ -257,27 +236,6 @@ static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) aesni_dec(ctx, dst, src); } -static struct crypto_alg __aesni_alg = { - .cra_name = "__aes-aesni", - .cra_driver_name = "__driver-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1, - .cra_alignmask = 0, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(__aesni_alg.cra_list), - .cra_u = { - .cipher = { - .cia_min_keysize = AES_MIN_KEY_SIZE, - .cia_max_keysize = AES_MAX_KEY_SIZE, - .cia_setkey = aes_set_key, - .cia_encrypt = __aes_encrypt, - .cia_decrypt = __aes_decrypt - } - } -}; - static int ecb_encrypt(struct blkcipher_desc *desc, struct 
scatterlist *dst, struct scatterlist *src, unsigned int nbytes) @@ -326,28 +284,6 @@ static int ecb_decrypt(struct blkcipher_desc *desc, return err; } -static struct crypto_alg blk_ecb_alg = { - .cra_name = "__ecb-aes-aesni", - .cra_driver_name = "__driver-ecb-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1, - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(blk_ecb_alg.cra_list), - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .setkey = aes_set_key, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}; - static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) @@ -396,28 +332,6 @@ static int cbc_decrypt(struct blkcipher_desc *desc, return err; } -static struct crypto_alg blk_cbc_alg = { - .cra_name = "__cbc-aes-aesni", - .cra_driver_name = "__driver-cbc-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1, - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(blk_cbc_alg.cra_list), - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .setkey = aes_set_key, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}; - #ifdef CONFIG_X86_64 static void ctr_crypt_final(struct crypto_aes_ctx *ctx, struct blkcipher_walk *walk) @@ -461,29 +375,6 @@ static int ctr_crypt(struct blkcipher_desc *desc, return err; } - -static struct crypto_alg blk_ctr_alg = { - .cra_name = "__ctr-aes-aesni", - .cra_driver_name = "__driver-ctr-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = 1, - 
.cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1, - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(blk_ctr_alg.cra_list), - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = aes_set_key, - .encrypt = ctr_crypt, - .decrypt = ctr_crypt, - }, - }, -}; #endif static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, @@ -572,30 +463,6 @@ static int ablk_ecb_init(struct crypto_tfm *tfm) return 0; } -static struct crypto_alg ablk_ecb_alg = { - .cra_name = "ecb(aes)", - .cra_driver_name = "ecb-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_aes_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(ablk_ecb_alg.cra_list), - .cra_init = ablk_ecb_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}; - static int ablk_cbc_init(struct crypto_tfm *tfm) { struct cryptd_ablkcipher *cryptd_tfm; @@ -607,31 +474,6 @@ static int ablk_cbc_init(struct crypto_tfm *tfm) return 0; } -static struct crypto_alg ablk_cbc_alg = { - .cra_name = "cbc(aes)", - .cra_driver_name = "cbc-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_aes_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(ablk_cbc_alg.cra_list), - .cra_init = ablk_cbc_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = 
AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}; - #ifdef CONFIG_X86_64 static int ablk_ctr_init(struct crypto_tfm *tfm) { @@ -644,32 +486,6 @@ static int ablk_ctr_init(struct crypto_tfm *tfm) return 0; } -static struct crypto_alg ablk_ctr_alg = { - .cra_name = "ctr(aes)", - .cra_driver_name = "ctr-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_aes_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(ablk_ctr_alg.cra_list), - .cra_init = ablk_ctr_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_encrypt, - .geniv = "chainiv", - }, - }, -}; - #ifdef HAS_CTR static int ablk_rfc3686_ctr_init(struct crypto_tfm *tfm) { @@ -682,32 +498,6 @@ static int ablk_rfc3686_ctr_init(struct crypto_tfm *tfm) ablk_init_common(tfm, cryptd_tfm); return 0; } - -static struct crypto_alg ablk_rfc3686_ctr_alg = { - .cra_name = "rfc3686(ctr(aes))", - .cra_driver_name = "rfc3686-ctr-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_aes_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(ablk_rfc3686_ctr_alg.cra_list), - .cra_init = ablk_rfc3686_ctr_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE+CTR_RFC3686_NONCE_SIZE, - .max_keysize = AES_MAX_KEY_SIZE+CTR_RFC3686_NONCE_SIZE, - .ivsize = CTR_RFC3686_IV_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - .geniv = "seqiv", - }, - }, -}; #endif #endif @@ -723,31 +513,6 @@ 
static int ablk_lrw_init(struct crypto_tfm *tfm) ablk_init_common(tfm, cryptd_tfm); return 0; } - -static struct crypto_alg ablk_lrw_alg = { - .cra_name = "lrw(aes)", - .cra_driver_name = "lrw-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_aes_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(ablk_lrw_alg.cra_list), - .cra_init = ablk_lrw_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE, - .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}; #endif #ifdef HAS_PCBC @@ -762,31 +527,6 @@ static int ablk_pcbc_init(struct crypto_tfm *tfm) ablk_init_common(tfm, cryptd_tfm); return 0; } - -static struct crypto_alg ablk_pcbc_alg = { - .cra_name = "pcbc(aes)", - .cra_driver_name = "pcbc-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_aes_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(ablk_pcbc_alg.cra_list), - .cra_init = ablk_pcbc_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}; #endif #ifdef HAS_XTS @@ -801,31 +541,6 @@ static int ablk_xts_init(struct crypto_tfm *tfm) ablk_init_common(tfm, cryptd_tfm); return 0; } - -static struct crypto_alg ablk_xts_alg = { - .cra_name = "xts(aes)", - .cra_driver_name = "xts-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, - 
.cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_aes_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(ablk_xts_alg.cra_list), - .cra_init = ablk_xts_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = 2 * AES_MIN_KEY_SIZE, - .max_keysize = 2 * AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}; #endif #ifdef CONFIG_X86_64 @@ -1050,32 +765,6 @@ static int rfc4106_decrypt(struct aead_request *req) } } -static struct crypto_alg rfc4106_alg = { - .cra_name = "rfc4106(gcm(aes))", - .cra_driver_name = "rfc4106-gcm-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN, - .cra_alignmask = 0, - .cra_type = &crypto_nivaead_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(rfc4106_alg.cra_list), - .cra_init = rfc4106_init, - .cra_exit = rfc4106_exit, - .cra_u = { - .aead = { - .setkey = rfc4106_set_key, - .setauthsize = rfc4106_set_authsize, - .encrypt = rfc4106_encrypt, - .decrypt = rfc4106_decrypt, - .geniv = "seqiv", - .ivsize = 8, - .maxauthsize = 16, - }, - }, -}; - static int __driver_rfc4106_encrypt(struct aead_request *req) { u8 one_entry_in_sg = 0; @@ -1233,26 +922,316 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) } return retval; } +#endif -static struct crypto_alg __rfc4106_alg = { - .cra_name = "__gcm-aes-aesni", - .cra_driver_name = "__driver-gcm-aes-aesni", +static struct crypto_alg aesni_algs[] = { { + .cra_name = "aes", + .cra_driver_name = "aes-aesni", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx) + + AESNI_ALIGN - 1, + .cra_alignmask = 0, + .cra_module = THIS_MODULE, + .cra_u = { + .cipher = { + 
.cia_min_keysize = AES_MIN_KEY_SIZE, + .cia_max_keysize = AES_MAX_KEY_SIZE, + .cia_setkey = aes_set_key, + .cia_encrypt = aes_encrypt, + .cia_decrypt = aes_decrypt + } + } +}, { + .cra_name = "__aes-aesni", + .cra_driver_name = "__driver-aes-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx) + + AESNI_ALIGN - 1, + .cra_alignmask = 0, + .cra_module = THIS_MODULE, + .cra_u = { + .cipher = { + .cia_min_keysize = AES_MIN_KEY_SIZE, + .cia_max_keysize = AES_MAX_KEY_SIZE, + .cia_setkey = aes_set_key, + .cia_encrypt = __aes_encrypt, + .cia_decrypt = __aes_decrypt + } + } +}, { + .cra_name = "__ecb-aes-aesni", + .cra_driver_name = "__driver-ecb-aes-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx) + + AESNI_ALIGN - 1, + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .setkey = aes_set_key, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}, { + .cra_name = "__cbc-aes-aesni", + .cra_driver_name = "__driver-cbc-aes-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx) + + AESNI_ALIGN - 1, + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .setkey = aes_set_key, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}, { + .cra_name = "ecb(aes)", + .cra_driver_name = "ecb-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_alignmask = 0, + .cra_type = 
&crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_ecb_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "cbc(aes)", + .cra_driver_name = "cbc-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_cbc_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +#ifdef CONFIG_X86_64 +}, { + .cra_name = "__ctr-aes-aesni", + .cra_driver_name = "__driver-ctr-aes-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct crypto_aes_ctx) + + AESNI_ALIGN - 1, + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = aes_set_key, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}, { + .cra_name = "ctr(aes)", + .cra_driver_name = "ctr-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_ctr_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = 
ablk_encrypt, + .decrypt = ablk_encrypt, + .geniv = "chainiv", + }, + }, +}, { + .cra_name = "__gcm-aes-aesni", + .cra_driver_name = "__driver-gcm-aes-aesni", .cra_priority = 0, .cra_flags = CRYPTO_ALG_TYPE_AEAD, .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN, + .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + + AESNI_ALIGN, .cra_alignmask = 0, .cra_type = &crypto_aead_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(__rfc4106_alg.cra_list), .cra_u = { .aead = { .encrypt = __driver_rfc4106_encrypt, .decrypt = __driver_rfc4106_decrypt, }, }, -}; +}, { + .cra_name = "rfc4106(gcm(aes))", + .cra_driver_name = "rfc4106-gcm-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + + AESNI_ALIGN, + .cra_alignmask = 0, + .cra_type = &crypto_nivaead_type, + .cra_module = THIS_MODULE, + .cra_init = rfc4106_init, + .cra_exit = rfc4106_exit, + .cra_u = { + .aead = { + .setkey = rfc4106_set_key, + .setauthsize = rfc4106_set_authsize, + .encrypt = rfc4106_encrypt, + .decrypt = rfc4106_decrypt, + .geniv = "seqiv", + .ivsize = 8, + .maxauthsize = 16, + }, + }, +#ifdef HAS_CTR +}, { + .cra_name = "rfc3686(ctr(aes))", + .cra_driver_name = "rfc3686-ctr-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_rfc3686_ctr_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE + + CTR_RFC3686_NONCE_SIZE, + .max_keysize = AES_MAX_KEY_SIZE + + CTR_RFC3686_NONCE_SIZE, + .ivsize = CTR_RFC3686_IV_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + .geniv = "seqiv", + }, + }, +#endif +#endif +#ifdef HAS_LRW +}, { + .cra_name = "lrw(aes)", + 
.cra_driver_name = "lrw-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_lrw_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE, + .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +#endif +#ifdef HAS_PCBC +}, { + .cra_name = "pcbc(aes)", + .cra_driver_name = "pcbc-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_pcbc_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +#endif +#ifdef HAS_XTS +}, { + .cra_name = "xts(aes)", + .cra_driver_name = "xts-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_xts_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, #endif +} }; static const struct x86_cpu_id aesni_cpu_id[] = { @@ -1263,120 +1242,24 @@ MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); static 
int __init aesni_init(void) { - int err; + int err, i; if (!x86_match_cpu(aesni_cpu_id)) return -ENODEV; - if ((err = crypto_fpu_init())) - goto fpu_err; - if ((err = crypto_register_alg(&aesni_alg))) - goto aes_err; - if ((err = crypto_register_alg(&__aesni_alg))) - goto __aes_err; - if ((err = crypto_register_alg(&blk_ecb_alg))) - goto blk_ecb_err; - if ((err = crypto_register_alg(&blk_cbc_alg))) - goto blk_cbc_err; - if ((err = crypto_register_alg(&ablk_ecb_alg))) - goto ablk_ecb_err; - if ((err = crypto_register_alg(&ablk_cbc_alg))) - goto ablk_cbc_err; -#ifdef CONFIG_X86_64 - if ((err = crypto_register_alg(&blk_ctr_alg))) - goto blk_ctr_err; - if ((err = crypto_register_alg(&ablk_ctr_alg))) - goto ablk_ctr_err; - if ((err = crypto_register_alg(&__rfc4106_alg))) - goto __aead_gcm_err; - if ((err = crypto_register_alg(&rfc4106_alg))) - goto aead_gcm_err; -#ifdef HAS_CTR - if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg))) - goto ablk_rfc3686_ctr_err; -#endif -#endif -#ifdef HAS_LRW - if ((err = crypto_register_alg(&ablk_lrw_alg))) - goto ablk_lrw_err; -#endif -#ifdef HAS_PCBC - if ((err = crypto_register_alg(&ablk_pcbc_alg))) - goto ablk_pcbc_err; -#endif -#ifdef HAS_XTS - if ((err = crypto_register_alg(&ablk_xts_alg))) - goto ablk_xts_err; -#endif - return err; + err = crypto_fpu_init(); + if (err) + return err; -#ifdef HAS_XTS -ablk_xts_err: -#endif -#ifdef HAS_PCBC - crypto_unregister_alg(&ablk_pcbc_alg); -ablk_pcbc_err: -#endif -#ifdef HAS_LRW - crypto_unregister_alg(&ablk_lrw_alg); -ablk_lrw_err: -#endif -#ifdef CONFIG_X86_64 -#ifdef HAS_CTR - crypto_unregister_alg(&ablk_rfc3686_ctr_alg); -ablk_rfc3686_ctr_err: -#endif - crypto_unregister_alg(&rfc4106_alg); -aead_gcm_err: - crypto_unregister_alg(&__rfc4106_alg); -__aead_gcm_err: - crypto_unregister_alg(&ablk_ctr_alg); -ablk_ctr_err: - crypto_unregister_alg(&blk_ctr_alg); -blk_ctr_err: -#endif - crypto_unregister_alg(&ablk_cbc_alg); -ablk_cbc_err: - crypto_unregister_alg(&ablk_ecb_alg); -ablk_ecb_err: - 
crypto_unregister_alg(&blk_cbc_alg); -blk_cbc_err: - crypto_unregister_alg(&blk_ecb_alg); -blk_ecb_err: - crypto_unregister_alg(&__aesni_alg); -__aes_err: - crypto_unregister_alg(&aesni_alg); -aes_err: -fpu_err: - return err; + for (i = 0; i < ARRAY_SIZE(aesni_algs); i++) + INIT_LIST_HEAD(&aesni_algs[i].cra_list); + + return crypto_register_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); } static void __exit aesni_exit(void) { -#ifdef HAS_XTS - crypto_unregister_alg(&ablk_xts_alg); -#endif -#ifdef HAS_PCBC - crypto_unregister_alg(&ablk_pcbc_alg); -#endif -#ifdef HAS_LRW - crypto_unregister_alg(&ablk_lrw_alg); -#endif -#ifdef CONFIG_X86_64 -#ifdef HAS_CTR - crypto_unregister_alg(&ablk_rfc3686_ctr_alg); -#endif - crypto_unregister_alg(&rfc4106_alg); - crypto_unregister_alg(&__rfc4106_alg); - crypto_unregister_alg(&ablk_ctr_alg); - crypto_unregister_alg(&blk_ctr_alg); -#endif - crypto_unregister_alg(&ablk_cbc_alg); - crypto_unregister_alg(&ablk_ecb_alg); - crypto_unregister_alg(&blk_cbc_alg); - crypto_unregister_alg(&blk_ecb_alg); - crypto_unregister_alg(&__aesni_alg); - crypto_unregister_alg(&aesni_alg); + crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); crypto_fpu_exit(); } -- cgit v1.2.3 From ef45b834319f8a18f257a40ba4bce6b829ef1708 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Fri, 11 May 2012 16:00:54 +0300 Subject: crypto: aesni-intel - move more common code to ablk_init_common ablk_*_init functions share more common code than what is currently in ablk_init_common. Move all of the common code to ablk_init_common. 
Cc: Huang Ying Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 70 ++++++++------------------------------ 1 file changed, 15 insertions(+), 55 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 20c622016629..ac7f5cd019e8 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -442,61 +442,42 @@ static void ablk_exit(struct crypto_tfm *tfm) cryptd_free_ablkcipher(ctx->cryptd_tfm); } -static void ablk_init_common(struct crypto_tfm *tfm, - struct cryptd_ablkcipher *cryptd_tfm) +static int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name) { struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm); + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); ctx->cryptd_tfm = cryptd_tfm; tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) + crypto_ablkcipher_reqsize(&cryptd_tfm->base); + + return 0; } static int ablk_ecb_init(struct crypto_tfm *tfm) { - struct cryptd_ablkcipher *cryptd_tfm; - - cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ecb-aes-aesni", 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ablk_init_common(tfm, cryptd_tfm); - return 0; + return ablk_init_common(tfm, "__driver-ecb-aes-aesni"); } static int ablk_cbc_init(struct crypto_tfm *tfm) { - struct cryptd_ablkcipher *cryptd_tfm; - - cryptd_tfm = cryptd_alloc_ablkcipher("__driver-cbc-aes-aesni", 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ablk_init_common(tfm, cryptd_tfm); - return 0; + return ablk_init_common(tfm, "__driver-cbc-aes-aesni"); } #ifdef CONFIG_X86_64 static int ablk_ctr_init(struct crypto_tfm *tfm) { - struct cryptd_ablkcipher *cryptd_tfm; - - cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-aes-aesni", 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - 
ablk_init_common(tfm, cryptd_tfm); - return 0; + return ablk_init_common(tfm, "__driver-ctr-aes-aesni"); } #ifdef HAS_CTR static int ablk_rfc3686_ctr_init(struct crypto_tfm *tfm) { - struct cryptd_ablkcipher *cryptd_tfm; - - cryptd_tfm = cryptd_alloc_ablkcipher( - "rfc3686(__driver-ctr-aes-aesni)", 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ablk_init_common(tfm, cryptd_tfm); - return 0; + return ablk_init_common(tfm, "rfc3686(__driver-ctr-aes-aesni)"); } #endif #endif @@ -504,42 +485,21 @@ static int ablk_rfc3686_ctr_init(struct crypto_tfm *tfm) #ifdef HAS_LRW static int ablk_lrw_init(struct crypto_tfm *tfm) { - struct cryptd_ablkcipher *cryptd_tfm; - - cryptd_tfm = cryptd_alloc_ablkcipher("fpu(lrw(__driver-aes-aesni))", - 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ablk_init_common(tfm, cryptd_tfm); - return 0; + return ablk_init_common(tfm, "fpu(lrw(__driver-aes-aesni))"); } #endif #ifdef HAS_PCBC static int ablk_pcbc_init(struct crypto_tfm *tfm) { - struct cryptd_ablkcipher *cryptd_tfm; - - cryptd_tfm = cryptd_alloc_ablkcipher("fpu(pcbc(__driver-aes-aesni))", - 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ablk_init_common(tfm, cryptd_tfm); - return 0; + return ablk_init_common(tfm, "fpu(pcbc(__driver-aes-aesni))"); } #endif #ifdef HAS_XTS static int ablk_xts_init(struct crypto_tfm *tfm) { - struct cryptd_ablkcipher *cryptd_tfm; - - cryptd_tfm = cryptd_alloc_ablkcipher("fpu(xts(__driver-aes-aesni))", - 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ablk_init_common(tfm, cryptd_tfm); - return 0; + return ablk_init_common(tfm, "fpu(xts(__driver-aes-aesni))"); } #endif -- cgit v1.2.3 From a7c1938e22c02b008655524c766d185ae99d9d53 Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Thu, 9 Feb 2012 09:10:30 -0800 Subject: userns: Convert stat to return values mapped from kuids and kgids - Store uids and gids with kuid_t and kgid_t in struct kstat - Convert uid and gids to userspace usable values with from_kuid and from_kgid Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- arch/arm/kernel/sys_oabi-compat.c | 4 ++-- arch/parisc/hpux/fs.c | 4 ++-- arch/s390/kernel/compat_linux.c | 4 ++-- arch/sparc/kernel/sys_sparc32.c | 4 ++-- arch/x86/ia32/sys_ia32.c | 4 ++-- fs/compat.c | 4 ++-- fs/stat.c | 12 ++++++------ include/linux/stat.h | 5 +++-- 8 files changed, 21 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c index af0aaebf4de6..3e94811690ce 100644 --- a/arch/arm/kernel/sys_oabi-compat.c +++ b/arch/arm/kernel/sys_oabi-compat.c @@ -124,8 +124,8 @@ static long cp_oldabi_stat64(struct kstat *stat, tmp.__st_ino = stat->ino; tmp.st_mode = stat->mode; tmp.st_nlink = stat->nlink; - tmp.st_uid = stat->uid; - tmp.st_gid = stat->gid; + tmp.st_uid = from_kuid_munged(current_user_ns(), stat->uid); + tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid); tmp.st_rdev = huge_encode_dev(stat->rdev); tmp.st_size = stat->size; tmp.st_blocks = stat->blocks; diff --git a/arch/parisc/hpux/fs.c b/arch/parisc/hpux/fs.c index 0dc8543acb4f..c71eb6c79897 100644 --- a/arch/parisc/hpux/fs.c +++ b/arch/parisc/hpux/fs.c @@ -159,8 +159,8 @@ static int cp_hpux_stat(struct kstat *stat, struct hpux_stat64 __user *statbuf) tmp.st_ino = stat->ino; tmp.st_mode = stat->mode; tmp.st_nlink = stat->nlink; - tmp.st_uid = stat->uid; - tmp.st_gid = stat->gid; + tmp.st_uid = from_kuid_munged(current_user_ns(), stat->uid); + tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid); tmp.st_rdev = new_encode_dev(stat->rdev); tmp.st_size = stat->size; tmp.st_atime = stat->atime.tv_sec; diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index 
f0273ed760ef..65426525d9f2 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -547,8 +547,8 @@ static int cp_stat64(struct stat64_emu31 __user *ubuf, struct kstat *stat) tmp.__st_ino = (u32)stat->ino; tmp.st_mode = stat->mode; tmp.st_nlink = (unsigned int)stat->nlink; - tmp.st_uid = stat->uid; - tmp.st_gid = stat->gid; + tmp.st_uid = from_kuid_munged(current_user_ns(), stat->uid); + tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid); tmp.st_rdev = huge_encode_dev(stat->rdev); tmp.st_size = stat->size; tmp.st_blksize = (u32)stat->blksize; diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c index 29c478ffed91..f7392336961f 100644 --- a/arch/sparc/kernel/sys_sparc32.c +++ b/arch/sparc/kernel/sys_sparc32.c @@ -139,8 +139,8 @@ static int cp_compat_stat64(struct kstat *stat, err |= put_user(stat->ino, &statbuf->st_ino); err |= put_user(stat->mode, &statbuf->st_mode); err |= put_user(stat->nlink, &statbuf->st_nlink); - err |= put_user(stat->uid, &statbuf->st_uid); - err |= put_user(stat->gid, &statbuf->st_gid); + err |= put_user(from_kuid_munged(current_user_ns(), stat->uid), &statbuf->st_uid); + err |= put_user(from_kgid_munged(current_user_ns(), stat->gid), &statbuf->st_gid); err |= put_user(huge_encode_dev(stat->rdev), &statbuf->st_rdev); err |= put_user(0, (unsigned long __user *) &statbuf->__pad3[0]); err |= put_user(stat->size, &statbuf->st_size); diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index aec2202a596c..d5c820a54590 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -71,8 +71,8 @@ static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) { typeof(ubuf->st_uid) uid = 0; typeof(ubuf->st_gid) gid = 0; - SET_UID(uid, stat->uid); - SET_GID(gid, stat->gid); + SET_UID(uid, from_kuid_munged(current_user_ns(), stat->uid)); + SET_GID(gid, from_kgid_munged(current_user_ns(), stat->gid)); if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) || 
__put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) || __put_user(stat->ino, &ubuf->__st_ino) || diff --git a/fs/compat.c b/fs/compat.c index f2944ace7a7b..0781e619a62a 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -144,8 +144,8 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) tmp.st_nlink = stat->nlink; if (tmp.st_nlink != stat->nlink) return -EOVERFLOW; - SET_UID(tmp.st_uid, stat->uid); - SET_GID(tmp.st_gid, stat->gid); + SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); + SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); tmp.st_rdev = old_encode_dev(stat->rdev); if ((u64) stat->size > MAX_NON_LFS) return -EOVERFLOW; diff --git a/fs/stat.c b/fs/stat.c index c733dc5753ae..31acca5f5a0c 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -137,8 +137,8 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta tmp.st_nlink = stat->nlink; if (tmp.st_nlink != stat->nlink) return -EOVERFLOW; - SET_UID(tmp.st_uid, stat->uid); - SET_GID(tmp.st_gid, stat->gid); + SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); + SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); tmp.st_rdev = old_encode_dev(stat->rdev); #if BITS_PER_LONG == 32 if (stat->size > MAX_NON_LFS) @@ -215,8 +215,8 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) tmp.st_nlink = stat->nlink; if (tmp.st_nlink != stat->nlink) return -EOVERFLOW; - SET_UID(tmp.st_uid, stat->uid); - SET_GID(tmp.st_gid, stat->gid); + SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); + SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); #if BITS_PER_LONG == 32 tmp.st_rdev = old_encode_dev(stat->rdev); #else @@ -350,8 +350,8 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf) #endif tmp.st_mode = stat->mode; tmp.st_nlink = stat->nlink; - tmp.st_uid = stat->uid; - tmp.st_gid = stat->gid; + tmp.st_uid = 
from_kuid_munged(current_user_ns(), stat->uid); + tmp.st_gid = from_kgid_munged(current_user_ns(), stat->gid); tmp.st_atime = stat->atime.tv_sec; tmp.st_atime_nsec = stat->atime.tv_nsec; tmp.st_mtime = stat->mtime.tv_sec; diff --git a/include/linux/stat.h b/include/linux/stat.h index 611c398dab72..46132409a3f7 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -58,14 +58,15 @@ #include #include +#include struct kstat { u64 ino; dev_t dev; umode_t mode; unsigned int nlink; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; dev_t rdev; loff_t size; struct timespec atime; -- cgit v1.2.3 From 5abe68e493e5aea1ccfc384092f8e98a542b336a Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Sun, 6 May 2012 11:55:08 -0600 Subject: x86: kernel/check.c simple_strtoul cleanup Change set_corruption_check() and set_corruption_check_period() in kernel/check.c to call kstrtoul() instead of calling obsoleted simple_strtoul(). Signed-off-by: Shuah Khan Link: http://lkml.kernel.org/r/1336326908.2897.12.camel@lorien2 Signed-off-by: H. Peter Anvin --- arch/x86/kernel/check.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 5da1269e8ddc..e2dbcb7dabdd 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -27,21 +27,29 @@ static int num_scan_areas; static __init int set_corruption_check(char *arg) { - char *end; + ssize_t ret; + unsigned long val; - memory_corruption_check = simple_strtol(arg, &end, 10); + ret = kstrtoul(arg, 10, &val); + if (ret) + return ret; - return (*end == 0) ? 0 : -EINVAL; + memory_corruption_check = val; + return 0; } early_param("memory_corruption_check", set_corruption_check); static __init int set_corruption_check_period(char *arg) { - char *end; + ssize_t ret; + unsigned long val; - corruption_check_period = simple_strtoul(arg, &end, 10); + ret = kstrtoul(arg, 10, &val); + if (ret) + return ret; - return (*end == 0) ? 
0 : -EINVAL; + corruption_check_period = val; + return 0; } early_param("memory_corruption_check_period", set_corruption_check_period); -- cgit v1.2.3 From 363f7ce3250aafdaab43011c7dc40158ea571e6b Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Sun, 6 May 2012 11:58:04 -0600 Subject: x86: kernel/dumpstack.c simple_strtoul cleanup Change kstack_setup() and code_bytes_setup() in kernel/dumpstack.c to call kstrtoul() instead of calling obsoleted simple_strtoul(). Signed-off-by: Shuah Khan Link: http://lkml.kernel.org/r/1336327084.2897.15.camel@lorien2 Signed-off-by: H. Peter Anvin --- arch/x86/kernel/dumpstack.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 1b81839b6c88..b154f6d99058 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -311,16 +311,33 @@ void die(const char *str, struct pt_regs *regs, long err) static int __init kstack_setup(char *s) { + ssize_t ret; + unsigned long val; + if (!s) return -EINVAL; - kstack_depth_to_print = simple_strtoul(s, NULL, 0); + + ret = kstrtoul(s, 0, &val); + if (ret) + return ret; + kstack_depth_to_print = val; return 0; } early_param("kstack", kstack_setup); static int __init code_bytes_setup(char *s) { - code_bytes = simple_strtoul(s, NULL, 0); + ssize_t ret; + unsigned long val; + + if (!s) + return -EINVAL; + + ret = kstrtoul(s, 0, &val); + if (ret) + return ret; + + code_bytes = val; if (code_bytes > 8192) code_bytes = 8192; -- cgit v1.2.3 From 867aae6ebe593db73fb8a676475ee20227292cfe Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 15 May 2012 17:01:09 -0600 Subject: x86/PCI: only check for spinlock being held in SMP kernels spin_is_locked() is always false on UP kernels: spin_lock_irqsave() does no locking, so we can't tell whether the lock is held or not. Therefore, this warning is only valid for SMP kernels. 
CC: Myron Stowe Signed-off-by: Bjorn Helgaas --- arch/x86/pci/i386.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 831971e731f7..dd8ca6f7223b 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -57,7 +57,7 @@ static struct pcibios_fwaddrmap *pcibios_fwaddrmap_lookup(struct pci_dev *dev) { struct pcibios_fwaddrmap *map; - WARN_ON(!spin_is_locked(&pcibios_fwaddrmap_lock)); + WARN_ON_SMP(!spin_is_locked(&pcibios_fwaddrmap_lock)); list_for_each_entry(map, &pcibios_fwaddrmappings, list) if (map->dev == dev) -- cgit v1.2.3 From 6cf20beec4b91c240cf759b4db72669e251f1fc4 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Mon, 14 May 2012 17:00:40 +0100 Subject: x86/vga: set the default device from the fixup. Since Matthew's efi/vga changes on non-EFI machines we were failing to tell the vgaarb/switcheroo what the default device was, this sets the default device in the quirk if none has been set before. This fixes the switcheroo on my T410s. Cc: Matthew Garrett Acked-by: H. 
Peter Anvin Signed-off-by: Dave Airlie --- arch/x86/pci/fixup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index d0e6e403b4f6..01635537d72e 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -7,6 +7,7 @@ #include #include #include +#include static void __devinit pci_fixup_i450nx(struct pci_dev *d) { @@ -348,6 +349,8 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev) if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) { pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW; dev_printk(KERN_DEBUG, &pdev->dev, "Boot video device\n"); + if (!vga_default_device()) + vga_set_default_device(pdev); } } DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID, -- cgit v1.2.3 From 512d5649e8dc3ed36f2ebf0818da64a4d4c2544a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 13 May 2012 19:53:23 +0300 Subject: KVM: VMX: Fix %ds/%es clobber The vmx exit code unconditionally restores %ds and %es to __USER_DS. This can override the user's values, since %ds and %es are not saved and restored in x86_64 syscalls. In practice, this isn't dangerous since nobody uses segment registers in long mode, least of all programs that use KVM. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3062ea95266e..f2ee016e1004 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6102,7 +6102,10 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + u16 _ds, _es; + savesegment(ds, _ds); + savesegment(es, _es); if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); if (vmcs12->idt_vectoring_info_field & @@ -6263,7 +6266,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) } } - asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); + loadsegment(ds, _ds); + loadsegment(es, _es); vmx->loaded_vmcs->launched = 1; vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); -- cgit v1.2.3 From b2da15ac26a0c00fc0d399a2bc5cf3c4e15f0b4f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 13 May 2012 19:53:24 +0300 Subject: KVM: VMX: Optimize %ds, %es reload On x86_64, we can defer %ds and %es reload to the heavyweight context switch, since nothing in the lightweight paths uses the host %ds or %es (they are ignored by the processor). Furthermore we can avoid the load if the segments are null, by letting the hardware load the null segments for us. This is the expected case. On i386, we could avoid the reload entirely, since the entry.S paths take care of reload, except for the SYSEXIT path which leaves %ds and %es set to __USER_DS. So we set them to the same values as well. Saves about 70 cycles out of 1600 (around 4%; noisy measurements). 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f2ee016e1004..32eb58866292 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -393,6 +393,9 @@ struct vcpu_vmx { struct { int loaded; u16 fs_sel, gs_sel, ldt_sel; +#ifdef CONFIG_X86_64 + u16 ds_sel, es_sel; +#endif int gs_ldt_reload_needed; int fs_reload_needed; } host_state; @@ -1417,6 +1420,11 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmx->host_state.gs_ldt_reload_needed = 1; } +#ifdef CONFIG_X86_64 + savesegment(ds, vmx->host_state.ds_sel); + savesegment(es, vmx->host_state.es_sel); +#endif + #ifdef CONFIG_X86_64 vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE)); vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE)); @@ -1457,6 +1465,19 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) } if (vmx->host_state.fs_reload_needed) loadsegment(fs, vmx->host_state.fs_sel); +#ifdef CONFIG_X86_64 + if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) { + loadsegment(ds, vmx->host_state.ds_sel); + loadsegment(es, vmx->host_state.es_sel); + } +#else + /* + * The sysexit path does not restore ds/es, so we must set them to + * a reasonable value ourselves. + */ + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); +#endif reload_tss(); #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); @@ -3640,8 +3661,18 @@ static void vmx_set_constant_host_state(void) vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ +#ifdef CONFIG_X86_64 + /* + * Load null selectors, so we can avoid reloading them in + * __vmx_load_host_state(), in case userspace uses the null selectors + * too (the expected case). 
+ */ + vmcs_write16(HOST_DS_SELECTOR, 0); + vmcs_write16(HOST_ES_SELECTOR, 0); +#else vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ +#endif vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ @@ -6102,10 +6133,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u16 _ds, _es; - savesegment(ds, _ds); - savesegment(es, _es); if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); if (vmcs12->idt_vectoring_info_field & @@ -6266,8 +6294,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) } } - loadsegment(ds, _ds); - loadsegment(es, _es); vmx->loaded_vmcs->launched = 1; vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); -- cgit v1.2.3 From c142786c6291189b5c85f53d91743e1eefbd8fe0 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 14 May 2012 15:44:06 +0300 Subject: KVM: MMU: Don't use RCU for lockless shadow walking Using RCU for lockless shadow walking can increase the amount of memory in use by the system, since RCU grace periods are unpredictable. We also have an unconditional write to a shared variable (reader_counter), which isn't good for scaling. Replace that with a scheme similar to x86's get_user_pages_fast(): disable interrupts during lockless shadow walk to force the freer (kvm_mmu_commit_zap_page()) to wait for the TLB flush IPI to find the processor with interrupts enabled. We also add a new vcpu->mode, READING_SHADOW_PAGE_TABLES, to prevent kvm_flush_remote_tlbs() from avoiding the IPI. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 4 --- arch/x86/kvm/mmu.c | 73 ++++++++++++++++------------------------- include/linux/kvm_host.h | 3 +- 3 files changed, 31 insertions(+), 49 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 69e39bc7e36f..64c8989263f6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -240,8 +240,6 @@ struct kvm_mmu_page { #endif int write_flooding_count; - - struct rcu_head rcu; }; struct kvm_pio_request { @@ -540,8 +538,6 @@ struct kvm_arch { u64 hv_guest_os_id; u64 hv_hypercall; - atomic_t reader_counter; - #ifdef CONFIG_KVM_MMU_AUDIT int audit_point; #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 07424cf60434..72102e0ab7cb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -551,19 +551,29 @@ static u64 mmu_spte_get_lockless(u64 *sptep) static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { - rcu_read_lock(); - atomic_inc(&vcpu->kvm->arch.reader_counter); - - /* Increase the counter before walking shadow page table */ - smp_mb__after_atomic_inc(); + /* + * Prevent page table teardown by making any free-er wait during + * kvm_flush_remote_tlbs() IPI to all active vcpus. + */ + local_irq_disable(); + vcpu->mode = READING_SHADOW_PAGE_TABLES; + /* + * Make sure a following spte read is not reordered ahead of the write + * to vcpu->mode. + */ + smp_mb(); } static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { - /* Decrease the counter after walking shadow page table finished */ - smp_mb__before_atomic_dec(); - atomic_dec(&vcpu->kvm->arch.reader_counter); - rcu_read_unlock(); + /* + * Make sure the write to vcpu->mode is not reordered in front of + * reads to sptes. If it does, kvm_commit_zap_page() can see us + * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. 
+ */ + smp_mb(); + vcpu->mode = OUTSIDE_GUEST_MODE; + local_irq_enable(); } static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, @@ -1989,30 +1999,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, return ret; } -static void kvm_mmu_isolate_pages(struct list_head *invalid_list) -{ - struct kvm_mmu_page *sp; - - list_for_each_entry(sp, invalid_list, link) - kvm_mmu_isolate_page(sp); -} - -static void free_pages_rcu(struct rcu_head *head) -{ - struct kvm_mmu_page *next, *sp; - - sp = container_of(head, struct kvm_mmu_page, rcu); - while (sp) { - if (!list_empty(&sp->link)) - next = list_first_entry(&sp->link, - struct kvm_mmu_page, link); - else - next = NULL; - kvm_mmu_free_page(sp); - sp = next; - } -} - static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list) { @@ -2021,17 +2007,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, if (list_empty(invalid_list)) return; - kvm_flush_remote_tlbs(kvm); - - if (atomic_read(&kvm->arch.reader_counter)) { - kvm_mmu_isolate_pages(invalid_list); - sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); - list_del_init(invalid_list); + /* + * wmb: make sure everyone sees our modifications to the page tables + * rmb: make sure we see changes to vcpu->mode + */ + smp_mb(); - trace_kvm_mmu_delay_free_pages(sp); - call_rcu(&sp->rcu, free_pages_rcu); - return; - } + /* + * Wait for all vcpus to exit guest mode and/or lockless shadow + * page table walks. 
+ */ + kvm_flush_remote_tlbs(kvm); do { sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); @@ -2039,7 +2025,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, kvm_mmu_isolate_page(sp); kvm_mmu_free_page(sp); } while (!list_empty(invalid_list)); - } /* diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index cae342d29d1b..c4464356b35b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -128,7 +128,8 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); enum { OUTSIDE_GUEST_MODE, IN_GUEST_MODE, - EXITING_GUEST_MODE + EXITING_GUEST_MODE, + READING_SHADOW_PAGE_TABLES, }; /* -- cgit v1.2.3 From 796038799a72adb279d785c9154df6eeb98b6e8d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 16 May 2012 13:22:41 -0700 Subject: x86, realmode: Mask out EFER.LMA when saving trampoline EFER Some AMD processors apparently #GP(0) if EFER.LMA is set in WRMSR, rather than ignoring it. Thus, we need to mask it out. Reported-by: Ingo Molnar Tested-by: Borislav Petkov Cc: Jarkko Sakkinen Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1336501366-28617-24-git-send-email-jarkko.sakkinen@intel.com --- arch/x86/kernel/realmode.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c index 66ac276cf361..099277984b80 100644 --- a/arch/x86/kernel/realmode.c +++ b/arch/x86/kernel/realmode.c @@ -22,6 +22,7 @@ void __init setup_real_mode(void) size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); #ifdef CONFIG_X86_64 u64 *trampoline_pgd; + u32 efer_low, efer_high; #endif /* Has to be in very low memory so we can execute real-mode AP code. 
*/ @@ -65,9 +66,13 @@ void __init setup_real_mode(void) trampoline_header->gdt_limit = __BOOT_DS + 7; trampoline_header->gdt_base = __pa(boot_gdt); #else - if (rdmsr_safe(MSR_EFER, &trampoline_header->efer_low, - &trampoline_header->efer_high)) - BUG(); + /* + * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR + * so we need to mask it out. + */ + rdmsr(MSR_EFER, efer_low, efer_high); + trampoline_header->efer_low = efer_low & ~EFER_LMA; + trampoline_header->efer_high = efer_high; trampoline_header->start = (u64) secondary_startup_64; trampoline_cr4_features = &trampoline_header->cr4; -- cgit v1.2.3 From 51edbe6a2f47c78c6c6e529999ee0a044fe59a89 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 16 May 2012 13:44:10 -0700 Subject: x86, realmode: Move not-common bits out of trampoline_common.S Move the bits that aren't actually common out of trampoline_common.S and into the arch-specific files. Furthermore, make sure the page directory is first in the .bss section for trampoline_64.S in order to not waste an entire page of memory. Signed-off-by: H. 
Peter Anvin Cc: Jarkko Sakkinen --- arch/x86/realmode/rm/trampoline_32.S | 8 ++++++++ arch/x86/realmode/rm/trampoline_64.S | 25 +++++++++++++++++++++++ arch/x86/realmode/rm/trampoline_common.S | 35 -------------------------------- 3 files changed, 33 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/trampoline_32.S b/arch/x86/realmode/rm/trampoline_32.S index 6fc064b4d2b9..c1b2791183e7 100644 --- a/arch/x86/realmode/rm/trampoline_32.S +++ b/arch/x86/realmode/rm/trampoline_32.S @@ -63,4 +63,12 @@ ENTRY(trampoline_start) ENTRY(startup_32) # note: also used from wakeup_asm.S jmp *%eax + .bss + .balign 8 +GLOBAL(trampoline_header) + tr_start: .space 4 + tr_gdt_pad: .space 2 + tr_gdt: .space 6 +END(trampoline_header) + #include "trampoline_common.S" diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 66e26f088288..1b9e1bc1ac5e 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -125,4 +125,29 @@ ENTRY(startup_64) # Now jump into the kernel using virtual addresses jmpq *tr_start(%rip) + .section ".rodata","a" + # Duplicate the global descriptor table + # so the kernel can live anywhere + .balign 16 + .globl tr_gdt +tr_gdt: + .short tr_gdt_end - tr_gdt - 1 # gdt limit + .long pa_tr_gdt + .short 0 + .quad 0x00cf9b000000ffff # __KERNEL32_CS + .quad 0x00af9b000000ffff # __KERNEL_CS + .quad 0x00cf93000000ffff # __KERNEL_DS +tr_gdt_end: + + .bss + .balign PAGE_SIZE +GLOBAL(trampoline_pgd) .space PAGE_SIZE + + .balign 8 +GLOBAL(trampoline_header) + tr_start: .space 8 + GLOBAL(tr_cr4) .space 4 + GLOBAL(tr_efer) .space 8 +END(trampoline_header) + #include "trampoline_common.S" diff --git a/arch/x86/realmode/rm/trampoline_common.S b/arch/x86/realmode/rm/trampoline_common.S index cac444b942f8..b1ecdb9692ad 100644 --- a/arch/x86/realmode/rm/trampoline_common.S +++ b/arch/x86/realmode/rm/trampoline_common.S @@ -1,42 +1,7 @@ .section ".rodata","a" - -#ifdef 
CONFIG_X86_64 - # Duplicate the global descriptor table - # so the kernel can live anywhere .balign 16 - .globl tr_gdt -tr_gdt: - .short tr_gdt_end - tr_gdt - 1 # gdt limit - .long pa_tr_gdt - .short 0 - .quad 0x00cf9b000000ffff # __KERNEL32_CS - .quad 0x00af9b000000ffff # __KERNEL_CS - .quad 0x00cf93000000ffff # __KERNEL_DS -tr_gdt_end: -#endif - - .balign 4 tr_idt: .fill 1, 6, 0 .bss - .balign 4 GLOBAL(trampoline_status) .space 4 - - .balign 8 -GLOBAL(trampoline_header) -#ifdef CONFIG_X86_32 - tr_start: .space 4 - tr_gdt_pad: .space 2 - tr_gdt: .space 6 -#else - tr_start: .space 8 - GLOBAL(tr_cr4) .space 4 - GLOBAL(tr_efer) .space 8 -#endif -END(trampoline_header) - -#ifdef CONFIG_X86_64 - .balign PAGE_SIZE -GLOBAL(trampoline_pgd) .space PAGE_SIZE -#endif -- cgit v1.2.3 From 137127018812ec7fcccb9843156cfc0b5cfa31d5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 16 May 2012 13:49:10 -0700 Subject: x86, realmode: Move kernel/realmode.c to realmode/init.c Keep all the realmode code together, including initialization (only the rm/ subdirectory is actually built as real-mode code, anyway.) Signed-off-by: H. 
Peter Anvin Cc: Jarkko Sakkinen --- arch/x86/kernel/Makefile | 1 - arch/x86/kernel/realmode.c | 116 --------------------------------------------- arch/x86/realmode/Makefile | 1 + arch/x86/realmode/init.c | 116 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 117 insertions(+), 117 deletions(-) delete mode 100644 arch/x86/kernel/realmode.c create mode 100644 arch/x86/realmode/init.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 4a20f4441ffe..08484332f329 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -35,7 +35,6 @@ obj-y += tsc.o io_delay.o rtc.o obj-y += pci-iommu_table.o obj-y += resource.o -obj-y += realmode.o obj-y += process.o obj-y += i387.o xsave.o obj-y += ptrace.o diff --git a/arch/x86/kernel/realmode.c b/arch/x86/kernel/realmode.c deleted file mode 100644 index 099277984b80..000000000000 --- a/arch/x86/kernel/realmode.c +++ /dev/null @@ -1,116 +0,0 @@ -#include -#include - -#include -#include -#include - -struct real_mode_header *real_mode_header; -u32 *trampoline_cr4_features; - -void __init setup_real_mode(void) -{ - phys_addr_t mem; - u16 real_mode_seg; - u32 *rel; - u32 count; - u32 *ptr; - u16 *seg; - int i; - unsigned char *base; - struct trampoline_header *trampoline_header; - size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); -#ifdef CONFIG_X86_64 - u64 *trampoline_pgd; - u32 efer_low, efer_high; -#endif - - /* Has to be in very low memory so we can execute real-mode AP code. */ - mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); - if (!mem) - panic("Cannot allocate trampoline\n"); - - base = __va(mem); - memblock_reserve(mem, size); - real_mode_header = (struct real_mode_header *) base; - printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", - base, (unsigned long long)mem, size); - - memcpy(base, real_mode_blob, size); - - real_mode_seg = __pa(base) >> 4; - rel = (u32 *) real_mode_relocs; - - /* 16-bit segment relocations. 
*/ - count = rel[0]; - rel = &rel[1]; - for (i = 0; i < count; i++) { - seg = (u16 *) (base + rel[i]); - *seg = real_mode_seg; - } - - /* 32-bit linear relocations. */ - count = rel[i]; - rel = &rel[i + 1]; - for (i = 0; i < count; i++) { - ptr = (u32 *) (base + rel[i]); - *ptr += __pa(base); - } - - /* Must be perfomed *after* relocation. */ - trampoline_header = (struct trampoline_header *) - __va(real_mode_header->trampoline_header); - -#ifdef CONFIG_X86_32 - trampoline_header->start = __pa(startup_32_smp); - trampoline_header->gdt_limit = __BOOT_DS + 7; - trampoline_header->gdt_base = __pa(boot_gdt); -#else - /* - * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR - * so we need to mask it out. - */ - rdmsr(MSR_EFER, efer_low, efer_high); - trampoline_header->efer_low = efer_low & ~EFER_LMA; - trampoline_header->efer_high = efer_high; - - trampoline_header->start = (u64) secondary_startup_64; - trampoline_cr4_features = &trampoline_header->cr4; - *trampoline_cr4_features = read_cr4(); - - trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); - trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE; - trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE; -#endif -} - -/* - * set_real_mode_permissions() gets called very early, to guarantee the - * availability of low memory. This is before the proper kernel page - * tables are set up, so we cannot set page permissions in that - * function. Thus, we use an arch_initcall instead. 
- */ -static int __init set_real_mode_permissions(void) -{ - unsigned char *base = (unsigned char *) real_mode_header; - size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); - - size_t ro_size = - PAGE_ALIGN(real_mode_header->ro_end) - - __pa(base); - - size_t text_size = - PAGE_ALIGN(real_mode_header->ro_end) - - real_mode_header->text_start; - - unsigned long text_start = - (unsigned long) __va(real_mode_header->text_start); - - set_memory_nx((unsigned long) base, size >> PAGE_SHIFT); - set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT); - set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT); - - return 0; -} - -arch_initcall(set_real_mode_permissions); diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile index a05b3aca64ad..94f7fbe97b08 100644 --- a/arch/x86/realmode/Makefile +++ b/arch/x86/realmode/Makefile @@ -9,6 +9,7 @@ subdir- := rm +obj-y += init.o obj-y += rmpiggy.o $(obj)/rmpiggy.o: $(obj)/rm/realmode.bin diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c new file mode 100644 index 000000000000..099277984b80 --- /dev/null +++ b/arch/x86/realmode/init.c @@ -0,0 +1,116 @@ +#include +#include + +#include +#include +#include + +struct real_mode_header *real_mode_header; +u32 *trampoline_cr4_features; + +void __init setup_real_mode(void) +{ + phys_addr_t mem; + u16 real_mode_seg; + u32 *rel; + u32 count; + u32 *ptr; + u16 *seg; + int i; + unsigned char *base; + struct trampoline_header *trampoline_header; + size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); +#ifdef CONFIG_X86_64 + u64 *trampoline_pgd; + u32 efer_low, efer_high; +#endif + + /* Has to be in very low memory so we can execute real-mode AP code. 
*/ + mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); + if (!mem) + panic("Cannot allocate trampoline\n"); + + base = __va(mem); + memblock_reserve(mem, size); + real_mode_header = (struct real_mode_header *) base; + printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", + base, (unsigned long long)mem, size); + + memcpy(base, real_mode_blob, size); + + real_mode_seg = __pa(base) >> 4; + rel = (u32 *) real_mode_relocs; + + /* 16-bit segment relocations. */ + count = rel[0]; + rel = &rel[1]; + for (i = 0; i < count; i++) { + seg = (u16 *) (base + rel[i]); + *seg = real_mode_seg; + } + + /* 32-bit linear relocations. */ + count = rel[i]; + rel = &rel[i + 1]; + for (i = 0; i < count; i++) { + ptr = (u32 *) (base + rel[i]); + *ptr += __pa(base); + } + + /* Must be perfomed *after* relocation. */ + trampoline_header = (struct trampoline_header *) + __va(real_mode_header->trampoline_header); + +#ifdef CONFIG_X86_32 + trampoline_header->start = __pa(startup_32_smp); + trampoline_header->gdt_limit = __BOOT_DS + 7; + trampoline_header->gdt_base = __pa(boot_gdt); +#else + /* + * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR + * so we need to mask it out. + */ + rdmsr(MSR_EFER, efer_low, efer_high); + trampoline_header->efer_low = efer_low & ~EFER_LMA; + trampoline_header->efer_high = efer_high; + + trampoline_header->start = (u64) secondary_startup_64; + trampoline_cr4_features = &trampoline_header->cr4; + *trampoline_cr4_features = read_cr4(); + + trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); + trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE; + trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE; +#endif +} + +/* + * set_real_mode_permissions() gets called very early, to guarantee the + * availability of low memory. This is before the proper kernel page + * tables are set up, so we cannot set page permissions in that + * function. Thus, we use an arch_initcall instead. 
+ */ +static int __init set_real_mode_permissions(void) +{ + unsigned char *base = (unsigned char *) real_mode_header; + size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); + + size_t ro_size = + PAGE_ALIGN(real_mode_header->ro_end) - + __pa(base); + + size_t text_size = + PAGE_ALIGN(real_mode_header->ro_end) - + real_mode_header->text_start; + + unsigned long text_start = + (unsigned long) __va(real_mode_header->text_start); + + set_memory_nx((unsigned long) base, size >> PAGE_SHIFT); + set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT); + set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT); + + return 0; +} + +arch_initcall(set_real_mode_permissions); -- cgit v1.2.3 From ab7b64e9ee1e930ffe9d7f5b5eebe618a3b3a03b Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Wed, 16 May 2012 13:43:26 -0400 Subject: x86: Don't continue booting if we can't load the specified initrd If we've determined we can't do what the user asked, trying to do something else isn't going to make the user's life better. Without this the screen scrolls a bit and then you get a panic anyway, and it's nice not to have so much scroll after the real problem in bug reports. Link: http://lkml.kernel.org/r/1337190206-12121-1-git-send-email-pjones@redhat.com Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/setup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1a2901562059..37ef1169ffde 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -393,10 +393,9 @@ static void __init reserve_initrd(void) initrd_start = 0; if (ramdisk_size >= (end_of_lowmem>>1)) { - memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); - printk(KERN_ERR "initrd too large to handle, " - "disabling initrd\n"); - return; + panic("initrd too large to handle, " + "disabling initrd (%lld needed, %lld available)\n", + ramdisk_size, end_of_lowmem>>1); } printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image, -- cgit v1.2.3 From 638d957b51c88852de72f15f7cd588d125e97dab Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 16 May 2012 14:02:05 -0700 Subject: x86, realmode: Change EFER to a single u64 field Change EFER to be a single u64 field instead of two u32 fields; change the order to maintain alignment. Note that on x86-64 cr4 is really also a 64-bit quantity, although we can only set the low 32 bits from the trampoline code since it is still executing in 32-bit mode at that point. Signed-off-by: H. 
Peter Anvin Cc: Jarkko Sakkinen --- arch/x86/include/asm/realmode.h | 3 +-- arch/x86/realmode/init.c | 7 +++---- arch/x86/realmode/rm/trampoline_64.S | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 937dc6071d76..fce3f4ae5bd6 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -35,9 +35,8 @@ struct trampoline_header { u32 gdt_base; #else u64 start; + u64 efer; u32 cr4; - u32 efer_low; - u32 efer_high; #endif }; diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 099277984b80..cbca565af5bd 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -22,7 +22,7 @@ void __init setup_real_mode(void) size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); #ifdef CONFIG_X86_64 u64 *trampoline_pgd; - u32 efer_low, efer_high; + u64 efer; #endif /* Has to be in very low memory so we can execute real-mode AP code. */ @@ -70,9 +70,8 @@ void __init setup_real_mode(void) * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR * so we need to mask it out. 
*/ - rdmsr(MSR_EFER, efer_low, efer_high); - trampoline_header->efer_low = efer_low & ~EFER_LMA; - trampoline_header->efer_high = efer_high; + rdmsrl(MSR_EFER, efer); + trampoline_header->efer = efer & ~EFER_LMA; trampoline_header->start = (u64) secondary_startup_64; trampoline_cr4_features = &trampoline_header->cr4; diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index 1b9e1bc1ac5e..bb360dc39d21 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -146,8 +146,8 @@ GLOBAL(trampoline_pgd) .space PAGE_SIZE .balign 8 GLOBAL(trampoline_header) tr_start: .space 8 - GLOBAL(tr_cr4) .space 4 GLOBAL(tr_efer) .space 8 + GLOBAL(tr_cr4) .space 4 END(trampoline_header) #include "trampoline_common.S" -- cgit v1.2.3 From d8368af8b46b904def42a0f341d2f4f29001fa77 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 14 May 2012 18:07:56 +0300 Subject: KVM: Fix mmu_reload() clash with nested vmx event injection Currently the inject_pending_event() call during guest entry happens after kvm_mmu_reload(). This is for historical reasons - we used to inject_pending_event() in atomic context, while kvm_mmu_reload() needs task context. A problem is that nested vmx can cause the mmu context to be reset, if event injection is intercepted and causes a #VMEXIT instead (the #VMEXIT resets CR0/CR3/CR4). If this happens, we end up with invalid root_hpa, and since kvm_mmu_reload() has already run, no one will fix it and we end up entering the guest this way. Fix by reordering event injection to be before kvm_mmu_reload(). Use ->cancel_injection() to undo if kvm_mmu_reload() fails. 
https://bugzilla.kernel.org/show_bug.cgi?id=42980 Reported-by: Luke-Jr Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4de705cdcafd..b78f89d34242 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5279,10 +5279,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_deliver_pmi(vcpu); } - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - goto out; - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { inject_pending_event(vcpu); @@ -5298,6 +5294,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } } + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) { + kvm_x86_ops->cancel_injection(vcpu); + goto out; + } + preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); -- cgit v1.2.3 From 55ccf3fe3f9a3441731aa79cf42a628fc4ecace9 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 16 May 2012 15:03:51 -0700 Subject: fork: move the real prepare_to_copy() users to arch_dup_task_struct() Historical prepare_to_copy() is mostly a no-op, duplicated for majority of the architectures and the rest following the x86 model of flushing the extended register state like fpu there. Remove it and use the arch_dup_task_struct() instead. Suggested-by: Oleg Nesterov Suggested-by: Linus Torvalds Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1336692811-30576-1-git-send-email-suresh.b.siddha@intel.com Acked-by: Benjamin Herrenschmidt Cc: David Howells Cc: Koichi Yasutake Cc: Paul Mackerras Cc: Paul Mundt Cc: Chris Zankel Cc: Richard Henderson Cc: Russell King Cc: Haavard Skinnemoen Cc: Mike Frysinger Cc: Mark Salter Cc: Aurelien Jacquiot Cc: Mikael Starvik Cc: Yoshinori Sato Cc: Richard Kuo Cc: Tony Luck Cc: Michal Simek Cc: Ralf Baechle Cc: Jonas Bonn Cc: James E.J. Bottomley Cc: Helge Deller Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Chen Liqin Cc: Lennox Wu Cc: David S. 
Miller Cc: Chris Metcalf Cc: Jeff Dike Cc: Richard Weinberger Cc: Guan Xuetao Signed-off-by: H. Peter Anvin --- arch/alpha/include/asm/processor.h | 3 --- arch/arm/include/asm/processor.h | 3 --- arch/avr32/include/asm/processor.h | 3 --- arch/blackfin/include/asm/processor.h | 2 -- arch/c6x/include/asm/processor.h | 3 --- arch/cris/include/asm/processor.h | 4 ---- arch/frv/include/asm/processor.h | 2 -- arch/frv/kernel/process.c | 11 ----------- arch/h8300/include/asm/processor.h | 2 -- arch/hexagon/include/asm/processor.h | 7 ------- arch/ia64/include/asm/processor.h | 3 --- arch/m32r/include/asm/processor.h | 2 -- arch/m68k/include/asm/processor.h | 3 --- arch/microblaze/include/asm/processor.h | 1 - arch/mips/include/asm/processor.h | 3 --- arch/mn10300/include/asm/processor.h | 3 --- arch/mn10300/kernel/process.c | 10 ++++++---- arch/openrisc/include/asm/processor.h | 4 ---- arch/parisc/include/asm/processor.h | 3 --- arch/powerpc/include/asm/processor.h | 3 --- arch/powerpc/kernel/process.c | 19 +++++++++++-------- arch/s390/include/asm/processor.h | 3 --- arch/score/include/asm/processor.h | 1 - arch/sh/include/asm/processor_32.h | 3 --- arch/sh/include/asm/processor_64.h | 1 - arch/sh/kernel/process.c | 7 +++++++ arch/sh/kernel/process_32.c | 9 --------- arch/sparc/include/asm/processor_32.h | 3 --- arch/sparc/include/asm/processor_64.h | 3 --- arch/tile/include/asm/processor.h | 3 --- arch/um/include/asm/processor-generic.h | 5 ----- arch/unicore32/include/asm/processor.h | 3 --- arch/x86/include/asm/processor.h | 3 --- arch/x86/kernel/process.c | 6 ++++++ arch/x86/kernel/process_32.c | 9 --------- arch/x86/kernel/process_64.c | 9 --------- arch/xtensa/include/asm/processor.h | 3 --- arch/xtensa/kernel/process.c | 9 ++++++--- kernel/fork.c | 2 -- 39 files changed, 36 insertions(+), 140 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/processor.h b/arch/alpha/include/asm/processor.h index 94afe5859301..e37b887b3d9f 100644 --- 
a/arch/alpha/include/asm/processor.h +++ b/arch/alpha/include/asm/processor.h @@ -49,9 +49,6 @@ extern void start_thread(struct pt_regs *, unsigned long, unsigned long); /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); -/* Prepare to copy thread state - unlazy all lazy status */ -#define prepare_to_copy(tsk) do { } while (0) - /* Create a kernel thread without removing it from tasklists. */ extern long kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h index 5ac8d3d3e025..6354224f980d 100644 --- a/arch/arm/include/asm/processor.h +++ b/arch/arm/include/asm/processor.h @@ -77,9 +77,6 @@ struct task_struct; /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); -/* Prepare to copy thread state - unlazy all lazy status */ -#define prepare_to_copy(tsk) do { } while (0) - unsigned long get_wchan(struct task_struct *p); #if __LINUX_ARM_ARCH__ == 6 || defined(CONFIG_ARM_ERRATA_754327) diff --git a/arch/avr32/include/asm/processor.h b/arch/avr32/include/asm/processor.h index 108502bc6770..87d8baccc60e 100644 --- a/arch/avr32/include/asm/processor.h +++ b/arch/avr32/include/asm/processor.h @@ -145,9 +145,6 @@ extern void release_thread(struct task_struct *); /* Create a kernel thread without removing it from tasklists */ extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); -/* Prepare to copy thread state - unlazy all lazy status */ -#define prepare_to_copy(tsk) do { } while(0) - /* Return saved PC of a blocked thread */ #define thread_saved_pc(tsk) ((tsk)->thread.cpu_context.pc) diff --git a/arch/blackfin/include/asm/processor.h b/arch/blackfin/include/asm/processor.h index 8af7772e84cc..4ef7cfe43ceb 100644 --- a/arch/blackfin/include/asm/processor.h +++ b/arch/blackfin/include/asm/processor.h @@ -75,8 +75,6 @@ static inline void release_thread(struct task_struct *dead_task) 
{ } -#define prepare_to_copy(tsk) do { } while (0) - extern int kernel_thread(int (*fn) (void *), void *arg, unsigned long flags); /* diff --git a/arch/c6x/include/asm/processor.h b/arch/c6x/include/asm/processor.h index 3ff7fab956ba..c50af7ef1c96 100644 --- a/arch/c6x/include/asm/processor.h +++ b/arch/c6x/include/asm/processor.h @@ -92,9 +92,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* Prepare to copy thread state - unlazy all lazy status */ -#define prepare_to_copy(tsk) do { } while (0) - extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); #define copy_segments(tsk, mm) do { } while (0) diff --git a/arch/cris/include/asm/processor.h b/arch/cris/include/asm/processor.h index 4210d72a6667..37f522feabc1 100644 --- a/arch/cris/include/asm/processor.h +++ b/arch/cris/include/asm/processor.h @@ -50,10 +50,6 @@ struct task_struct; #define task_pt_regs(task) user_regs(task_thread_info(task)) #define current_regs() task_pt_regs(current) -static inline void prepare_to_copy(struct task_struct *tsk) -{ -} - extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); unsigned long get_wchan(struct task_struct *p); diff --git a/arch/frv/include/asm/processor.h b/arch/frv/include/asm/processor.h index 81c2e271d620..9c768170b304 100644 --- a/arch/frv/include/asm/processor.h +++ b/arch/frv/include/asm/processor.h @@ -103,8 +103,6 @@ do { \ __frame->sp = (_usp); \ } while(0) -extern void prepare_to_copy(struct task_struct *tsk); - /* Free all resources held by a thread. 
*/ static inline void release_thread(struct task_struct *dead_task) { diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c index d4de48bd5efe..9f3dfadee09e 100644 --- a/arch/frv/kernel/process.c +++ b/arch/frv/kernel/process.c @@ -180,17 +180,6 @@ asmlinkage int sys_clone(unsigned long clone_flags, unsigned long newsp, return do_fork(clone_flags, newsp, __frame, 0, parent_tidptr, child_tidptr); } /* end sys_clone() */ -/*****************************************************************************/ -/* - * This gets called before we allocate a new thread and copy - * the current task into it. - */ -void prepare_to_copy(struct task_struct *tsk) -{ - //unlazy_fpu(tsk); -} /* end prepare_to_copy() */ - -/*****************************************************************************/ /* * set up the kernel stack and exception frames for a new process */ diff --git a/arch/h8300/include/asm/processor.h b/arch/h8300/include/asm/processor.h index 61fabf1788c6..4c9f6f87b617 100644 --- a/arch/h8300/include/asm/processor.h +++ b/arch/h8300/include/asm/processor.h @@ -109,8 +109,6 @@ static inline void release_thread(struct task_struct *dead_task) extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); -#define prepare_to_copy(tsk) do { } while (0) - /* * Free current thread data structures etc.. */ diff --git a/arch/hexagon/include/asm/processor.h b/arch/hexagon/include/asm/processor.h index 20c5ddabbd8b..e8ea459002a4 100644 --- a/arch/hexagon/include/asm/processor.h +++ b/arch/hexagon/include/asm/processor.h @@ -58,13 +58,6 @@ struct thread_struct { #define cpu_relax() __vmyield() -/* - * "Unlazying all lazy status" occurs here. - */ -static inline void prepare_to_copy(struct task_struct *tsk) -{ -} - /* * Decides where the kernel will search for a free chunk of vm space during * mmaps. 
diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index 483f6c6a4238..efcca1b601c5 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -343,9 +343,6 @@ struct task_struct; */ #define release_thread(dead_task) -/* Prepare to copy thread state - unlazy all lazy status */ -#define prepare_to_copy(tsk) do { } while (0) - /* * This is the mechanism for creating a new kernel thread. * diff --git a/arch/m32r/include/asm/processor.h b/arch/m32r/include/asm/processor.h index e1f46d757460..da17253b5735 100644 --- a/arch/m32r/include/asm/processor.h +++ b/arch/m32r/include/asm/processor.h @@ -118,8 +118,6 @@ struct mm_struct; /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); -#define prepare_to_copy(tsk) do { } while (0) - /* * create a kernel thread without removing it from tasklists */ diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h index 46460fa15d5c..f17c42aff7ff 100644 --- a/arch/m68k/include/asm/processor.h +++ b/arch/m68k/include/asm/processor.h @@ -153,9 +153,6 @@ static inline void release_thread(struct task_struct *dead_task) { } -/* Prepare to copy thread state - unlazy all lazy status */ -#define prepare_to_copy(tsk) do { } while (0) - extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); /* diff --git a/arch/microblaze/include/asm/processor.h b/arch/microblaze/include/asm/processor.h index bffb54527299..af2bb9652392 100644 --- a/arch/microblaze/include/asm/processor.h +++ b/arch/microblaze/include/asm/processor.h @@ -23,7 +23,6 @@ extern const struct seq_operations cpuinfo_op; # define cpu_relax() barrier() # define cpu_sleep() do {} while (0) -# define prepare_to_copy(tsk) do {} while (0) #define task_pt_regs(tsk) \ (((struct pt_regs *)(THREAD_SIZE + task_stack_page(tsk))) - 1) diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h index 20e9dcf42b27..5e33fabe354d 
100644 --- a/arch/mips/include/asm/processor.h +++ b/arch/mips/include/asm/processor.h @@ -310,9 +310,6 @@ struct task_struct; /* Free all resources held by a thread. */ #define release_thread(thread) do { } while(0) -/* Prepare to copy thread state - unlazy all lazy status */ -#define prepare_to_copy(tsk) do { } while (0) - extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); extern unsigned long thread_saved_pc(struct task_struct *tsk); diff --git a/arch/mn10300/include/asm/processor.h b/arch/mn10300/include/asm/processor.h index f7b3c9ab2cb5..247928c9f549 100644 --- a/arch/mn10300/include/asm/processor.h +++ b/arch/mn10300/include/asm/processor.h @@ -139,9 +139,6 @@ static inline void start_thread(struct pt_regs *regs, /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); -/* Prepare to copy thread state - unlazy all lazy status */ -extern void prepare_to_copy(struct task_struct *tsk); - /* * create a kernel thread without removing it from tasklists */ diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index 14707f25153b..7dab0cd36466 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -208,12 +208,14 @@ void copy_segments(struct task_struct *p, struct mm_struct *new_mm) } /* - * this gets called before we allocate a new thread and copy the current task - * into it so that we can store lazy state into memory + * this gets called so that we can store lazy state into memory and copy the + * current task into the new thread. 
*/ -void prepare_to_copy(struct task_struct *tsk) +int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - unlazy_fpu(tsk); + unlazy_fpu(src); + *dst = *src; + return 0; } /* diff --git a/arch/openrisc/include/asm/processor.h b/arch/openrisc/include/asm/processor.h index f7516fa78b58..30462f1fe959 100644 --- a/arch/openrisc/include/asm/processor.h +++ b/arch/openrisc/include/asm/processor.h @@ -72,10 +72,6 @@ struct thread_struct { #define task_pt_regs(task) user_regs(task_thread_info(task)) #define current_regs() user_regs(current_thread_info()) -extern inline void prepare_to_copy(struct task_struct *tsk) -{ -} - #define INIT_SP (sizeof(init_stack) + (unsigned long) &init_stack) #define INIT_THREAD { } diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index acdf4cad6125..0e8b7b8ce8a2 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -328,9 +328,6 @@ struct mm_struct; extern void release_thread(struct task_struct *); extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); -/* Prepare to copy thread state - unlazy all lazy status */ -#define prepare_to_copy(tsk) do { } while (0) - extern void map_hpux_gateway_page(struct task_struct *tsk, struct mm_struct *mm); extern unsigned long get_wchan(struct task_struct *p); diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 8e2d0371fe1e..854f899d0c34 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -74,9 +74,6 @@ struct task_struct; void start_thread(struct pt_regs *regs, unsigned long fdptr, unsigned long sp); void release_thread(struct task_struct *); -/* Prepare to copy thread state - unlazy all lazy status */ -extern void prepare_to_copy(struct task_struct *tsk); - /* Create a new kernel thread. 
*/ extern long kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 4937c9690090..bc129f24e11f 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -711,18 +711,21 @@ release_thread(struct task_struct *t) } /* - * This gets called before we allocate a new thread and copy - * the current task into it. + * this gets called so that we can store coprocessor state into memory and + * copy the current task into the new thread. */ -void prepare_to_copy(struct task_struct *tsk) +int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - flush_fp_to_thread(current); - flush_altivec_to_thread(current); - flush_vsx_to_thread(current); - flush_spe_to_thread(current); + flush_fp_to_thread(src); + flush_altivec_to_thread(src); + flush_vsx_to_thread(src); + flush_spe_to_thread(src); #ifdef CONFIG_HAVE_HW_BREAKPOINT - flush_ptrace_hw_breakpoint(tsk); + flush_ptrace_hw_breakpoint(src); #endif /* CONFIG_HAVE_HW_BREAKPOINT */ + + *dst = *src; + return 0; } /* diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index d499b30ea487..6cbf31311673 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -141,9 +141,6 @@ struct seq_file; extern void release_thread(struct task_struct *); extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); -/* Prepare to copy thread state - unlazy all lazy status */ -#define prepare_to_copy(tsk) do { } while (0) - /* * Return saved PC of a blocked thread. 
*/ diff --git a/arch/score/include/asm/processor.h b/arch/score/include/asm/processor.h index 7e22f216d771..ab3aceb54209 100644 --- a/arch/score/include/asm/processor.h +++ b/arch/score/include/asm/processor.h @@ -26,7 +26,6 @@ extern unsigned long get_wchan(struct task_struct *p); #define cpu_relax() barrier() #define release_thread(thread) do {} while (0) -#define prepare_to_copy(tsk) do {} while (0) /* * User space process size: 2GB. This is hardcoded into a few places, diff --git a/arch/sh/include/asm/processor_32.h b/arch/sh/include/asm/processor_32.h index 900f8d72ffe2..b6311fd2d066 100644 --- a/arch/sh/include/asm/processor_32.h +++ b/arch/sh/include/asm/processor_32.h @@ -126,9 +126,6 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_pc, unsigned lo /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); -/* Prepare to copy thread state - unlazy all lazy status */ -void prepare_to_copy(struct task_struct *tsk); - /* * create a kernel thread without removing it from tasklists */ diff --git a/arch/sh/include/asm/processor_64.h b/arch/sh/include/asm/processor_64.h index e25c4c7d6b63..fe99afecfbc5 100644 --- a/arch/sh/include/asm/processor_64.h +++ b/arch/sh/include/asm/processor_64.h @@ -172,7 +172,6 @@ extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); #define copy_segments(p, mm) do { } while (0) #define release_segments(mm) do { } while (0) #define forget_segments() do { } while (0) -#define prepare_to_copy(tsk) do { } while (0) /* * FPU lazy state save handling. */ diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c index 325f98b1736d..2bde59eae10d 100644 --- a/arch/sh/kernel/process.c +++ b/arch/sh/kernel/process.c @@ -6,8 +6,15 @@ struct kmem_cache *task_xstate_cachep = NULL; unsigned int xstate_size; +/* + * this gets called so that we can store lazy state into memory and copy the + * current task into the new thread. 
+ */ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { +#ifdef CONFIG_SUPERH32 + unlazy_fpu(src, task_pt_regs(src)); +#endif *dst = *src; if (src->thread.xstate) { diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index 94273aaf78c1..dee596113cc1 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -155,15 +155,6 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) } EXPORT_SYMBOL(dump_fpu); -/* - * This gets called before we allocate a new thread and copy - * the current task into it. - */ -void prepare_to_copy(struct task_struct *tsk) -{ - unlazy_fpu(tsk, task_pt_regs(tsk)); -} - asmlinkage void ret_from_fork(void); int copy_thread(unsigned long clone_flags, unsigned long usp, diff --git a/arch/sparc/include/asm/processor_32.h b/arch/sparc/include/asm/processor_32.h index 09521c6a5edb..c9c760fdc713 100644 --- a/arch/sparc/include/asm/processor_32.h +++ b/arch/sparc/include/asm/processor_32.h @@ -109,9 +109,6 @@ static inline void start_thread(struct pt_regs * regs, unsigned long pc, #define release_thread(tsk) do { } while(0) extern pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); -/* Prepare to copy thread state - unlazy all lazy status */ -#define prepare_to_copy(tsk) do { } while (0) - extern unsigned long get_wchan(struct task_struct *); #define task_pt_regs(tsk) ((tsk)->thread.kregs) diff --git a/arch/sparc/include/asm/processor_64.h b/arch/sparc/include/asm/processor_64.h index e713db249931..67df5cc10011 100644 --- a/arch/sparc/include/asm/processor_64.h +++ b/arch/sparc/include/asm/processor_64.h @@ -186,9 +186,6 @@ do { \ /* Free all resources held by a thread. 
*/ #define release_thread(tsk) do { } while (0) -/* Prepare to copy thread state - unlazy all lazy status */ -#define prepare_to_copy(tsk) do { } while (0) - extern pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); extern unsigned long get_wchan(struct task_struct *task); diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h index 34c1e01ffb5e..15cd8a4a06ce 100644 --- a/arch/tile/include/asm/processor.h +++ b/arch/tile/include/asm/processor.h @@ -210,9 +210,6 @@ static inline void release_thread(struct task_struct *dead_task) /* Nothing for now */ } -/* Prepare to copy thread state - unlazy all lazy status. */ -#define prepare_to_copy(tsk) do { } while (0) - extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); extern int do_work_pending(struct pt_regs *regs, u32 flags); diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h index 98d01bc4fa92..63b716052d20 100644 --- a/arch/um/include/asm/processor-generic.h +++ b/arch/um/include/asm/processor-generic.h @@ -76,11 +76,6 @@ static inline void release_thread(struct task_struct *task) extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); -static inline void prepare_to_copy(struct task_struct *tsk) -{ -} - - extern unsigned long thread_saved_pc(struct task_struct *t); static inline void mm_copy_segments(struct mm_struct *from_mm, diff --git a/arch/unicore32/include/asm/processor.h b/arch/unicore32/include/asm/processor.h index f0d780a51f9b..14382cb09657 100644 --- a/arch/unicore32/include/asm/processor.h +++ b/arch/unicore32/include/asm/processor.h @@ -68,9 +68,6 @@ struct task_struct; /* Free all resources held by a thread. 
*/ extern void release_thread(struct task_struct *); -/* Prepare to copy thread state - unlazy all lazy status */ -#define prepare_to_copy(tsk) do { } while (0) - unsigned long get_wchan(struct task_struct *p); #define cpu_relax() barrier() diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 4fa7dcceb6c0..97fe04318e95 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -579,9 +579,6 @@ extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); -/* Prepare to copy thread state - unlazy all lazy state */ -extern void prepare_to_copy(struct task_struct *tsk); - unsigned long get_wchan(struct task_struct *p); /* diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 1d92a5ab6e8b..b7e1e0e53987 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -47,10 +47,16 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister); struct kmem_cache *task_xstate_cachep; EXPORT_SYMBOL_GPL(task_xstate_cachep); +/* + * this gets called so that we can store lazy state into memory and copy the + * current task into the new thread. + */ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { int ret; + unlazy_fpu(src); + *dst = *src; if (fpu_allocated(&src->thread.fpu)) { memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu)); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index ae6847303e26..2aa57dc909d6 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -126,15 +126,6 @@ void release_thread(struct task_struct *dead_task) release_vm86_irqs(dead_task); } -/* - * This gets called before we allocate a new thread and copy - * the current task into it. 
- */ -void prepare_to_copy(struct task_struct *tsk) -{ - unlazy_fpu(tsk); -} - int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 43d8b48b23e6..c4c0645a4011 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -145,15 +145,6 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls) return get_desc_base(&t->thread.tls_array[tls]); } -/* - * This gets called before we allocate a new thread and copy - * the current task into it. - */ -void prepare_to_copy(struct task_struct *tsk) -{ - unlazy_fpu(tsk); -} - int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) diff --git a/arch/xtensa/include/asm/processor.h b/arch/xtensa/include/asm/processor.h index 3acb26e8dead..5c371d8d4528 100644 --- a/arch/xtensa/include/asm/processor.h +++ b/arch/xtensa/include/asm/processor.h @@ -168,9 +168,6 @@ struct mm_struct; /* Free all resources held by a thread. */ #define release_thread(thread) do { } while(0) -/* Prepare to copy thread state - unlazy all lazy status */ -extern void prepare_to_copy(struct task_struct*); - /* Create a kernel thread without removing it from tasklists */ extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 6a2d6edf8f72..9b306e550e3f 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -140,13 +140,16 @@ void flush_thread(void) } /* - * This is called before the thread is copied. + * this gets called so that we can store coprocessor state into memory and + * copy the current task into the new thread. 
*/ -void prepare_to_copy(struct task_struct *tsk) +int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { #if XTENSA_HAVE_COPROCESSORS - coprocessor_flush_all(task_thread_info(tsk)); + coprocessor_flush_all(task_thread_info(src)); #endif + *dst = *src; + return 0; } /* diff --git a/kernel/fork.c b/kernel/fork.c index 687a15d56243..7aed746ff47c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -261,8 +261,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) int node = tsk_fork_get_node(orig); int err; - prepare_to_copy(orig); - tsk = alloc_task_struct_node(node); if (!tsk) return NULL; -- cgit v1.2.3 From d75f1b391f5ef73016d14bc6f7e4725820ebaa5b Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 16 May 2012 15:03:53 -0700 Subject: x86, xsave: remove thread_has_fpu() bug check in __sanitize_i387_state() Code paths like fork(), exit() and signal handling flush the fpu state explicitly to the structures in memory. BUG_ON() in __sanitize_i387_state() is checking that the fpu state is not live any more. But for preempt kernels, task can be scheduled out and in at any place and the preload_fpu logic during context switch can make the fpu registers live again. For example, consider a 64-bit Task which uses fpu frequently and as such you will find its fpu_counter mostly non-zero. During its time slice, kernel used fpu by doing kernel_fpu_begin/kernel_fpu_end(). After this, in the same scheduling slice, task-A got a signal to handle. Then during the signal setup path we got preempted when we are just before the sanitize_i387_state() in arch/x86/kernel/xsave.c:save_i387_xstate(). And when we come back we will have the fpu registers live that can hit the bug_on. 
Similarly during core dump, other threads can context-switch in and out (because of spurious wakeups while waiting for the coredump to finish in kernel/exit.c:exit_mm()) and the main thread dumping core can run into this bug when it finds some other thread with its fpu registers live on some other cpu. So remove the paranoid check for now, even though it caught a bug in the multi-threaded core dump case (fixed in the previous patch). Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1336692811-30576-3-git-send-email-suresh.b.siddha@intel.com Cc: Oleg Nesterov Cc: Linus Torvalds Signed-off-by: H. Peter Anvin --- arch/x86/kernel/xsave.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index e62728e30b01..bd18149b2b0f 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -48,8 +48,6 @@ void __sanitize_i387_state(struct task_struct *tsk) if (!fx) return; - BUG_ON(__thread_has_fpu(tsk)); - xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; /* -- cgit v1.2.3 From 1dcc8d7ba235a316a056f993e88f0d18b92c60d9 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 16 May 2012 15:03:54 -0700 Subject: x86, fpu: drop the fpu state during thread exit There is no need to save any active fpu state to the task structure memory if the task is dead. Just drop the state instead. For example, this saved some 1770 xsave's during the system boot of a two socket Xeon system. Suggested-by: Linus Torvalds Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1336692811-30576-4-git-send-email-suresh.b.siddha@intel.com Cc: Oleg Nesterov Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/process.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b7e1e0e53987..1219fe2be8f3 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -87,6 +87,16 @@ void arch_task_cache_init(void) SLAB_PANIC | SLAB_NOTRACK, NULL); } +static inline void drop_fpu(struct task_struct *tsk) +{ + /* + * Forget coprocessor state.. + */ + tsk->fpu_counter = 0; + clear_fpu(tsk); + clear_used_math(); +} + /* * Free current thread data structures etc.. */ @@ -109,6 +119,8 @@ void exit_thread(void) put_cpu(); kfree(bp); } + + drop_fpu(me); } void show_regs(struct pt_regs *regs) @@ -149,12 +161,7 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - /* - * Forget coprocessor state.. - */ - tsk->fpu_counter = 0; - clear_fpu(tsk); - clear_used_math(); + drop_fpu(tsk); } static void hard_disable_TSC(void) -- cgit v1.2.3 From e4f5d5440bb860a3e8942ca8f7277a7f31798965 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Apr 2012 09:13:18 -0400 Subject: ftrace/x86: Have x86 ftrace use the ftrace_modify_all_code() To remove duplicate code, have the ftrace arch_ftrace_update_code() use the generic ftrace_modify_all_code(). This requires that the default ftrace_replace_code() becomes a weak function so that an arch may override it. 
Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 15 ++------------- include/linux/ftrace.h | 1 + kernel/trace/ftrace.c | 4 ++-- 3 files changed, 5 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 4243e8bbdcb1..32ff36596ab1 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -435,7 +435,7 @@ static void run_sync(void) local_irq_disable(); } -static void ftrace_replace_code(int enable) +void ftrace_replace_code(int enable) { struct ftrace_rec_iter *iter; struct dyn_ftrace *rec; @@ -493,18 +493,7 @@ void arch_ftrace_update_code(int command) { modifying_ftrace_code++; - if (command & FTRACE_UPDATE_CALLS) - ftrace_replace_code(1); - else if (command & FTRACE_DISABLE_CALLS) - ftrace_replace_code(0); - - if (command & FTRACE_UPDATE_TRACE_FUNC) - ftrace_update_ftrace_func(ftrace_trace_function); - - if (command & FTRACE_START_FUNC_RET) - ftrace_enable_ftrace_graph_caller(); - else if (command & FTRACE_STOP_FUNC_RET) - ftrace_disable_ftrace_graph_caller(); + ftrace_modify_all_code(command); modifying_ftrace_code--; } diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index cd72ace7ade3..55e6d63d46d0 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -314,6 +314,7 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable); /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern int ftrace_dyn_arch_init(void *data); +extern void ftrace_replace_code(int enable); extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3c345825cc23..a008663d86c8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1683,7 +1683,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) return -1; /* unknow ftrace bug */ } -static void ftrace_replace_code(int update) +void __weak 
ftrace_replace_code(int enable) { struct dyn_ftrace *rec; struct ftrace_page *pg; @@ -1693,7 +1693,7 @@ static void ftrace_replace_code(int update) return; do_for_each_ftrace_rec(pg, rec) { - failed = __ftrace_replace_code(rec, update); + failed = __ftrace_replace_code(rec, enable); if (failed) { ftrace_bug(failed, rec->ip); /* Stop processing */ -- cgit v1.2.3 From db2e034d2c55e1f273ed820cc3edcdbc73d0292c Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 17 May 2012 08:31:29 +0100 Subject: x86/vga: fix build with efi disabled. Reported by sfr on -next merge. Signed-off-by: Dave Airlie --- arch/x86/pci/fixup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 01635537d72e..82487d3d5879 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -6,8 +6,8 @@ #include #include #include +#include #include -#include static void __devinit pci_fixup_i450nx(struct pci_dev *d) { -- cgit v1.2.3 From 8e7fbcbc22c12414bcc9dfdd683637f58fb32759 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 Jan 2012 11:28:35 +0100 Subject: sched: Remove stale power aware scheduling remnants and dysfunctional knobs It's been broken forever (i.e. it's not scheduling in a power aware fashion), as reported by Suresh and others sending patches, and nobody cares enough to fix it properly ... so remove it to make space free for something better. There's various problems with the code as it stands today, first and foremost the user interface which is bound to topology levels and has multiple values per level. This results in a state explosion which the administrator or distro needs to master and almost nobody does. 
Furthermore large configuration state spaces aren't good, it means the thing doesn't just work right because it's either under so many impossible to meet constraints, or even if there's an achievable state workloads have to be aware of it precisely and can never meet it for dynamic workloads. So pushing this kind of decision to user-space was a bad idea even with a single knob - it's exponentially worse with knobs on every node of the topology. There is a proposal to replace the user interface with a single 3 state knob: sched_balance_policy := { performance, power, auto } where 'auto' would be the preferred default which looks at things like Battery/AC mode and possible cpufreq state or whatever the hw exposes to show us power use expectations - but there's been no progress on it in the past many months. Aside from that, the actual implementation of the various knobs is known to be broken. There have been sporadic attempts at fixing things but these always stop short of reaching a mergeable state. Therefore this wholesale removal with the hopes of spurring people who care to come forward once again and work on a coherent replacement. 
Signed-off-by: Peter Zijlstra Cc: Suresh Siddha Cc: Arjan van de Ven Cc: Vincent Guittot Cc: Vaidyanathan Srinivasan Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1326104915.2442.53.camel@twins Signed-off-by: Ingo Molnar --- Documentation/ABI/testing/sysfs-devices-system-cpu | 25 -- Documentation/scheduler/sched-domains.txt | 4 - arch/x86/kernel/smpboot.c | 3 +- drivers/base/cpu.c | 4 - include/linux/cpu.h | 2 - include/linux/sched.h | 47 ---- include/linux/topology.h | 5 - kernel/sched/core.c | 94 ------- kernel/sched/fair.c | 275 +-------------------- tools/power/cpupower/man/cpupower-set.1 | 9 - tools/power/cpupower/utils/helpers/sysfs.c | 35 +-- 11 files changed, 5 insertions(+), 498 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index e7be75b96e4b..5dab36448b44 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -9,31 +9,6 @@ Description: /sys/devices/system/cpu/cpu#/ -What: /sys/devices/system/cpu/sched_mc_power_savings - /sys/devices/system/cpu/sched_smt_power_savings -Date: June 2006 -Contact: Linux kernel mailing list -Description: Discover and adjust the kernel's multi-core scheduler support. - - Possible values are: - - 0 - No power saving load balance (default value) - 1 - Fill one thread/core/package first for long running threads - 2 - Also bias task wakeups to semi-idle cpu package for power - savings - - sched_mc_power_savings is dependent upon SCHED_MC, which is - itself architecture dependent. - - sched_smt_power_savings is dependent upon SCHED_SMT, which - is itself architecture dependent. - - The two files are independent of each other. It is possible - that one file may be present without the other. - - Introduced by git commit 5c45bf27. 
- - What: /sys/devices/system/cpu/kernel_max /sys/devices/system/cpu/offline /sys/devices/system/cpu/online diff --git a/Documentation/scheduler/sched-domains.txt b/Documentation/scheduler/sched-domains.txt index b7ee379b651b..443f0c76bab4 100644 --- a/Documentation/scheduler/sched-domains.txt +++ b/Documentation/scheduler/sched-domains.txt @@ -61,10 +61,6 @@ The implementor should read comments in include/linux/sched.h: struct sched_domain fields, SD_FLAG_*, SD_*_INIT to get an idea of the specifics and what to tune. -For SMT, the architecture must define CONFIG_SCHED_SMT and provide a -cpumask_t cpu_sibling_map[NR_CPUS], where cpu_sibling_map[i] is the mask of -all "i"'s siblings as well as "i" itself. - Architectures may retain the regular override the default SD_*_INIT flags while using the generic domain builder in kernel/sched.c if they wish to retain the traditional SMT->SMP->NUMA topology (or some subset of that). This diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index e84c1bbea339..256c20cc5e96 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -429,8 +429,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu) * For perf, we return last level cache shared map. 
* And for power savings, we return cpu_core_map */ - if ((sched_mc_power_savings || sched_smt_power_savings) && - !(cpu_has(c, X86_FEATURE_AMD_DCM))) + if (!(cpu_has(c, X86_FEATURE_AMD_DCM))) return cpu_core_mask(cpu); else return cpu_llc_shared_mask(cpu); diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index adf937bf4091..63452943abd1 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -330,8 +330,4 @@ void __init cpu_dev_init(void) panic("Failed to register CPU subsystem"); cpu_dev_register_generic(); - -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - sched_create_sysfs_power_savings_entries(cpu_subsys.dev_root); -#endif } diff --git a/include/linux/cpu.h b/include/linux/cpu.h index ee28844ae68e..7230bb59a06f 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -36,8 +36,6 @@ extern void cpu_remove_dev_attr(struct device_attribute *attr); extern int cpu_add_dev_attr_group(struct attribute_group *attrs); extern void cpu_remove_dev_attr_group(struct attribute_group *attrs); -extern int sched_create_sysfs_power_savings_entries(struct device *dev); - #ifdef CONFIG_HOTPLUG_CPU extern void unregister_cpu(struct cpu *cpu); extern ssize_t arch_cpu_probe(const char *, size_t); diff --git a/include/linux/sched.h b/include/linux/sched.h index 4a559bf0622f..3d644809c9db 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -855,61 +855,14 @@ enum cpu_idle_type { #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ #define SD_PREFER_LOCAL 0x0040 /* Prefer to keep tasks local to this domain */ #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ -#define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a 
sibling domain */ #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ -enum powersavings_balance_level { - POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ - POWERSAVINGS_BALANCE_BASIC, /* Fill one thread/core/package - * first for long running threads - */ - POWERSAVINGS_BALANCE_WAKEUP, /* Also bias task wakeups to semi-idle - * cpu package for power savings - */ - MAX_POWERSAVINGS_BALANCE_LEVELS -}; - -extern int sched_mc_power_savings, sched_smt_power_savings; - -static inline int sd_balance_for_mc_power(void) -{ - if (sched_smt_power_savings) - return SD_POWERSAVINGS_BALANCE; - - if (!sched_mc_power_savings) - return SD_PREFER_SIBLING; - - return 0; -} - -static inline int sd_balance_for_package_power(void) -{ - if (sched_mc_power_savings | sched_smt_power_savings) - return SD_POWERSAVINGS_BALANCE; - - return SD_PREFER_SIBLING; -} - extern int __weak arch_sd_sibiling_asym_packing(void); -/* - * Optimise SD flags for power savings: - * SD_BALANCE_NEWIDLE helps aggressive task consolidation and power savings. 
- * Keep default SD flags if sched_{smt,mc}_power_saving=0 - */ - -static inline int sd_power_saving_flags(void) -{ - if (sched_mc_power_savings | sched_smt_power_savings) - return SD_BALANCE_NEWIDLE; - - return 0; -} - struct sched_group_power { atomic_t ref; /* diff --git a/include/linux/topology.h b/include/linux/topology.h index 4f59bf36f0af..09558d1daacd 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -98,7 +98,6 @@ int arch_update_cpu_topology(void); | 0*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ | 1*SD_SHARE_CPUPOWER \ - | 0*SD_POWERSAVINGS_BALANCE \ | 1*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ | 0*SD_PREFER_SIBLING \ @@ -134,8 +133,6 @@ int arch_update_cpu_topology(void); | 0*SD_SHARE_CPUPOWER \ | 1*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ - | sd_balance_for_mc_power() \ - | sd_power_saving_flags() \ , \ .last_balance = jiffies, \ .balance_interval = 1, \ @@ -167,8 +164,6 @@ int arch_update_cpu_topology(void); | 0*SD_SHARE_CPUPOWER \ | 0*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ - | sd_balance_for_package_power() \ - | sd_power_saving_flags() \ , \ .last_balance = jiffies, \ .balance_interval = 1, \ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bd314d7cd9f8..24ca677b5457 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5929,8 +5929,6 @@ static const struct cpumask *cpu_cpu_mask(int cpu) return cpumask_of_node(cpu_to_node(cpu)); } -int sched_smt_power_savings = 0, sched_mc_power_savings = 0; - struct sd_data { struct sched_domain **__percpu sd; struct sched_group **__percpu sg; @@ -6322,7 +6320,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | 0*SD_WAKE_AFFINE | 0*SD_PREFER_LOCAL | 0*SD_SHARE_CPUPOWER - | 0*SD_POWERSAVINGS_BALANCE | 0*SD_SHARE_PKG_RESOURCES | 1*SD_SERIALIZE | 0*SD_PREFER_SIBLING @@ -6819,97 +6816,6 @@ match2: mutex_unlock(&sched_domains_mutex); } -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -static void reinit_sched_domains(void) -{ - get_online_cpus(); 
- - /* Destroy domains first to force the rebuild */ - partition_sched_domains(0, NULL, NULL); - - rebuild_sched_domains(); - put_online_cpus(); -} - -static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) -{ - unsigned int level = 0; - - if (sscanf(buf, "%u", &level) != 1) - return -EINVAL; - - /* - * level is always be positive so don't check for - * level < POWERSAVINGS_BALANCE_NONE which is 0 - * What happens on 0 or 1 byte write, - * need to check for count as well? - */ - - if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) - return -EINVAL; - - if (smt) - sched_smt_power_savings = level; - else - sched_mc_power_savings = level; - - reinit_sched_domains(); - - return count; -} - -#ifdef CONFIG_SCHED_MC -static ssize_t sched_mc_power_savings_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", sched_mc_power_savings); -} -static ssize_t sched_mc_power_savings_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - return sched_power_savings_store(buf, count, 0); -} -static DEVICE_ATTR(sched_mc_power_savings, 0644, - sched_mc_power_savings_show, - sched_mc_power_savings_store); -#endif - -#ifdef CONFIG_SCHED_SMT -static ssize_t sched_smt_power_savings_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", sched_smt_power_savings); -} -static ssize_t sched_smt_power_savings_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - return sched_power_savings_store(buf, count, 1); -} -static DEVICE_ATTR(sched_smt_power_savings, 0644, - sched_smt_power_savings_show, - sched_smt_power_savings_store); -#endif - -int __init sched_create_sysfs_power_savings_entries(struct device *dev) -{ - int err = 0; - -#ifdef CONFIG_SCHED_SMT - if (smt_capable()) - err = device_create_file(dev, &dev_attr_sched_smt_power_savings); -#endif -#ifdef CONFIG_SCHED_MC - if (!err && 
mc_capable()) - err = device_create_file(dev, &dev_attr_sched_mc_power_savings); -#endif - return err; -} -#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ - /* * Update cpusets according to cpu_active mask. If cpusets are * disabled, cpuset_update_active_cpus() becomes a simple wrapper diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0b42f4487329..940e6d17cf96 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) * If power savings logic is enabled for a domain, see if we * are not overloaded, if so, don't balance wider. */ - if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { + if (tmp->flags & (SD_PREFER_LOCAL)) { unsigned long power = 0; unsigned long nr_running = 0; unsigned long capacity; @@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); - if (tmp->flags & SD_POWERSAVINGS_BALANCE) - nr_running /= 2; - if (nr_running < capacity) want_sd = 0; } @@ -3435,14 +3432,6 @@ struct sd_lb_stats { unsigned int busiest_group_weight; int group_imb; /* Is there imbalance in this sd */ -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - int power_savings_balance; /* Is powersave balance needed for this sd */ - struct sched_group *group_min; /* Least loaded group in sd */ - struct sched_group *group_leader; /* Group which relieves group_min */ - unsigned long min_load_per_task; /* load_per_task in group_min */ - unsigned long leader_nr_running; /* Nr running of group_leader */ - unsigned long min_nr_running; /* Nr running of group_min */ -#endif }; /* @@ -3486,147 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } - -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -/** - * init_sd_power_savings_stats - Initialize power savings statistics for - * the given sched_domain, during load balancing. 
- * - * @sd: Sched domain whose power-savings statistics are to be initialized. - * @sds: Variable containing the statistics for sd. - * @idle: Idle status of the CPU at which we're performing load-balancing. - */ -static inline void init_sd_power_savings_stats(struct sched_domain *sd, - struct sd_lb_stats *sds, enum cpu_idle_type idle) -{ - /* - * Busy processors will not participate in power savings - * balance. - */ - if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) - sds->power_savings_balance = 0; - else { - sds->power_savings_balance = 1; - sds->min_nr_running = ULONG_MAX; - sds->leader_nr_running = 0; - } -} - -/** - * update_sd_power_savings_stats - Update the power saving stats for a - * sched_domain while performing load balancing. - * - * @group: sched_group belonging to the sched_domain under consideration. - * @sds: Variable containing the statistics of the sched_domain - * @local_group: Does group contain the CPU for which we're performing - * load balancing ? - * @sgs: Variable containing the statistics of the group. - */ -static inline void update_sd_power_savings_stats(struct sched_group *group, - struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) -{ - - if (!sds->power_savings_balance) - return; - - /* - * If the local group is idle or completely loaded - * no need to do power savings balance at this domain - */ - if (local_group && (sds->this_nr_running >= sgs->group_capacity || - !sds->this_nr_running)) - sds->power_savings_balance = 0; - - /* - * If a group is already running at full capacity or idle, - * don't include that group in power savings calculations - */ - if (!sds->power_savings_balance || - sgs->sum_nr_running >= sgs->group_capacity || - !sgs->sum_nr_running) - return; - - /* - * Calculate the group which has the least non-idle load. 
- * This is the group from where we need to pick up the load - * for saving power - */ - if ((sgs->sum_nr_running < sds->min_nr_running) || - (sgs->sum_nr_running == sds->min_nr_running && - group_first_cpu(group) > group_first_cpu(sds->group_min))) { - sds->group_min = group; - sds->min_nr_running = sgs->sum_nr_running; - sds->min_load_per_task = sgs->sum_weighted_load / - sgs->sum_nr_running; - } - - /* - * Calculate the group which is almost near its - * capacity but still has some space to pick up some load - * from other group and save more power - */ - if (sgs->sum_nr_running + 1 > sgs->group_capacity) - return; - - if (sgs->sum_nr_running > sds->leader_nr_running || - (sgs->sum_nr_running == sds->leader_nr_running && - group_first_cpu(group) < group_first_cpu(sds->group_leader))) { - sds->group_leader = group; - sds->leader_nr_running = sgs->sum_nr_running; - } -} - -/** - * check_power_save_busiest_group - see if there is potential for some power-savings balance - * @env: load balance environment - * @sds: Variable containing the statistics of the sched_domain - * under consideration. - * - * Description: - * Check if we have potential to perform some power-savings balance. - * If yes, set the busiest group to be the least loaded group in the - * sched_domain, so that it's CPUs can be put to idle. - * - * Returns 1 if there is potential to perform power-savings balance. - * Else returns 0. 
- */ -static inline -int check_power_save_busiest_group(struct lb_env *env, struct sd_lb_stats *sds) -{ - if (!sds->power_savings_balance) - return 0; - - if (sds->this != sds->group_leader || - sds->group_leader == sds->group_min) - return 0; - - env->imbalance = sds->min_load_per_task; - sds->busiest = sds->group_min; - - return 1; - -} -#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -static inline void init_sd_power_savings_stats(struct sched_domain *sd, - struct sd_lb_stats *sds, enum cpu_idle_type idle) -{ - return; -} - -static inline void update_sd_power_savings_stats(struct sched_group *group, - struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) -{ - return; -} - -static inline -int check_power_save_busiest_group(struct lb_env *env, struct sd_lb_stats *sds) -{ - return 0; -} -#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ - - unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) { return SCHED_POWER_SCALE; @@ -3932,7 +3780,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; - init_sd_power_savings_stats(env->sd, sds, env->idle); load_idx = get_sd_load_idx(env->sd, env->idle); do { @@ -3981,7 +3828,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, sds->group_imb = sgs.group_imb; } - update_sd_power_savings_stats(sg, sds, local_group, &sgs); sg = sg->next; } while (sg != env->sd->groups); } @@ -4276,12 +4122,6 @@ force_balance: return sds.busiest; out_balanced: - /* - * There is no obvious imbalance. But check if we can do some balancing - * to save power. 
- */ - if (check_power_save_busiest_group(env, &sds)) - return sds.busiest; ret: env->imbalance = 0; return NULL; @@ -4359,28 +4199,6 @@ static int need_active_balance(struct lb_env *env) */ if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu) return 1; - - /* - * The only task running in a non-idle cpu can be moved to this - * cpu in an attempt to completely freeup the other CPU - * package. - * - * The package power saving logic comes from - * find_busiest_group(). If there are no imbalance, then - * f_b_g() will return NULL. However when sched_mc={1,2} then - * f_b_g() will select a group from which a running task may be - * pulled to this cpu in order to make the other package idle. - * If there is no opportunity to make a package idle and if - * there are no imbalance, then f_b_g() will return NULL and no - * action will be taken in load_balance_newidle(). - * - * Under normal task pull operation due to imbalance, there - * will be more than one task in the source run queue and - * move_tasks() will succeed. ld_moved will be true and this - * active balance code will not be triggered. - */ - if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) - return 0; } return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); @@ -4700,104 +4518,15 @@ static struct { unsigned long next_balance; /* in jiffy units */ } nohz ____cacheline_aligned; -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -/** - * lowest_flag_domain - Return lowest sched_domain containing flag. - * @cpu: The cpu whose lowest level of sched domain is to - * be returned. - * @flag: The flag to check for the lowest sched_domain - * for the given cpu. - * - * Returns the lowest sched_domain of a cpu which contains the given flag. 
- */ -static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) -{ - struct sched_domain *sd; - - for_each_domain(cpu, sd) - if (sd->flags & flag) - break; - - return sd; -} - -/** - * for_each_flag_domain - Iterates over sched_domains containing the flag. - * @cpu: The cpu whose domains we're iterating over. - * @sd: variable holding the value of the power_savings_sd - * for cpu. - * @flag: The flag to filter the sched_domains to be iterated. - * - * Iterates over all the scheduler domains for a given cpu that has the 'flag' - * set, starting from the lowest sched_domain to the highest. - */ -#define for_each_flag_domain(cpu, sd, flag) \ - for (sd = lowest_flag_domain(cpu, flag); \ - (sd && (sd->flags & flag)); sd = sd->parent) - -/** - * find_new_ilb - Finds the optimum idle load balancer for nomination. - * @cpu: The cpu which is nominating a new idle_load_balancer. - * - * Returns: Returns the id of the idle load balancer if it exists, - * Else, returns >= nr_cpu_ids. - * - * This algorithm picks the idle load balancer such that it belongs to a - * semi-idle powersavings sched_domain. The idea is to try and avoid - * completely idle packages/cores just for the purpose of idle load balancing - * when there are other idle cpu's which are better suited for that job. - */ -static int find_new_ilb(int cpu) +static inline int find_new_ilb(int call_cpu) { int ilb = cpumask_first(nohz.idle_cpus_mask); - struct sched_group *ilbg; - struct sched_domain *sd; - /* - * Have idle load balancer selection from semi-idle packages only - * when power-aware load balancing is enabled - */ - if (!(sched_smt_power_savings || sched_mc_power_savings)) - goto out_done; - - /* - * Optimize for the case when we have no idle CPUs or only one - * idle CPU. 
Don't walk the sched_domain hierarchy in such cases - */ - if (cpumask_weight(nohz.idle_cpus_mask) < 2) - goto out_done; - - rcu_read_lock(); - for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { - ilbg = sd->groups; - - do { - if (ilbg->group_weight != - atomic_read(&ilbg->sgp->nr_busy_cpus)) { - ilb = cpumask_first_and(nohz.idle_cpus_mask, - sched_group_cpus(ilbg)); - goto unlock; - } - - ilbg = ilbg->next; - - } while (ilbg != sd->groups); - } -unlock: - rcu_read_unlock(); - -out_done: if (ilb < nr_cpu_ids && idle_cpu(ilb)) return ilb; return nr_cpu_ids; } -#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ -static inline int find_new_ilb(int call_cpu) -{ - return nr_cpu_ids; -} -#endif /* * Kick a CPU to do the nohz balancing, if it is time for it. We pick the diff --git a/tools/power/cpupower/man/cpupower-set.1 b/tools/power/cpupower/man/cpupower-set.1 index c4954a9fe4e7..9dbd536518ab 100644 --- a/tools/power/cpupower/man/cpupower-set.1 +++ b/tools/power/cpupower/man/cpupower-set.1 @@ -85,15 +85,6 @@ Possible values are: savings .RE -sched_mc_power_savings is dependent upon SCHED_MC, which is -itself architecture dependent. - -sched_smt_power_savings is dependent upon SCHED_SMT, which -is itself architecture dependent. - -The two files are independent of each other. It is possible -that one file may be present without the other. 
- .SH "SEE ALSO" cpupower-info(1), cpupower-monitor(1), powertop(1) .PP diff --git a/tools/power/cpupower/utils/helpers/sysfs.c b/tools/power/cpupower/utils/helpers/sysfs.c index c6343024a611..96e28c124b5c 100644 --- a/tools/power/cpupower/utils/helpers/sysfs.c +++ b/tools/power/cpupower/utils/helpers/sysfs.c @@ -362,22 +362,7 @@ char *sysfs_get_cpuidle_driver(void) */ int sysfs_get_sched(const char *smt_mc) { - unsigned long value; - char linebuf[MAX_LINE_LEN]; - char *endp; - char path[SYSFS_PATH_MAX]; - - if (strcmp("mc", smt_mc) && strcmp("smt", smt_mc)) - return -EINVAL; - - snprintf(path, sizeof(path), - PATH_TO_CPU "sched_%s_power_savings", smt_mc); - if (sysfs_read_file(path, linebuf, MAX_LINE_LEN) == 0) - return -1; - value = strtoul(linebuf, &endp, 0); - if (endp == linebuf || errno == ERANGE) - return -1; - return value; + return -ENODEV; } /* @@ -388,21 +373,5 @@ int sysfs_get_sched(const char *smt_mc) */ int sysfs_set_sched(const char *smt_mc, int val) { - char linebuf[MAX_LINE_LEN]; - char path[SYSFS_PATH_MAX]; - struct stat statbuf; - - if (strcmp("mc", smt_mc) && strcmp("smt", smt_mc)) - return -EINVAL; - - snprintf(path, sizeof(path), - PATH_TO_CPU "sched_%s_power_savings", smt_mc); - sprintf(linebuf, "%d", val); - - if (stat(path, &statbuf) != 0) - return -ENODEV; - - if (sysfs_write_file(path, linebuf, MAX_LINE_LEN) == 0) - return -1; - return 0; + return -ENODEV; } -- cgit v1.2.3 From 80b3e557371205566a71e569fbfcce5b11f92dbe Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Tue, 15 May 2012 18:44:15 +0100 Subject: x86: Fix boot on Twinhead H12Y Despite lots of investigation into why this is needed we don't know or have an elegant cure. The only answer found on this laptop is to mark a problem region as used so that Linux doesn't put anything there. Currently all the users add reserve= command lines and anyone not knowing this needs to find the magic page that documents it. Automate it instead. 
Signed-off-by: Alan Cox Tested-and-bugfixed-by: Arne Fitzenreiter Resolves-bug: https://bugzilla.kernel.org/show_bug.cgi?id=10231 Link: http://lkml.kernel.org/r/20120515174347.5109.94551.stgit@bluebook Signed-off-by: Ingo Molnar --- arch/x86/pci/fixup.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index d0e6e403b4f6..5dd467bd6121 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -519,3 +519,20 @@ static void sb600_disable_hpet_bar(struct pci_dev *dev) } } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar); + +/* + * Twinhead H12Y needs us to block out a region otherwise we map devices + * there and any access kills the box. + * + * See: https://bugzilla.kernel.org/show_bug.cgi?id=10231 + * + * Match off the LPC and svid/sdid (older kernels lose the bridge subvendor) + */ +static void __devinit twinhead_reserve_killing_zone(struct pci_dev *dev) +{ + if (dev->subsystem_vendor == 0x14FF && dev->subsystem_device == 0xA003) { + pr_info("Reserving memory on Twinhead H12Y\n"); + request_mem_region(0xFFB00000, 0x100000, "twinhead"); + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_zone); -- cgit v1.2.3 From bb8187d35f820671d6dd76700d77a6b55f95e2c5 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 17 May 2012 19:06:13 -0400 Subject: MCA: delete all remaining traces of microchannel bus support. Hardware with MCA bus is limited to 386 and 486 class machines that are now 20+ years old and typically with less than 32MB of memory. A quick search on the internet, and you see that even the MCA hobbyist/enthusiast community has lost interest in the early 2000 era and never really even moved ahead from the 2.4 kernels to the 2.6 series. This deletes anything remaining related to CONFIG_MCA from core kernel code and from the x86 architecture. There is no point in carrying this any further into the future. 
One complication to watch for is inadvertently scooping up stuff relating to machine check, since there is overlap in the TLA name space (e.g. arch/x86/boot/mca.c). Cc: Thomas Gleixner Cc: James Bottomley Cc: x86@kernel.org Acked-by: Ingo Molnar Acked-by: H. Peter Anvin Signed-off-by: Paul Gortmaker --- Documentation/00-INDEX | 2 - Documentation/DocBook/Makefile | 2 +- Documentation/DocBook/kernel-api.tmpl | 13 - Documentation/DocBook/mcabook.tmpl | 107 -------- Documentation/devices.txt | 8 +- Documentation/eisa.txt | 2 +- Documentation/kernel-parameters.txt | 1 - Documentation/mca.txt | 313 ---------------------- MAINTAINERS | 13 - arch/frv/include/asm/processor.h | 1 - arch/x86/Kconfig | 10 - arch/x86/include/asm/mca.h | 43 --- arch/x86/include/asm/mca_dma.h | 201 -------------- arch/x86/include/asm/mpspec.h | 2 +- arch/x86/include/asm/mpspec_def.h | 3 +- arch/x86/kernel/Makefile | 1 - arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/kernel/apic/io_apic.c | 17 +- arch/x86/kernel/mca_32.c | 476 ---------------------------------- arch/x86/kernel/mpparse.c | 11 +- arch/x86/kernel/nmi.c | 12 - arch/x86/kernel/setup.c | 8 - arch/x86/kernel/time.c | 6 - arch/x86/kernel/traps.c | 4 - drivers/Makefile | 1 - drivers/mca/Kconfig | 14 - drivers/mca/Makefile | 7 - drivers/mca/mca-bus.c | 169 ------------ drivers/mca/mca-device.c | 218 ---------------- drivers/mca/mca-driver.c | 63 ----- drivers/mca/mca-legacy.c | 329 ----------------------- drivers/mca/mca-proc.c | 249 ------------------ drivers/message/i2o/i2o_proc.c | 13 - include/linux/i2o-dev.h | 2 +- include/linux/mca-legacy.h | 66 ----- include/linux/mca.h | 148 ----------- scripts/kconfig/mconf.c | 2 +- scripts/kconfig/nconf.c | 2 +- 38 files changed, 15 insertions(+), 2526 deletions(-) delete mode 100644 Documentation/DocBook/mcabook.tmpl delete mode 100644 Documentation/mca.txt delete mode 100644 arch/x86/include/asm/mca.h delete mode 100644 arch/x86/include/asm/mca_dma.h delete mode 100644 arch/x86/kernel/mca_32.c 
delete mode 100644 drivers/mca/Kconfig delete mode 100644 drivers/mca/Makefile delete mode 100644 drivers/mca/mca-bus.c delete mode 100644 drivers/mca/mca-device.c delete mode 100644 drivers/mca/mca-driver.c delete mode 100644 drivers/mca/mca-legacy.c delete mode 100644 drivers/mca/mca-proc.c delete mode 100644 include/linux/mca-legacy.h delete mode 100644 include/linux/mca.h (limited to 'arch/x86') diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index 2214f123a976..49c051380daf 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -218,8 +218,6 @@ m68k/ - directory with info about Linux on Motorola 68k architecture. magic-number.txt - list of magic numbers used to mark/protect kernel data structures. -mca.txt - - info on supporting Micro Channel Architecture (e.g. PS/2) systems. md.txt - info on boot arguments for the multiple devices driver. memory-barriers.txt diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile index 66725a3d30dc..bc3d9f8c0a90 100644 --- a/Documentation/DocBook/Makefile +++ b/Documentation/DocBook/Makefile @@ -6,7 +6,7 @@ # To add a new book the only step required is to add the book to the # list of DOCBOOKS. -DOCBOOKS := z8530book.xml mcabook.xml device-drivers.xml \ +DOCBOOKS := z8530book.xml device-drivers.xml \ kernel-hacking.xml kernel-locking.xml deviceiobook.xml \ writing_usb_driver.xml networking.xml \ kernel-api.xml filesystems.xml lsm.xml usb.xml kgdb.xml \ diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl index 7160652a8736..00687ee9d363 100644 --- a/Documentation/DocBook/kernel-api.tmpl +++ b/Documentation/DocBook/kernel-api.tmpl @@ -212,19 +212,6 @@ X!Edrivers/pci/hotplug.c PCI Hotplug Support Library !Edrivers/pci/hotplug/pci_hotplug_core.c - MCA Architecture - MCA Device Functions - - Refer to the file arch/x86/kernel/mca_32.c for more information. 
- - - - MCA Bus DMA -!Iarch/x86/include/asm/mca_dma.h - - diff --git a/Documentation/DocBook/mcabook.tmpl b/Documentation/DocBook/mcabook.tmpl deleted file mode 100644 index 467ccac6ec50..000000000000 --- a/Documentation/DocBook/mcabook.tmpl +++ /dev/null @@ -1,107 +0,0 @@ - - - - - - MCA Driver Programming Interface - - - - Alan - Cox - -
- alan@lxorguk.ukuu.org.uk -
-
-
- - David - Weinehall - - - Chris - Beauregard - -
- - - 2000 - Alan Cox - David Weinehall - Chris Beauregard - - - - - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - - - - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - - - - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - - - - For more details see the file COPYING in the source - distribution of Linux. - - -
- - - - - Introduction - - The MCA bus functions provide a generalised interface to find MCA - bus cards, to claim them for a driver, and to read and manipulate POS - registers without being aware of the motherboard internals or - certain deep magic specific to onboard devices. - - - The basic interface to the MCA bus devices is the slot. Each slot - is numbered and virtual slot numbers are assigned to the internal - devices. Using a pci_dev as other busses do does not really make - sense in the MCA context as the MCA bus resources require card - specific interpretation. - - - Finally the MCA bus functions provide a parallel set of DMA - functions mimicing the ISA bus DMA functions as closely as possible, - although also supporting the additional DMA functionality on the - MCA bus controllers. - - - - Known Bugs And Assumptions - - None. - - - - - Public Functions Provided -!Edrivers/mca/mca-legacy.c - - - - DMA Functions Provided -!Iarch/x86/include/asm/mca_dma.h - - -
diff --git a/Documentation/devices.txt b/Documentation/devices.txt index 00383186d8fb..c162be1c3234 100644 --- a/Documentation/devices.txt +++ b/Documentation/devices.txt @@ -846,13 +846,7 @@ Your cooperation is appreciated. ... 31 = /dev/tap15 16th Ethertap device - 36 block MCA ESDI hard disk - 0 = /dev/eda First ESDI disk whole disk - 64 = /dev/edb Second ESDI disk whole disk - ... - - Partitions are handled in the same way as IDE disks - (see major number 3). + 36 block OBSOLETE (was MCA ESDI hard disk) 37 char IDE tape 0 = /dev/ht0 First IDE tape diff --git a/Documentation/eisa.txt b/Documentation/eisa.txt index 38cf0c7b559f..a55e4910924e 100644 --- a/Documentation/eisa.txt +++ b/Documentation/eisa.txt @@ -179,7 +179,7 @@ CONFIG_ALPHA_JENSEN or CONFIG_EISA_VLB_PRIMING are set. Converting an EISA driver to the new API mostly involves *deleting* code (since probing is now in the core EISA code). Unfortunately, most -drivers share their probing routine between ISA, MCA and EISA. Special +drivers share their probing routine between ISA and EISA. Special care must be taken when ripping out the EISA code, so other busses won't suffer from these surgical strikes... diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index c1601e5a8b71..38cad53620cc 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -70,7 +70,6 @@ parameter is applicable: M68k M68k architecture is enabled. These options have more detailed description inside of Documentation/m68k/kernel-options.txt. - MCA MCA bus support is enabled. MDA MDA console support is enabled. MIPS MIPS architecture is enabled. MOUSE Appropriate mouse support is enabled.
diff --git a/Documentation/mca.txt b/Documentation/mca.txt deleted file mode 100644 index dfd130c2207d..000000000000 --- a/Documentation/mca.txt +++ /dev/null @@ -1,313 +0,0 @@ -i386 Micro Channel Architecture Support -======================================= - -MCA support is enabled using the CONFIG_MCA define. A machine with a MCA -bus will have the kernel variable MCA_bus set, assuming the BIOS feature -bits are set properly (see arch/i386/boot/setup.S for information on -how this detection is done). - -Adapter Detection -================= - -The ideal MCA adapter detection is done through the use of the -Programmable Option Select registers. Generic functions for doing -this have been added in include/linux/mca.h and arch/x86/kernel/mca_32.c. -Everything needed to detect adapters and read (and write) configuration -information is there. A number of MCA-specific drivers already use -this. The typical probe code looks like the following: - - #include - - unsigned char pos2, pos3, pos4, pos5; - struct net_device* dev; - int slot; - - if( MCA_bus ) { - slot = mca_find_adapter( ADAPTER_ID, 0 ); - if( slot == MCA_NOTFOUND ) { - return -ENODEV; - } - /* optional - see below */ - mca_set_adapter_name( slot, "adapter name & description" ); - mca_set_adapter_procfn( slot, dev_getinfo, dev ); - - /* read the POS registers. Most devices only use 2 and 3 */ - pos2 = mca_read_stored_pos( slot, 2 ); - pos3 = mca_read_stored_pos( slot, 3 ); - pos4 = mca_read_stored_pos( slot, 4 ); - pos5 = mca_read_stored_pos( slot, 5 ); - } else { - return -ENODEV; - } - - /* extract configuration from pos[2345] and set everything up */ - -Loadable modules should modify this to test that the specified IRQ and -IO ports (plus whatever other stuff) match. See 3c523.c for example -code (actually, smc-mca.c has a slightly more complex example that can -handle a list of adapter ids). - -Keep in mind that devices should never directly access the POS registers -(via inb(), outb(), etc). 
While it's generally safe, there is a small -potential for blowing up hardware when it's done at the wrong time. -Furthermore, accessing a POS register disables a device temporarily. -This is usually okay during startup, but do _you_ want to rely on it? -During initial configuration, mca_init() reads all the POS registers -into memory. mca_read_stored_pos() accesses that data. mca_read_pos() -and mca_write_pos() are also available for (safer) direct POS access, -but their use is _highly_ discouraged. mca_write_pos() is particularly -dangerous, as it is possible for adapters to be put in inconsistent -states (i.e. sharing IO address, etc) and may result in crashes, toasted -hardware, and blindness. - -User level drivers (such as the AGX X server) can use /proc/mca/pos to -find adapters (see below). - -Some MCA adapters can also be detected via the usual ISA-style device -probing (many SCSI adapters, for example). This sort of thing is highly -discouraged. Perfectly good information is available telling you what's -there, so there's no excuse for messing with random IO ports. However, -we MCA people still appreciate any ISA-style driver that will work with -our hardware. You take what you can get... - -Level-Triggered Interrupts -========================== - -Because MCA uses level-triggered interrupts, a few problems arise with -what might best be described as the ISA mindset and its effects on -drivers. These sorts of problems are expected to become less common as -more people use shared IRQs on PCI machines. - -In general, an interrupt must be acknowledged not only at the ICU (which -is done automagically by the kernel), but at the device level. In -particular, IRQ 0 must be reset after a timer interrupt (now done in -arch/x86/kernel/time.c) or the first timer interrupt hangs the system. -There were also problems with the 1.3.x floppy drivers, but that seems -to have been fixed. 
- -IRQs are also shareable, and most MCA-specific devices should be coded -with shared IRQs in mind. - -/proc/mca -========= - -/proc/mca is a directory containing various files for adapters and -other stuff. - - /proc/mca/pos Straight listing of POS registers - /proc/mca/slot[1-8] Information on adapter in specific slot - /proc/mca/video Same for integrated video - /proc/mca/scsi Same for integrated SCSI - /proc/mca/machine Machine information - -See Appendix A for a sample. - -Device drivers can easily add their own information function for -specific slots (including integrated ones) via the -mca_set_adapter_procfn() call. Drivers that support this are ESDI, IBM -SCSI, and 3c523. If a device is also a module, make sure that the proc -function is removed in the module cleanup. This will require storing -the slot information in a private structure somewhere. See the 3c523 -driver for details. - -Your typical proc function will look something like this: - - static int - dev_getinfo( char* buf, int slot, void* d ) { - struct net_device* dev = (struct net_device*) d; - int len = 0; - - len += sprintf( buf+len, "Device: %s\n", dev->name ); - len += sprintf( buf+len, "IRQ: %d\n", dev->irq ); - len += sprintf( buf+len, "IO Port: %#lx-%#lx\n", ... ); - ... - - return len; - } - -Some of the standard MCA information will already be printed, so don't -bother repeating it. Don't try putting in more than 3K of information. - -Enable this function with: - mca_set_adapter_procfn( slot, dev_getinfo, dev ); - -Disable it with: - mca_set_adapter_procfn( slot, NULL, NULL ); - -It is also recommended that, even if you don't write a proc function, to -set the name of the adapter (i.e. "PS/2 ESDI Controller") via -mca_set_adapter_name( int slot, char* name ). - -MCA Device Drivers -================== - -Currently, there are a number of MCA-specific device drivers. - -1) PS/2 SCSI - drivers/scsi/ibmmca.c - drivers/scsi/ibmmca.h - The driver for the IBM SCSI subsystem. 
Includes both integrated - controllers and adapter cards. May require command-line arg - "ibmmcascsi=io_port" to force detection of an adapter. If you have a - machine with a front-panel display (i.e. model 95), you can use - "ibmmcascsi=display" to enable a drive activity indicator. - -2) 3c523 - drivers/net/3c523.c - drivers/net/3c523.h - 3Com 3c523 Etherlink/MC ethernet driver. - -3) SMC Ultra/MCA and IBM Adapter/A - drivers/net/smc-mca.c - drivers/net/smc-mca.h - Driver for the MCA version of the SMC Ultra and various other - OEM'ed and work-alike cards (Elite, Adapter/A, etc). - -4) NE/2 - driver/net/ne2.c - driver/net/ne2.h - The NE/2 is the MCA version of the NE2000. This may not work - with clones that have a different adapter id than the original - NE/2. - -5) Future Domain MCS-600/700, OEM'd IBM Fast SCSI Adapter/A and - Reply Sound Blaster/SCSI (SCSI part) - Better support for these cards than the driver for ISA. - Supports multiple cards with IRQ sharing. - -Also added boot time option of scsi-probe, which can do reordering of -SCSI host adapters. This will direct the kernel on the order which -SCSI adapter should be detected. Example: - scsi-probe=ibmmca,fd_mcs,adaptec1542,buslogic - -The serial drivers were modified to support the extended IO port range -of the typical MCA system (also #ifdef CONFIG_MCA). - -The following devices work with existing drivers: -1) Token-ring -2) Future Domain SCSI (MCS-600, MCS-700, not MCS-350, OEM'ed IBM SCSI) -3) Adaptec 1640 SCSI (using the aha1542 driver) -4) Bustek/Buslogic SCSI (various) -5) Probably all Arcnet cards. -6) Some, possibly all, MCA IDE controllers. -7) 3Com 3c529 (MCA version of 3c509) (patched) - -8) Intel EtherExpressMC (patched version) - You need to have CONFIG_MCA defined to have EtherExpressMC support. 
-9) Reply Sound Blaster/SCSI (SB part) (patched version) - -Bugs & Other Weirdness -====================== - -NMIs tend to occur with MCA machines because of various hardware -weirdness, bus timeouts, and many other non-critical things. Some basic -code to handle them (inspired by the NetBSD MCA code) has been added to -detect the guilty device, but it's pretty incomplete. If NMIs are a -persistent problem (on some model 70 or 80s, they occur every couple -shell commands), the CONFIG_IGNORE_NMI flag will take care of that. - -Various Pentium machines have had serious problems with the FPU test in -bugs.h. Basically, the machine hangs after the HLT test. This occurs, -as far as we know, on the Pentium-equipped 85s, 95s, and some PC Servers. -The PCI/MCA PC 750s are fine as far as I can tell. The ``mca-pentium'' -boot-prompt flag will disable the FPU bug check if this is a problem -with your machine. - -The model 80 has a raft of problems that are just too weird and unique -to get into here. Some people have no trouble while others have nothing -but problems. I'd suspect some problems are related to the age of the -average 80 and accompanying hardware deterioration, although others -are definitely design problems with the hardware. Among the problems -include SCSI controller problems, ESDI controller problems, and serious -screw-ups in the floppy controller. Oh, and the parallel port is also -pretty flaky. There were about 5 or 6 different model 80 motherboards -produced to fix various obscure problems. As far as I know, it's pretty -much impossible to tell which bugs a particular model 80 has (other than -triggering them, that is). - -Drivers are required for some MCA memory adapters. If you're suddenly -short a few megs of RAM, this might be the reason. The (I think) Enhanced -Memory Adapter commonly found on the model 70 is one. There's a very -alpha driver floating around, but it's pretty ugly (disassembled from -the DOS driver, actually). 
See the MCA Linux web page (URL below) -for more current memory info. - -The Thinkpad 700 and 720 will work, but various components are either -non-functional, flaky, or we don't know anything about them. The -graphics controller is supposed to be some WD, but we can't get things -working properly. The PCMCIA slots don't seem to work. Ditto for APM. -The serial ports work, but detection seems to be flaky. - -Credits -======= -A whole pile of people have contributed to the MCA code. I'd include -their names here, but I don't have a list handy. Check the MCA Linux -home page (URL below) for a perpetually out-of-date list. - -===================================================================== -MCA Linux Home Page: http://www.dgmicro.com/mca/ - -Christophe Beauregard -chrisb@truespectra.com -cpbeaure@calum.csclub.uwaterloo.ca - -===================================================================== -Appendix A: Sample /proc/mca - -This is from my model 8595. Slot 1 contains the standard IBM SCSI -adapter, slot 3 is an Adaptec AHA-1640, slot 5 is a XGA-1 video adapter, -and slot 7 is the 3c523 Etherlink/MC. 
- -/proc/mca/machine: -Model Id: 0xf8 -Submodel Id: 0x14 -BIOS Revision: 0x5 - -/proc/mca/pos: -Slot 1: ff 8e f1 fc a0 ff ff ff IBM SCSI Adapter w/Cache -Slot 2: ff ff ff ff ff ff ff ff -Slot 3: 1f 0f 81 3b bf b6 ff ff -Slot 4: ff ff ff ff ff ff ff ff -Slot 5: db 8f 1d 5e fd c0 00 00 -Slot 6: ff ff ff ff ff ff ff ff -Slot 7: 42 60 ff 08 ff ff ff ff 3Com 3c523 Etherlink/MC -Slot 8: ff ff ff ff ff ff ff ff -Video : ff ff ff ff ff ff ff ff -SCSI : ff ff ff ff ff ff ff ff - -/proc/mca/slot1: -Slot: 1 -Adapter Name: IBM SCSI Adapter w/Cache -Id: 8eff -Enabled: Yes -POS: ff 8e f1 fc a0 ff ff ff -Subsystem PUN: 7 -Detected at boot: Yes - -/proc/mca/slot3: -Slot: 3 -Adapter Name: Unknown -Id: 0f1f -Enabled: Yes -POS: 1f 0f 81 3b bf b6 ff ff - -/proc/mca/slot5: -Slot: 5 -Adapter Name: Unknown -Id: 8fdb -Enabled: Yes -POS: db 8f 1d 5e fd c0 00 00 - -/proc/mca/slot7: -Slot: 7 -Adapter Name: 3Com 3c523 Etherlink/MC -Id: 6042 -Enabled: Yes -POS: 42 60 ff 08 ff ff ff ff -Revision: 0xe -IRQ: 9 -IO Address: 0x3300-0x3308 -Memory: 0xd8000-0xdbfff -Transceiver: External -Device: eth0 -Hardware Address: 02 60 8c 45 c4 2a diff --git a/MAINTAINERS b/MAINTAINERS index 490dd6e640ac..9fa728b53d4e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3316,12 +3316,6 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux.git S: Maintained F: arch/ia64/ -IBM MCA SCSI SUBSYSTEM DRIVER -M: Michael Lang -W: http://www.uni-mainz.de/~langm000/linux.html -S: Maintained -F: drivers/scsi/ibmmca.c - IBM Power Linux RAID adapter M: Brian King S: Supported @@ -4418,13 +4412,6 @@ T: git git://git.monstr.eu/linux-2.6-microblaze.git S: Supported F: arch/microblaze/ -MICROCHANNEL ARCHITECTURE (MCA) -M: James Bottomley -S: Maintained -F: Documentation/mca.txt -F: drivers/mca/ -F: include/linux/mca* - MICROTEK X6 SCANNER M: Oliver Neukum S: Maintained diff --git a/arch/frv/include/asm/processor.h b/arch/frv/include/asm/processor.h index 81c2e271d620..4a53811cd4cd 100644 --- 
a/arch/frv/include/asm/processor.h +++ b/arch/frv/include/asm/processor.h @@ -54,7 +54,6 @@ extern struct cpuinfo_frv __nongprelbss boot_cpu_data; * Bus types */ #define EISA_bus 0 -#define MCA_bus 0 struct thread_struct { struct pt_regs *frame; /* [GR28] exception frame ptr for this thread */ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c9866b0b77d8..9137057152c3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2023,16 +2023,6 @@ config EISA source "drivers/eisa/Kconfig" -config MCA - bool "MCA support" - ---help--- - MicroChannel Architecture is found in some IBM PS/2 machines and - laptops. It is a bus system similar to PCI or ISA. See - (and especially the web page given - there) before attempting to build an MCA bus kernel. - -source "drivers/mca/Kconfig" - config SCx200 tristate "NatSemi SCx200 support" ---help--- diff --git a/arch/x86/include/asm/mca.h b/arch/x86/include/asm/mca.h deleted file mode 100644 index eedbb6cc1efb..000000000000 --- a/arch/x86/include/asm/mca.h +++ /dev/null @@ -1,43 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8 -*- */ - -/* Platform specific MCA defines */ -#ifndef _ASM_X86_MCA_H -#define _ASM_X86_MCA_H - -/* Maximal number of MCA slots - actually, some machines have less, but - * they all have sufficient number of POS registers to cover 8. - */ -#define MCA_MAX_SLOT_NR 8 - -/* Most machines have only one MCA bus. 
The only multiple bus machines - * I know have at most two */ -#define MAX_MCA_BUSSES 2 - -#define MCA_PRIMARY_BUS 0 -#define MCA_SECONDARY_BUS 1 - -/* Dummy slot numbers on primary MCA for integrated functions */ -#define MCA_INTEGSCSI (MCA_MAX_SLOT_NR) -#define MCA_INTEGVIDEO (MCA_MAX_SLOT_NR+1) -#define MCA_MOTHERBOARD (MCA_MAX_SLOT_NR+2) - -/* Dummy POS values for integrated functions */ -#define MCA_DUMMY_POS_START 0x10000 -#define MCA_INTEGSCSI_POS (MCA_DUMMY_POS_START+1) -#define MCA_INTEGVIDEO_POS (MCA_DUMMY_POS_START+2) -#define MCA_MOTHERBOARD_POS (MCA_DUMMY_POS_START+3) - -/* MCA registers */ - -#define MCA_MOTHERBOARD_SETUP_REG 0x94 -#define MCA_ADAPTER_SETUP_REG 0x96 -#define MCA_POS_REG(n) (0x100+(n)) - -#define MCA_ENABLED 0x01 /* POS 2, set if adapter enabled */ - -/* Max number of adapters, including both slots and various integrated - * things. - */ -#define MCA_NUMADAPTERS (MCA_MAX_SLOT_NR+3) - -#endif /* _ASM_X86_MCA_H */ diff --git a/arch/x86/include/asm/mca_dma.h b/arch/x86/include/asm/mca_dma.h deleted file mode 100644 index 45271aef82dd..000000000000 --- a/arch/x86/include/asm/mca_dma.h +++ /dev/null @@ -1,201 +0,0 @@ -#ifndef _ASM_X86_MCA_DMA_H -#define _ASM_X86_MCA_DMA_H - -#include -#include - -/* - * Microchannel specific DMA stuff. DMA on an MCA machine is fairly similar to - * standard PC dma, but it certainly has its quirks. DMA register addresses - * are in a different place and there are some added functions. Most of this - * should be pretty obvious on inspection. Note that the user must divide - * count by 2 when using 16-bit dma; that is not handled by these functions. - * - * Ramen Noodles are yummy. 
- * - * 1998 Tymm Twillman - */ - -/* - * Registers that are used by the DMA controller; FN is the function register - * (tell the controller what to do) and EXE is the execution register (how - * to do it) - */ - -#define MCA_DMA_REG_FN 0x18 -#define MCA_DMA_REG_EXE 0x1A - -/* - * Functions that the DMA controller can do - */ - -#define MCA_DMA_FN_SET_IO 0x00 -#define MCA_DMA_FN_SET_ADDR 0x20 -#define MCA_DMA_FN_GET_ADDR 0x30 -#define MCA_DMA_FN_SET_COUNT 0x40 -#define MCA_DMA_FN_GET_COUNT 0x50 -#define MCA_DMA_FN_GET_STATUS 0x60 -#define MCA_DMA_FN_SET_MODE 0x70 -#define MCA_DMA_FN_SET_ARBUS 0x80 -#define MCA_DMA_FN_MASK 0x90 -#define MCA_DMA_FN_RESET_MASK 0xA0 -#define MCA_DMA_FN_MASTER_CLEAR 0xD0 - -/* - * Modes (used by setting MCA_DMA_FN_MODE in the function register) - * - * Note that the MODE_READ is read from memory (write to device), and - * MODE_WRITE is vice-versa. - */ - -#define MCA_DMA_MODE_XFER 0x04 /* read by default */ -#define MCA_DMA_MODE_READ 0x04 /* same as XFER */ -#define MCA_DMA_MODE_WRITE 0x08 /* OR with MODE_XFER to use */ -#define MCA_DMA_MODE_IO 0x01 /* DMA from IO register */ -#define MCA_DMA_MODE_16 0x40 /* 16 bit xfers */ - - -/** - * mca_enable_dma - channel to enable DMA on - * @dmanr: DMA channel - * - * Enable the MCA bus DMA on a channel. This can be called from - * IRQ context. - */ - -static inline void mca_enable_dma(unsigned int dmanr) -{ - outb(MCA_DMA_FN_RESET_MASK | dmanr, MCA_DMA_REG_FN); -} - -/** - * mca_disble_dma - channel to disable DMA on - * @dmanr: DMA channel - * - * Enable the MCA bus DMA on a channel. This can be called from - * IRQ context. - */ - -static inline void mca_disable_dma(unsigned int dmanr) -{ - outb(MCA_DMA_FN_MASK | dmanr, MCA_DMA_REG_FN); -} - -/** - * mca_set_dma_addr - load a 24bit DMA address - * @dmanr: DMA channel - * @a: 24bit bus address - * - * Load the address register in the DMA controller. This has a 24bit - * limitation (16Mb). 
- */ - -static inline void mca_set_dma_addr(unsigned int dmanr, unsigned int a) -{ - outb(MCA_DMA_FN_SET_ADDR | dmanr, MCA_DMA_REG_FN); - outb(a & 0xff, MCA_DMA_REG_EXE); - outb((a >> 8) & 0xff, MCA_DMA_REG_EXE); - outb((a >> 16) & 0xff, MCA_DMA_REG_EXE); -} - -/** - * mca_get_dma_addr - load a 24bit DMA address - * @dmanr: DMA channel - * - * Read the address register in the DMA controller. This has a 24bit - * limitation (16Mb). The return is a bus address. - */ - -static inline unsigned int mca_get_dma_addr(unsigned int dmanr) -{ - unsigned int addr; - - outb(MCA_DMA_FN_GET_ADDR | dmanr, MCA_DMA_REG_FN); - addr = inb(MCA_DMA_REG_EXE); - addr |= inb(MCA_DMA_REG_EXE) << 8; - addr |= inb(MCA_DMA_REG_EXE) << 16; - - return addr; -} - -/** - * mca_set_dma_count - load a 16bit transfer count - * @dmanr: DMA channel - * @count: count - * - * Set the DMA count for this channel. This can be up to 64Kbytes. - * Setting a count of zero will not do what you expect. - */ - -static inline void mca_set_dma_count(unsigned int dmanr, unsigned int count) -{ - count--; /* transfers one more than count -- correct for this */ - - outb(MCA_DMA_FN_SET_COUNT | dmanr, MCA_DMA_REG_FN); - outb(count & 0xff, MCA_DMA_REG_EXE); - outb((count >> 8) & 0xff, MCA_DMA_REG_EXE); -} - -/** - * mca_get_dma_residue - get the remaining bytes to transfer - * @dmanr: DMA channel - * - * This function returns the number of bytes left to transfer - * on this DMA channel. - */ - -static inline unsigned int mca_get_dma_residue(unsigned int dmanr) -{ - unsigned short count; - - outb(MCA_DMA_FN_GET_COUNT | dmanr, MCA_DMA_REG_FN); - count = 1 + inb(MCA_DMA_REG_EXE); - count += inb(MCA_DMA_REG_EXE) << 8; - - return count; -} - -/** - * mca_set_dma_io - set the port for an I/O transfer - * @dmanr: DMA channel - * @io_addr: an I/O port number - * - * Unlike the ISA bus DMA controllers the DMA on MCA bus can transfer - * with an I/O port target. 
- */ - -static inline void mca_set_dma_io(unsigned int dmanr, unsigned int io_addr) -{ - /* - * DMA from a port address -- set the io address - */ - - outb(MCA_DMA_FN_SET_IO | dmanr, MCA_DMA_REG_FN); - outb(io_addr & 0xff, MCA_DMA_REG_EXE); - outb((io_addr >> 8) & 0xff, MCA_DMA_REG_EXE); -} - -/** - * mca_set_dma_mode - set the DMA mode - * @dmanr: DMA channel - * @mode: mode to set - * - * The DMA controller supports several modes. The mode values you can - * set are- - * - * %MCA_DMA_MODE_READ when reading from the DMA device. - * - * %MCA_DMA_MODE_WRITE to writing to the DMA device. - * - * %MCA_DMA_MODE_IO to do DMA to or from an I/O port. - * - * %MCA_DMA_MODE_16 to do 16bit transfers. - */ - -static inline void mca_set_dma_mode(unsigned int dmanr, unsigned int mode) -{ - outb(MCA_DMA_FN_SET_MODE | dmanr, MCA_DMA_REG_FN); - outb(mode, MCA_DMA_REG_EXE); -} - -#endif /* _ASM_X86_MCA_DMA_H */ diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 9c7d95f6174b..3e2f42a4b872 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -40,7 +40,7 @@ extern int quad_local_to_mp_bus_id [NR_CPUS/4][4]; #endif /* CONFIG_X86_64 */ -#if defined(CONFIG_MCA) || defined(CONFIG_EISA) +#ifdef CONFIG_EISA extern int mp_bus_id_to_type[MAX_MP_BUSSES]; #endif diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h index c0a955a9a087..b31f8c098271 100644 --- a/arch/x86/include/asm/mpspec_def.h +++ b/arch/x86/include/asm/mpspec_def.h @@ -84,7 +84,7 @@ struct mpc_bus { #define BUSTYPE_EISA "EISA" #define BUSTYPE_ISA "ISA" #define BUSTYPE_INTERN "INTERN" /* Internal BUS */ -#define BUSTYPE_MCA "MCA" +#define BUSTYPE_MCA "MCA" /* Obsolete */ #define BUSTYPE_VL "VL" /* Local bus */ #define BUSTYPE_PCI "PCI" #define BUSTYPE_PCMCIA "PCMCIA" @@ -169,6 +169,5 @@ enum mp_bustype { MP_BUS_ISA = 1, MP_BUS_EISA, MP_BUS_PCI, - MP_BUS_MCA, }; #endif /* _ASM_X86_MPSPEC_DEF_H */ diff --git a/arch/x86/kernel/Makefile 
b/arch/x86/kernel/Makefile index 532d2e090e6f..7d1569947204 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -49,7 +49,6 @@ obj-y += cpu/ obj-y += acpi/ obj-y += reboot.o obj-$(CONFIG_X86_32) += reboot_32.o -obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_PCI) += early-quirks.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a415b1f44365..f564b189de1a 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -990,7 +990,7 @@ void __init mp_config_acpi_legacy_irqs(void) int i; struct mpc_intsrc mp_irq; -#if defined (CONFIG_MCA) || defined (CONFIG_EISA) +#ifdef CONFIG_EISA /* * Fabricate the legacy ISA bus (bus #31). */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e88300d8e80a..675e9045a3c5 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -142,7 +142,7 @@ int mp_irq_entries; /* GSI interrupts */ static int nr_irqs_gsi = NR_IRQS_LEGACY; -#if defined (CONFIG_MCA) || defined (CONFIG_EISA) +#ifdef CONFIG_EISA int mp_bus_id_to_type[MAX_MP_BUSSES]; #endif @@ -875,7 +875,7 @@ static int __init find_isa_irq_apic(int irq, int type) return -1; } -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +#ifdef CONFIG_EISA /* * EISA Edge/Level control register, ELCR */ @@ -912,12 +912,6 @@ static int EISA_ELCR(unsigned int irq) #define default_PCI_trigger(idx) (1) #define default_PCI_polarity(idx) (1) -/* MCA interrupts are always polarity zero level triggered, - * when listed as conforming in the MP table. 
*/ - -#define default_MCA_trigger(idx) (1) -#define default_MCA_polarity(idx) default_ISA_polarity(idx) - static int irq_polarity(int idx) { int bus = mp_irqs[idx].srcbus; @@ -975,7 +969,7 @@ static int irq_trigger(int idx) trigger = default_ISA_trigger(idx); else trigger = default_PCI_trigger(idx); -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +#ifdef CONFIG_EISA switch (mp_bus_id_to_type[bus]) { case MP_BUS_ISA: /* ISA pin */ { @@ -992,11 +986,6 @@ static int irq_trigger(int idx) /* set before the switch */ break; } - case MP_BUS_MCA: /* MCA pin */ - { - trigger = default_MCA_trigger(idx); - break; - } default: { printk(KERN_WARNING "broken BIOS!!\n"); diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c deleted file mode 100644 index 7eb1e2b97827..000000000000 --- a/arch/x86/kernel/mca_32.c +++ /dev/null @@ -1,476 +0,0 @@ -/* - * Written by Martin Kolinek, February 1996 - * - * Changes: - * - * Chris Beauregard July 28th, 1996 - * - Fixed up integrated SCSI detection - * - * Chris Beauregard August 3rd, 1996 - * - Made mca_info local - * - Made integrated registers accessible through standard function calls - * - Added name field - * - More sanity checking - * - * Chris Beauregard August 9th, 1996 - * - Rewrote /proc/mca - * - * Chris Beauregard January 7th, 1997 - * - Added basic NMI-processing - * - Added more information to mca_info structure - * - * David Weinehall October 12th, 1998 - * - Made a lot of cleaning up in the source - * - Added use of save_flags / restore_flags - * - Added the 'driver_loaded' flag in MCA_adapter - * - Added an alternative implemention of ZP Gu's mca_find_unused_adapter - * - * David Weinehall March 24th, 1999 - * - Fixed the output of 'Driver Installed' in /proc/mca/pos - * - Made the Integrated Video & SCSI show up even if they have id 0000 - * - * Alexander Viro November 9th, 1999 - * - Switched to regular procfs methods - * - * Alfred Arnold & David Weinehall August 23rd, 2000 - * - Added support for Planar 
POS-registers - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static unsigned char which_scsi; - -int MCA_bus; -EXPORT_SYMBOL(MCA_bus); - -/* - * Motherboard register spinlock. Untested on SMP at the moment, but - * are there any MCA SMP boxes? - * - * Yes - Alan - */ -static DEFINE_SPINLOCK(mca_lock); - -/* Build the status info for the adapter */ - -static void mca_configure_adapter_status(struct mca_device *mca_dev) -{ - mca_dev->status = MCA_ADAPTER_NONE; - - mca_dev->pos_id = mca_dev->pos[0] - + (mca_dev->pos[1] << 8); - - if (!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) { - - /* - * id = 0x0000 usually indicates hardware failure, - * however, ZP Gu (zpg@castle.net> reports that his 9556 - * has 0x0000 as id and everything still works. There - * also seem to be an adapter with id = 0x0000; the - * NCR Parallel Bus Memory Card. Until this is confirmed, - * however, this code will stay. - */ - - mca_dev->status = MCA_ADAPTER_ERROR; - - return; - } else if (mca_dev->pos_id != 0xffff) { - - /* - * 0xffff usually indicates that there's no adapter, - * however, some integrated adapters may have 0xffff as - * their id and still be valid. Examples are on-board - * VGA of the 55sx, the integrated SCSI of the 56 & 57, - * and possibly also the 95 ULTIMEDIA. 
- */ - - mca_dev->status = MCA_ADAPTER_NORMAL; - } - - if ((mca_dev->pos_id == 0xffff || - mca_dev->pos_id == 0x0000) && mca_dev->slot >= MCA_MAX_SLOT_NR) { - int j; - - for (j = 2; j < 8; j++) { - if (mca_dev->pos[j] != 0xff) { - mca_dev->status = MCA_ADAPTER_NORMAL; - break; - } - } - } - - if (!(mca_dev->pos[2] & MCA_ENABLED)) { - - /* enabled bit is in POS 2 */ - - mca_dev->status = MCA_ADAPTER_DISABLED; - } -} /* mca_configure_adapter_status */ - -/*--------------------------------------------------------------------*/ - -static struct resource mca_standard_resources[] = { - { .start = 0x60, .end = 0x60, .name = "system control port B (MCA)" }, - { .start = 0x90, .end = 0x90, .name = "arbitration (MCA)" }, - { .start = 0x91, .end = 0x91, .name = "card Select Feedback (MCA)" }, - { .start = 0x92, .end = 0x92, .name = "system Control port A (MCA)" }, - { .start = 0x94, .end = 0x94, .name = "system board setup (MCA)" }, - { .start = 0x96, .end = 0x97, .name = "POS (MCA)" }, - { .start = 0x100, .end = 0x107, .name = "POS (MCA)" } -}; - -#define MCA_STANDARD_RESOURCES ARRAY_SIZE(mca_standard_resources) - -/* - * mca_read_and_store_pos - read the POS registers into a memory buffer - * @pos: a char pointer to 8 bytes, contains the POS register value on - * successful return - * - * Returns 1 if a card actually exists (i.e. the pos isn't - * all 0xff) or 0 otherwise - */ -static int mca_read_and_store_pos(unsigned char *pos) -{ - int j; - int found = 0; - - for (j = 0; j < 8; j++) { - pos[j] = inb_p(MCA_POS_REG(j)); - if (pos[j] != 0xff) { - /* 0xff all across means no device. 0x00 means - * something's broken, but a device is - * probably there. However, if you get 0x00 - * from a motherboard register it won't matter - * what we find. For the record, on the - * 57SLC, the integrated SCSI adapter has - * 0xffff for the adapter ID, but nonzero for - * other registers. 
*/ - - found = 1; - } - } - return found; -} - -static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg) -{ - unsigned char byte; - unsigned long flags; - - if (reg < 0 || reg >= 8) - return 0; - - spin_lock_irqsave(&mca_lock, flags); - if (mca_dev->pos_register) { - /* Disable adapter setup, enable motherboard setup */ - - outb_p(0, MCA_ADAPTER_SETUP_REG); - outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG); - - byte = inb_p(MCA_POS_REG(reg)); - outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); - } else { - - /* Make sure motherboard setup is off */ - - outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); - - /* Read the appropriate register */ - - outb_p(0x8|(mca_dev->slot & 0xf), MCA_ADAPTER_SETUP_REG); - byte = inb_p(MCA_POS_REG(reg)); - outb_p(0, MCA_ADAPTER_SETUP_REG); - } - spin_unlock_irqrestore(&mca_lock, flags); - - mca_dev->pos[reg] = byte; - - return byte; -} - -static void mca_pc_write_pos(struct mca_device *mca_dev, int reg, - unsigned char byte) -{ - unsigned long flags; - - if (reg < 0 || reg >= 8) - return; - - spin_lock_irqsave(&mca_lock, flags); - - /* Make sure motherboard setup is off */ - - outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); - - /* Read in the appropriate register */ - - outb_p(0x8|(mca_dev->slot&0xf), MCA_ADAPTER_SETUP_REG); - outb_p(byte, MCA_POS_REG(reg)); - outb_p(0, MCA_ADAPTER_SETUP_REG); - - spin_unlock_irqrestore(&mca_lock, flags); - - /* Update the global register list, while we have the byte */ - - mca_dev->pos[reg] = byte; - -} - -/* for the primary MCA bus, we have identity transforms */ -static int mca_dummy_transform_irq(struct mca_device *mca_dev, int irq) -{ - return irq; -} - -static int mca_dummy_transform_ioport(struct mca_device *mca_dev, int port) -{ - return port; -} - -static void *mca_dummy_transform_memory(struct mca_device *mca_dev, void *mem) -{ - return mem; -} - - -static int __init mca_init(void) -{ - unsigned int i, j; - struct mca_device *mca_dev; - unsigned char pos[8]; - short mca_builtin_scsi_ports[] 
= {0xf7, 0xfd, 0x00}; - struct mca_bus *bus; - - /* - * WARNING: Be careful when making changes here. Putting an adapter - * and the motherboard simultaneously into setup mode may result in - * damage to chips (according to The Indispensable PC Hardware Book - * by Hans-Peter Messmer). Also, we disable system interrupts (so - * that we are not disturbed in the middle of this). - */ - - /* Make sure the MCA bus is present */ - - if (mca_system_init()) { - printk(KERN_ERR "MCA bus system initialisation failed\n"); - return -ENODEV; - } - - if (!MCA_bus) - return -ENODEV; - - printk(KERN_INFO "Micro Channel bus detected.\n"); - - /* All MCA systems have at least a primary bus */ - bus = mca_attach_bus(MCA_PRIMARY_BUS); - if (!bus) - goto out_nomem; - bus->default_dma_mask = 0xffffffffLL; - bus->f.mca_write_pos = mca_pc_write_pos; - bus->f.mca_read_pos = mca_pc_read_pos; - bus->f.mca_transform_irq = mca_dummy_transform_irq; - bus->f.mca_transform_ioport = mca_dummy_transform_ioport; - bus->f.mca_transform_memory = mca_dummy_transform_memory; - - /* get the motherboard device */ - mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL); - if (unlikely(!mca_dev)) - goto out_nomem; - - /* - * We do not expect many MCA interrupts during initialization, - * but let us be safe: - */ - spin_lock_irq(&mca_lock); - - /* Make sure adapter setup is off */ - - outb_p(0, MCA_ADAPTER_SETUP_REG); - - /* Read motherboard POS registers */ - - mca_dev->pos_register = 0x7f; - outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG); - mca_dev->name[0] = 0; - mca_read_and_store_pos(mca_dev->pos); - mca_configure_adapter_status(mca_dev); - /* fake POS and slot for a motherboard */ - mca_dev->pos_id = MCA_MOTHERBOARD_POS; - mca_dev->slot = MCA_MOTHERBOARD; - mca_register_device(MCA_PRIMARY_BUS, mca_dev); - - mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); - if (unlikely(!mca_dev)) - goto out_unlock_nomem; - - /* Put motherboard into video setup mode, read integrated video - * POS 
registers, and turn motherboard setup off. - */ - - mca_dev->pos_register = 0xdf; - outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG); - mca_dev->name[0] = 0; - mca_read_and_store_pos(mca_dev->pos); - mca_configure_adapter_status(mca_dev); - /* fake POS and slot for the integrated video */ - mca_dev->pos_id = MCA_INTEGVIDEO_POS; - mca_dev->slot = MCA_INTEGVIDEO; - mca_register_device(MCA_PRIMARY_BUS, mca_dev); - - /* - * Put motherboard into scsi setup mode, read integrated scsi - * POS registers, and turn motherboard setup off. - * - * It seems there are two possible SCSI registers. Martin says that - * for the 56,57, 0xf7 is the one, but fails on the 76. - * Alfredo (apena@vnet.ibm.com) says - * 0xfd works on his machine. We'll try both of them. I figure it's - * a good bet that only one could be valid at a time. This could - * screw up though if one is used for something else on the other - * machine. - */ - - for (i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) { - outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG); - if (mca_read_and_store_pos(pos)) - break; - } - if (which_scsi) { - /* found a scsi card */ - mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); - if (unlikely(!mca_dev)) - goto out_unlock_nomem; - - for (j = 0; j < 8; j++) - mca_dev->pos[j] = pos[j]; - - mca_configure_adapter_status(mca_dev); - /* fake POS and slot for integrated SCSI controller */ - mca_dev->pos_id = MCA_INTEGSCSI_POS; - mca_dev->slot = MCA_INTEGSCSI; - mca_dev->pos_register = which_scsi; - mca_register_device(MCA_PRIMARY_BUS, mca_dev); - } - - /* Turn off motherboard setup */ - - outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); - - /* - * Now loop over MCA slots: put each adapter into setup mode, and - * read its POS registers. Then put adapter setup off. 
- */ - - for (i = 0; i < MCA_MAX_SLOT_NR; i++) { - outb_p(0x8|(i&0xf), MCA_ADAPTER_SETUP_REG); - if (!mca_read_and_store_pos(pos)) - continue; - - mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); - if (unlikely(!mca_dev)) - goto out_unlock_nomem; - - for (j = 0; j < 8; j++) - mca_dev->pos[j] = pos[j]; - - mca_dev->driver_loaded = 0; - mca_dev->slot = i; - mca_dev->pos_register = 0; - mca_configure_adapter_status(mca_dev); - mca_register_device(MCA_PRIMARY_BUS, mca_dev); - } - outb_p(0, MCA_ADAPTER_SETUP_REG); - - /* Enable interrupts and return memory start */ - spin_unlock_irq(&mca_lock); - - for (i = 0; i < MCA_STANDARD_RESOURCES; i++) - request_resource(&ioport_resource, mca_standard_resources + i); - - mca_do_proc_init(); - - return 0; - - out_unlock_nomem: - spin_unlock_irq(&mca_lock); - out_nomem: - printk(KERN_EMERG "Failed memory allocation in MCA setup!\n"); - return -ENOMEM; -} - -subsys_initcall(mca_init); - -/*--------------------------------------------------------------------*/ - -static __kprobes void -mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag) -{ - int slot = mca_dev->slot; - - if (slot == MCA_INTEGSCSI) { - printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n", - mca_dev->name); - } else if (slot == MCA_INTEGVIDEO) { - printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n", - mca_dev->name); - } else if (slot == MCA_MOTHERBOARD) { - printk(KERN_CRIT "NMI: caused by motherboard (%s)\n", - mca_dev->name); - } - - /* More info available in POS 6 and 7? 
*/ - - if (check_flag) { - unsigned char pos6, pos7; - - pos6 = mca_device_read_pos(mca_dev, 6); - pos7 = mca_device_read_pos(mca_dev, 7); - - printk(KERN_CRIT "NMI: POS 6 = 0x%x, POS 7 = 0x%x\n", pos6, pos7); - } - -} /* mca_handle_nmi_slot */ - -/*--------------------------------------------------------------------*/ - -static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data) -{ - struct mca_device *mca_dev = to_mca_device(dev); - unsigned char pos5; - - pos5 = mca_device_read_pos(mca_dev, 5); - - if (!(pos5 & 0x80)) { - /* - * Bit 7 of POS 5 is reset when this adapter has a hardware - * error. Bit 7 it reset if there's error information - * available in POS 6 and 7. - */ - mca_handle_nmi_device(mca_dev, !(pos5 & 0x40)); - return 1; - } - return 0; -} - -void __kprobes mca_handle_nmi(void) -{ - /* - * First try - scan the various adapters and see if a specific - * adapter was responsible for the error. - */ - bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback); -} diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index ca470e4c92dc..b02d4dd6b8a3 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -97,7 +97,7 @@ static void __init MP_bus_info(struct mpc_bus *m) set_bit(m->busid, mp_bus_not_pci); if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +#ifdef CONFIG_EISA mp_bus_id_to_type[m->busid] = MP_BUS_ISA; #endif } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { @@ -105,12 +105,10 @@ static void __init MP_bus_info(struct mpc_bus *m) x86_init.mpparse.mpc_oem_pci_bus(m); clear_bit(m->busid, mp_bus_not_pci); -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +#ifdef CONFIG_EISA mp_bus_id_to_type[m->busid] = MP_BUS_PCI; } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { mp_bus_id_to_type[m->busid] = MP_BUS_EISA; - } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) { - 
mp_bus_id_to_type[m->busid] = MP_BUS_MCA; #endif } else printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); @@ -368,9 +366,6 @@ static void __init construct_ioapic_table(int mpc_default_type) case 3: memcpy(bus.bustype, "EISA ", 6); break; - case 4: - case 7: - memcpy(bus.bustype, "MCA ", 6); } MP_bus_info(&bus); if (mpc_default_type > 4) { @@ -623,7 +618,7 @@ void __init default_find_smp_config(void) return; /* * If it is an SMP machine we should know now, unless the - * configuration is in an EISA/MCA bus machine with an + * configuration is in an EISA bus machine with an * extended bios data area. * * there is a real-mode segmented pointer pointing to the diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 47acaf319165..7b3fdfdabf94 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -19,8 +19,6 @@ #include #include -#include - #if defined(CONFIG_EDAC) #include #endif @@ -282,16 +280,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) __this_cpu_add(nmi_stats.unknown, 1); -#ifdef CONFIG_MCA - /* - * Might actually be able to figure out what the guilty party - * is: - */ - if (MCA_bus) { - mca_handle_nmi(); - return; - } -#endif pr_emerg("Uhhuh. 
NMI received for unknown reason %02x on CPU %d.\n", reason, smp_processor_id()); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1a2901562059..879166402bf9 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include @@ -179,12 +178,6 @@ struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1}; /* common cpu data for all cpus */ struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1}; EXPORT_SYMBOL(boot_cpu_data); -static void set_mca_bus(int x) -{ -#ifdef CONFIG_MCA - MCA_bus = x; -#endif -} unsigned int def_to_bigsmp; @@ -717,7 +710,6 @@ void __init setup_arch(char **cmdline_p) apm_info.bios = boot_params.apm_bios_info; ist_info = boot_params.ist_info; if (boot_params.sys_desc_table.length != 0) { - set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2); machine_id = boot_params.sys_desc_table.table[0]; machine_submodel_id = boot_params.sys_desc_table.table[1]; BIOS_revision = boot_params.sys_desc_table.table[2]; diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index c6eba2b42673..24d3c91e9812 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include @@ -58,11 +57,6 @@ EXPORT_SYMBOL(profile_pc); static irqreturn_t timer_interrupt(int irq, void *dev_id) { global_clock_event->event_handler(global_clock_event); - - /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ - if (MCA_bus) - outb_p(inb_p(0x61)| 0x80, 0x61); - return IRQ_HANDLED; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ff9281f16029..4754f510b360 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -37,10 +37,6 @@ #include #endif -#ifdef CONFIG_MCA -#include -#endif - #if defined(CONFIG_EDAC) #include #endif diff --git a/drivers/Makefile b/drivers/Makefile index 95952c82bf16..f9b82f2c7c47 100644 --- 
a/drivers/Makefile +++ b/drivers/Makefile @@ -92,7 +92,6 @@ obj-$(CONFIG_BT) += bluetooth/ obj-$(CONFIG_ACCESSIBILITY) += accessibility/ obj-$(CONFIG_ISDN) += isdn/ obj-$(CONFIG_EDAC) += edac/ -obj-$(CONFIG_MCA) += mca/ obj-$(CONFIG_EISA) += eisa/ obj-y += lguest/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ diff --git a/drivers/mca/Kconfig b/drivers/mca/Kconfig deleted file mode 100644 index a7a0220ab4bd..000000000000 --- a/drivers/mca/Kconfig +++ /dev/null @@ -1,14 +0,0 @@ -config MCA_LEGACY - bool "Legacy MCA API Support" - depends on MCA - help - This compiles in support for the old slot based MCA API. If you - have an unconverted MCA driver, you will need to say Y here. It - is safe to say Y anyway. - -config MCA_PROC_FS - bool "Support for the mca entry in /proc" - depends on MCA_LEGACY && PROC_FS - help - If you want the old style /proc/mca directory in addition to the - new style sysfs say Y here. diff --git a/drivers/mca/Makefile b/drivers/mca/Makefile deleted file mode 100644 index 0794b122520e..000000000000 --- a/drivers/mca/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# Makefile for the Linux MCA bus support - -obj-y := mca-bus.o mca-device.o mca-driver.o - -obj-$(CONFIG_MCA_PROC_FS) += mca-proc.o -obj-$(CONFIG_MCA_LEGACY) += mca-legacy.o - diff --git a/drivers/mca/mca-bus.c b/drivers/mca/mca-bus.c deleted file mode 100644 index ada5ebbaa255..000000000000 --- a/drivers/mca/mca-bus.c +++ /dev/null @@ -1,169 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8 -*- */ - -/* - * MCA bus support functions for sysfs. - * - * (C) 2002 James Bottomley - * -**----------------------------------------------------------------------------- -** -** This program is free software; you can redistribute it and/or modify -** it under the terms of the GNU General Public License as published by -** the Free Software Foundation; either version 2 of the License, or -** (at your option) any later version. 
-** -** This program is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -** GNU General Public License for more details. -** -** You should have received a copy of the GNU General Public License -** along with this program; if not, write to the Free Software -** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -** -**----------------------------------------------------------------------------- - */ - -#include -#include -#include -#include -#include -#include - -/* Very few machines have more than one MCA bus. However, there are - * those that do (Voyager 35xx/5xxx), so we do it this way for future - * expansion. None that I know have more than 2 */ -static struct mca_bus *mca_root_busses[MAX_MCA_BUSSES]; - -#define MCA_DEVINFO(i,s) { .pos = i, .name = s } - -struct mca_device_info { - short pos_id; /* the 2 byte pos id for this card */ - char name[50]; -}; - -static int mca_bus_match (struct device *dev, struct device_driver *drv) -{ - struct mca_device *mca_dev = to_mca_device (dev); - struct mca_driver *mca_drv = to_mca_driver (drv); - const unsigned short *mca_ids = mca_drv->id_table; - int i = 0; - - if (mca_ids) { - for(i = 0; mca_ids[i]; i++) { - if (mca_ids[i] == mca_dev->pos_id) { - mca_dev->index = i; - return 1; - } - } - } - /* If the integrated id is present, treat it as though it were an - * additional id in the id_table (it can't be because by definition, - * integrated id's overflow a short */ - if (mca_drv->integrated_id && mca_dev->pos_id == - mca_drv->integrated_id) { - mca_dev->index = i; - return 1; - } - return 0; -} - -struct bus_type mca_bus_type = { - .name = "MCA", - .match = mca_bus_match, -}; -EXPORT_SYMBOL (mca_bus_type); - -static ssize_t mca_show_pos_id(struct device *dev, struct device_attribute *attr, char *buf) -{ - /* four digits, \n and trailing \0 */ - struct mca_device *mca_dev = 
to_mca_device(dev); - int len; - - if(mca_dev->pos_id < MCA_DUMMY_POS_START) - len = sprintf(buf, "%04x\n", mca_dev->pos_id); - else - len = sprintf(buf, "none\n"); - return len; -} -static ssize_t mca_show_pos(struct device *dev, struct device_attribute *attr, char *buf) -{ - /* enough for 8 two byte hex chars plus space and new line */ - int j, len=0; - struct mca_device *mca_dev = to_mca_device(dev); - - for(j=0; j<8; j++) - len += sprintf(buf+len, "%02x ", mca_dev->pos[j]); - /* change last trailing space to new line */ - buf[len-1] = '\n'; - return len; -} - -static DEVICE_ATTR(id, S_IRUGO, mca_show_pos_id, NULL); -static DEVICE_ATTR(pos, S_IRUGO, mca_show_pos, NULL); - -int __init mca_register_device(int bus, struct mca_device *mca_dev) -{ - struct mca_bus *mca_bus = mca_root_busses[bus]; - int rc; - - mca_dev->dev.parent = &mca_bus->dev; - mca_dev->dev.bus = &mca_bus_type; - dev_set_name(&mca_dev->dev, "%02d:%02X", bus, mca_dev->slot); - mca_dev->dma_mask = mca_bus->default_dma_mask; - mca_dev->dev.dma_mask = &mca_dev->dma_mask; - mca_dev->dev.coherent_dma_mask = mca_dev->dma_mask; - - rc = device_register(&mca_dev->dev); - if (rc) - goto err_out; - - rc = device_create_file(&mca_dev->dev, &dev_attr_id); - if (rc) goto err_out_devreg; - rc = device_create_file(&mca_dev->dev, &dev_attr_pos); - if (rc) goto err_out_id; - - return 1; - -err_out_id: - device_remove_file(&mca_dev->dev, &dev_attr_id); -err_out_devreg: - device_unregister(&mca_dev->dev); -err_out: - return 0; -} - -/* */ -struct mca_bus * __devinit mca_attach_bus(int bus) -{ - struct mca_bus *mca_bus; - - if (unlikely(mca_root_busses[bus] != NULL)) { - /* This should never happen, but just in case */ - printk(KERN_EMERG "MCA tried to add already existing bus %d\n", - bus); - dump_stack(); - return NULL; - } - - mca_bus = kzalloc(sizeof(struct mca_bus), GFP_KERNEL); - if (!mca_bus) - return NULL; - - dev_set_name(&mca_bus->dev, "mca%d", bus); - sprintf(mca_bus->name,"Host %s MCA Bridge", bus ? 
"Secondary" : "Primary"); - if (device_register(&mca_bus->dev)) { - kfree(mca_bus); - return NULL; - } - - mca_root_busses[bus] = mca_bus; - - return mca_bus; -} - -int __init mca_system_init (void) -{ - return bus_register(&mca_bus_type); -} diff --git a/drivers/mca/mca-device.c b/drivers/mca/mca-device.c deleted file mode 100644 index e7adf89fae41..000000000000 --- a/drivers/mca/mca-device.c +++ /dev/null @@ -1,218 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8 -*- */ - -/* - * MCA device support functions - * - * These functions support the ongoing device access API. - * - * (C) 2002 James Bottomley - * -**----------------------------------------------------------------------------- -** -** This program is free software; you can redistribute it and/or modify -** it under the terms of the GNU General Public License as published by -** the Free Software Foundation; either version 2 of the License, or -** (at your option) any later version. -** -** This program is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -** GNU General Public License for more details. -** -** You should have received a copy of the GNU General Public License -** along with this program; if not, write to the Free Software -** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -** -**----------------------------------------------------------------------------- - */ - -#include -#include -#include -#include - -/** - * mca_device_read_stored_pos - read POS register from stored data - * @mca_dev: device to read from - * @reg: register to read from - * - * Fetch a POS value that was stored at boot time by the kernel - * when it scanned the MCA space. The register value is returned. - * Missing or invalid registers report 0. 
- */ -unsigned char mca_device_read_stored_pos(struct mca_device *mca_dev, int reg) -{ - if(reg < 0 || reg >= 8) - return 0; - - return mca_dev->pos[reg]; -} -EXPORT_SYMBOL(mca_device_read_stored_pos); - -/** - * mca_device_read_pos - read POS register from card - * @mca_dev: device to read from - * @reg: register to read from - * - * Fetch a POS value directly from the hardware to obtain the - * current value. This is much slower than - * mca_device_read_stored_pos and may not be invoked from - * interrupt context. It handles the deep magic required for - * onboard devices transparently. - */ -unsigned char mca_device_read_pos(struct mca_device *mca_dev, int reg) -{ - struct mca_bus *mca_bus = to_mca_bus(mca_dev->dev.parent); - - return mca_bus->f.mca_read_pos(mca_dev, reg); - - return mca_dev->pos[reg]; -} -EXPORT_SYMBOL(mca_device_read_pos); - - -/** - * mca_device_write_pos - read POS register from card - * @mca_dev: device to write pos register to - * @reg: register to write to - * @byte: byte to write to the POS registers - * - * Store a POS value directly to the hardware. You should not - * normally need to use this function and should have a very good - * knowledge of MCA bus before you do so. Doing this wrongly can - * damage the hardware. - * - * This function may not be used from interrupt context. - * - */ -void mca_device_write_pos(struct mca_device *mca_dev, int reg, - unsigned char byte) -{ - struct mca_bus *mca_bus = to_mca_bus(mca_dev->dev.parent); - - mca_bus->f.mca_write_pos(mca_dev, reg, byte); -} -EXPORT_SYMBOL(mca_device_write_pos); - -/** - * mca_device_transform_irq - transform the ADF obtained IRQ - * @mca_device: device whose irq needs transforming - * @irq: input irq from ADF - * - * MCA Adapter Definition Files (ADF) contain irq, ioport, memory - * etc. definitions. In systems with more than one bus, these need - * to be transformed through bus mapping functions to get the real - * system global quantities. 
- * - * This function transforms the interrupt number and returns the - * transformed system global interrupt - */ -int mca_device_transform_irq(struct mca_device *mca_dev, int irq) -{ - struct mca_bus *mca_bus = to_mca_bus(mca_dev->dev.parent); - - return mca_bus->f.mca_transform_irq(mca_dev, irq); -} -EXPORT_SYMBOL(mca_device_transform_irq); - -/** - * mca_device_transform_ioport - transform the ADF obtained I/O port - * @mca_device: device whose port needs transforming - * @ioport: input I/O port from ADF - * - * MCA Adapter Definition Files (ADF) contain irq, ioport, memory - * etc. definitions. In systems with more than one bus, these need - * to be transformed through bus mapping functions to get the real - * system global quantities. - * - * This function transforms the I/O port number and returns the - * transformed system global port number. - * - * This transformation can be assumed to be linear for port ranges. - */ -int mca_device_transform_ioport(struct mca_device *mca_dev, int port) -{ - struct mca_bus *mca_bus = to_mca_bus(mca_dev->dev.parent); - - return mca_bus->f.mca_transform_ioport(mca_dev, port); -} -EXPORT_SYMBOL(mca_device_transform_ioport); - -/** - * mca_device_transform_memory - transform the ADF obtained memory - * @mca_device: device whose memory region needs transforming - * @mem: memory region start from ADF - * - * MCA Adapter Definition Files (ADF) contain irq, ioport, memory - * etc. definitions. In systems with more than one bus, these need - * to be transformed through bus mapping functions to get the real - * system global quantities. - * - * This function transforms the memory region start and returns the - * transformed system global memory region (physical). - * - * This transformation can be assumed to be linear for region ranges. 
- */ -void *mca_device_transform_memory(struct mca_device *mca_dev, void *mem) -{ - struct mca_bus *mca_bus = to_mca_bus(mca_dev->dev.parent); - - return mca_bus->f.mca_transform_memory(mca_dev, mem); -} -EXPORT_SYMBOL(mca_device_transform_memory); - - -/** - * mca_device_claimed - check if claimed by driver - * @mca_dev: device to check - * - * Returns 1 if the slot has been claimed by a driver - */ - -int mca_device_claimed(struct mca_device *mca_dev) -{ - return mca_dev->driver_loaded; -} -EXPORT_SYMBOL(mca_device_claimed); - -/** - * mca_device_set_claim - set the claim value of the driver - * @mca_dev: device to set value for - * @val: claim value to set (1 claimed, 0 unclaimed) - */ -void mca_device_set_claim(struct mca_device *mca_dev, int val) -{ - mca_dev->driver_loaded = val; -} -EXPORT_SYMBOL(mca_device_set_claim); - -/** - * mca_device_status - get the status of the device - * @mca_device: device to get - * - * returns an enumeration of the device status: - * - * MCA_ADAPTER_NORMAL adapter is OK. - * MCA_ADAPTER_NONE no adapter at device (should never happen). - * MCA_ADAPTER_DISABLED adapter is disabled. - * MCA_ADAPTER_ERROR adapter cannot be initialised. - */ -enum MCA_AdapterStatus mca_device_status(struct mca_device *mca_dev) -{ - return mca_dev->status; -} -EXPORT_SYMBOL(mca_device_status); - -/** - * mca_device_set_name - set the name of the device - * @mca_device: device to set the name of - * @name: name to set - */ -void mca_device_set_name(struct mca_device *mca_dev, const char *name) -{ - if(!mca_dev) - return; - - strlcpy(mca_dev->name, name, sizeof(mca_dev->name)); -} -EXPORT_SYMBOL(mca_device_set_name); diff --git a/drivers/mca/mca-driver.c b/drivers/mca/mca-driver.c deleted file mode 100644 index 32cd39bcc715..000000000000 --- a/drivers/mca/mca-driver.c +++ /dev/null @@ -1,63 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8 -*- */ - -/* - * MCA driver support functions for sysfs. 
- * - * (C) 2002 James Bottomley - * -**----------------------------------------------------------------------------- -** -** This program is free software; you can redistribute it and/or modify -** it under the terms of the GNU General Public License as published by -** the Free Software Foundation; either version 2 of the License, or -** (at your option) any later version. -** -** This program is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -** GNU General Public License for more details. -** -** You should have received a copy of the GNU General Public License -** along with this program; if not, write to the Free Software -** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -** -**----------------------------------------------------------------------------- - */ - -#include -#include -#include - -int mca_register_driver(struct mca_driver *mca_drv) -{ - int r; - - if (MCA_bus) { - mca_drv->driver.bus = &mca_bus_type; - if ((r = driver_register(&mca_drv->driver)) < 0) - return r; - mca_drv->integrated_id = 0; - } - - return 0; -} -EXPORT_SYMBOL(mca_register_driver); - -int mca_register_driver_integrated(struct mca_driver *mca_driver, - int integrated_id) -{ - int r = mca_register_driver(mca_driver); - - if (!r) - mca_driver->integrated_id = integrated_id; - - return r; -} -EXPORT_SYMBOL(mca_register_driver_integrated); - -void mca_unregister_driver(struct mca_driver *mca_drv) -{ - if (MCA_bus) - driver_unregister(&mca_drv->driver); -} -EXPORT_SYMBOL(mca_unregister_driver); diff --git a/drivers/mca/mca-legacy.c b/drivers/mca/mca-legacy.c deleted file mode 100644 index 494f0c2001f5..000000000000 --- a/drivers/mca/mca-legacy.c +++ /dev/null @@ -1,329 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8 -*- */ - -/* - * MCA bus support functions for legacy (2.4) API. 
- * - * Legacy API means the API that operates in terms of MCA slot number - * - * (C) 2002 James Bottomley - * -**----------------------------------------------------------------------------- -** -** This program is free software; you can redistribute it and/or modify -** it under the terms of the GNU General Public License as published by -** the Free Software Foundation; either version 2 of the License, or -** (at your option) any later version. -** -** This program is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -** GNU General Public License for more details. -** -** You should have received a copy of the GNU General Public License -** along with this program; if not, write to the Free Software -** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -** -**----------------------------------------------------------------------------- - */ - -#include -#include -#include -#include - -/* NOTE: This structure is stack allocated */ -struct mca_find_adapter_info { - int id; - int slot; - struct mca_device *mca_dev; -}; - -/* The purpose of this iterator is to loop over all the devices and - * find the one with the smallest slot number that's just greater than - * or equal to the required slot with a matching id */ -static int mca_find_adapter_callback(struct device *dev, void *data) -{ - struct mca_find_adapter_info *info = data; - struct mca_device *mca_dev = to_mca_device(dev); - - if(mca_dev->pos_id != info->id) - return 0; - - if(mca_dev->slot < info->slot) - return 0; - - if(!info->mca_dev || info->mca_dev->slot >= mca_dev->slot) - info->mca_dev = mca_dev; - - return 0; -} - -/** - * mca_find_adapter - scan for adapters - * @id: MCA identification to search for - * @start: starting slot - * - * Search the MCA configuration for adapters matching the 16bit - * ID given. 
The first time it should be called with start as zero - * and then further calls made passing the return value of the - * previous call until %MCA_NOTFOUND is returned. - * - * Disabled adapters are not reported. - */ - -int mca_find_adapter(int id, int start) -{ - struct mca_find_adapter_info info; - - if(id == 0xffff) - return MCA_NOTFOUND; - - info.slot = start; - info.id = id; - info.mca_dev = NULL; - - for(;;) { - bus_for_each_dev(&mca_bus_type, NULL, &info, mca_find_adapter_callback); - - if(info.mca_dev == NULL) - return MCA_NOTFOUND; - - if(info.mca_dev->status != MCA_ADAPTER_DISABLED) - break; - - /* OK, found adapter but it was disabled. Go around - * again, excluding the slot we just found */ - - info.slot = info.mca_dev->slot + 1; - info.mca_dev = NULL; - } - - return info.mca_dev->slot; -} -EXPORT_SYMBOL(mca_find_adapter); - -/*--------------------------------------------------------------------*/ - -/** - * mca_find_unused_adapter - scan for unused adapters - * @id: MCA identification to search for - * @start: starting slot - * - * Search the MCA configuration for adapters matching the 16bit - * ID given. The first time it should be called with start as zero - * and then further calls made passing the return value of the - * previous call until %MCA_NOTFOUND is returned. - * - * Adapters that have been claimed by drivers and those that - * are disabled are not reported. This function thus allows a driver - * to scan for further cards when some may already be driven. 
- */ - -int mca_find_unused_adapter(int id, int start) -{ - struct mca_find_adapter_info info = { 0 }; - - if (!MCA_bus || id == 0xffff) - return MCA_NOTFOUND; - - info.slot = start; - info.id = id; - info.mca_dev = NULL; - - for(;;) { - bus_for_each_dev(&mca_bus_type, NULL, &info, mca_find_adapter_callback); - - if(info.mca_dev == NULL) - return MCA_NOTFOUND; - - if(info.mca_dev->status != MCA_ADAPTER_DISABLED - && !info.mca_dev->driver_loaded) - break; - - /* OK, found adapter but it was disabled or already in - * use. Go around again, excluding the slot we just - * found */ - - info.slot = info.mca_dev->slot + 1; - info.mca_dev = NULL; - } - - return info.mca_dev->slot; -} -EXPORT_SYMBOL(mca_find_unused_adapter); - -/* NOTE: stack allocated structure */ -struct mca_find_device_by_slot_info { - int slot; - struct mca_device *mca_dev; -}; - -static int mca_find_device_by_slot_callback(struct device *dev, void *data) -{ - struct mca_find_device_by_slot_info *info = data; - struct mca_device *mca_dev = to_mca_device(dev); - - if(mca_dev->slot == info->slot) - info->mca_dev = mca_dev; - - return 0; -} - -struct mca_device *mca_find_device_by_slot(int slot) -{ - struct mca_find_device_by_slot_info info; - - info.slot = slot; - info.mca_dev = NULL; - - bus_for_each_dev(&mca_bus_type, NULL, &info, mca_find_device_by_slot_callback); - - return info.mca_dev; -} - -/** - * mca_read_stored_pos - read POS register from boot data - * @slot: slot number to read from - * @reg: register to read from - * - * Fetch a POS value that was stored at boot time by the kernel - * when it scanned the MCA space. The register value is returned. - * Missing or invalid registers report 0. 
- */ -unsigned char mca_read_stored_pos(int slot, int reg) -{ - struct mca_device *mca_dev = mca_find_device_by_slot(slot); - - if(!mca_dev) - return 0; - - return mca_device_read_stored_pos(mca_dev, reg); -} -EXPORT_SYMBOL(mca_read_stored_pos); - - -/** - * mca_read_pos - read POS register from card - * @slot: slot number to read from - * @reg: register to read from - * - * Fetch a POS value directly from the hardware to obtain the - * current value. This is much slower than mca_read_stored_pos and - * may not be invoked from interrupt context. It handles the - * deep magic required for onboard devices transparently. - */ - -unsigned char mca_read_pos(int slot, int reg) -{ - struct mca_device *mca_dev = mca_find_device_by_slot(slot); - - if(!mca_dev) - return 0; - - return mca_device_read_pos(mca_dev, reg); -} -EXPORT_SYMBOL(mca_read_pos); - - -/** - * mca_write_pos - read POS register from card - * @slot: slot number to read from - * @reg: register to read from - * @byte: byte to write to the POS registers - * - * Store a POS value directly from the hardware. You should not - * normally need to use this function and should have a very good - * knowledge of MCA bus before you do so. Doing this wrongly can - * damage the hardware. - * - * This function may not be used from interrupt context. - * - * Note that this a technically a Bad Thing, as IBM tech stuff says - * you should only set POS values through their utilities. - * However, some devices such as the 3c523 recommend that you write - * back some data to make sure the configuration is consistent. - * I'd say that IBM is right, but I like my drivers to work. - * - * This function can't do checks to see if multiple devices end up - * with the same resources, so you might see magic smoke if someone - * screws up. 
- */ - -void mca_write_pos(int slot, int reg, unsigned char byte) -{ - struct mca_device *mca_dev = mca_find_device_by_slot(slot); - - if(!mca_dev) - return; - - mca_device_write_pos(mca_dev, reg, byte); -} -EXPORT_SYMBOL(mca_write_pos); - -/** - * mca_set_adapter_name - Set the description of the card - * @slot: slot to name - * @name: text string for the namen - * - * This function sets the name reported via /proc for this - * adapter slot. This is for user information only. Setting a - * name deletes any previous name. - */ - -void mca_set_adapter_name(int slot, char* name) -{ - struct mca_device *mca_dev = mca_find_device_by_slot(slot); - - if(!mca_dev) - return; - - mca_device_set_name(mca_dev, name); -} -EXPORT_SYMBOL(mca_set_adapter_name); - -/** - * mca_mark_as_used - claim an MCA device - * @slot: slot to claim - * FIXME: should we make this threadsafe - * - * Claim an MCA slot for a device driver. If the - * slot is already taken the function returns 1, - * if it is not taken it is claimed and 0 is - * returned. - */ - -int mca_mark_as_used(int slot) -{ - struct mca_device *mca_dev = mca_find_device_by_slot(slot); - - if(!mca_dev) - /* FIXME: this is actually a severe error */ - return 1; - - if(mca_device_claimed(mca_dev)) - return 1; - - mca_device_set_claim(mca_dev, 1); - - return 0; -} -EXPORT_SYMBOL(mca_mark_as_used); - -/** - * mca_mark_as_unused - release an MCA device - * @slot: slot to claim - * - * Release the slot for other drives to use. - */ - -void mca_mark_as_unused(int slot) -{ - struct mca_device *mca_dev = mca_find_device_by_slot(slot); - - if(!mca_dev) - return; - - mca_device_set_claim(mca_dev, 0); -} -EXPORT_SYMBOL(mca_mark_as_unused); - diff --git a/drivers/mca/mca-proc.c b/drivers/mca/mca-proc.c deleted file mode 100644 index 81ea0d377bf4..000000000000 --- a/drivers/mca/mca-proc.c +++ /dev/null @@ -1,249 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8 -*- */ - -/* - * MCA bus support functions for the proc fs. 
- * - * NOTE: this code *requires* the legacy MCA api. - * - * Legacy API means the API that operates in terms of MCA slot number - * - * (C) 2002 James Bottomley - * -**----------------------------------------------------------------------------- -** -** This program is free software; you can redistribute it and/or modify -** it under the terms of the GNU General Public License as published by -** the Free Software Foundation; either version 2 of the License, or -** (at your option) any later version. -** -** This program is distributed in the hope that it will be useful, -** but WITHOUT ANY WARRANTY; without even the implied warranty of -** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -** GNU General Public License for more details. -** -** You should have received a copy of the GNU General Public License -** along with this program; if not, write to the Free Software -** Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -** -**----------------------------------------------------------------------------- - */ -#include -#include -#include -#include - -static int get_mca_info_helper(struct mca_device *mca_dev, char *page, int len) -{ - int j; - - for(j=0; j<8; j++) - len += sprintf(page+len, "%02x ", - mca_dev ? mca_dev->pos[j] : 0xff); - len += sprintf(page+len, " %s\n", mca_dev ? 
mca_dev->name : ""); - return len; -} - -static int get_mca_info(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int i, len = 0; - - if(MCA_bus) { - struct mca_device *mca_dev; - /* Format POS registers of eight MCA slots */ - - for(i=0; icount) len = count; - if (len<0) len = 0; - return len; -} - -/*--------------------------------------------------------------------*/ - -static int mca_default_procfn(char* buf, struct mca_device *mca_dev) -{ - int len = 0, i; - int slot = mca_dev->slot; - - /* Print out the basic information */ - - if(slot < MCA_MAX_SLOT_NR) { - len += sprintf(buf+len, "Slot: %d\n", slot+1); - } else if(slot == MCA_INTEGSCSI) { - len += sprintf(buf+len, "Integrated SCSI Adapter\n"); - } else if(slot == MCA_INTEGVIDEO) { - len += sprintf(buf+len, "Integrated Video Adapter\n"); - } else if(slot == MCA_MOTHERBOARD) { - len += sprintf(buf+len, "Motherboard\n"); - } - if (mca_dev->name[0]) { - - /* Drivers might register a name without /proc handler... */ - - len += sprintf(buf+len, "Adapter Name: %s\n", - mca_dev->name); - } else { - len += sprintf(buf+len, "Adapter Name: Unknown\n"); - } - len += sprintf(buf+len, "Id: %02x%02x\n", - mca_dev->pos[1], mca_dev->pos[0]); - len += sprintf(buf+len, "Enabled: %s\nPOS: ", - mca_device_status(mca_dev) == MCA_ADAPTER_NORMAL ? - "Yes" : "No"); - for(i=0; i<8; i++) { - len += sprintf(buf+len, "%02x ", mca_dev->pos[i]); - } - len += sprintf(buf+len, "\nDriver Installed: %s", - mca_device_claimed(mca_dev) ? 
"Yes" : "No"); - buf[len++] = '\n'; - buf[len] = 0; - - return len; -} /* mca_default_procfn() */ - -static int get_mca_machine_info(char* page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len = 0; - - len += sprintf(page+len, "Model Id: 0x%x\n", machine_id); - len += sprintf(page+len, "Submodel Id: 0x%x\n", machine_submodel_id); - len += sprintf(page+len, "BIOS Revision: 0x%x\n", BIOS_revision); - - if (len <= off+count) *eof = 1; - *start = page + off; - len -= off; - if (len>count) len = count; - if (len<0) len = 0; - return len; -} - -static int mca_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - struct mca_device *mca_dev = (struct mca_device *)data; - int len = 0; - - /* Get the standard info */ - - len = mca_default_procfn(page, mca_dev); - - /* Do any device-specific processing, if there is any */ - - if(mca_dev->procfn) { - len += mca_dev->procfn(page+len, mca_dev->slot, - mca_dev->proc_dev); - } - if (len <= off+count) *eof = 1; - *start = page + off; - len -= off; - if (len>count) len = count; - if (len<0) len = 0; - return len; -} /* mca_read_proc() */ - -/*--------------------------------------------------------------------*/ - -void __init mca_do_proc_init(void) -{ - int i; - struct proc_dir_entry *proc_mca; - struct proc_dir_entry* node = NULL; - struct mca_device *mca_dev; - - proc_mca = proc_mkdir("mca", NULL); - create_proc_read_entry("pos",0,proc_mca,get_mca_info,NULL); - create_proc_read_entry("machine",0,proc_mca,get_mca_machine_info,NULL); - - /* Initialize /proc/mca entries for existing adapters */ - - for(i = 0; i < MCA_NUMADAPTERS; i++) { - enum MCA_AdapterStatus status; - mca_dev = mca_find_device_by_slot(i); - if(!mca_dev) - continue; - - mca_dev->procfn = NULL; - - if(i < MCA_MAX_SLOT_NR) sprintf(mca_dev->procname,"slot%d", i+1); - else if(i == MCA_INTEGVIDEO) sprintf(mca_dev->procname,"video"); - else if(i == MCA_INTEGSCSI) sprintf(mca_dev->procname,"scsi"); - else if(i == 
MCA_MOTHERBOARD) sprintf(mca_dev->procname,"planar"); - - status = mca_device_status(mca_dev); - if (status != MCA_ADAPTER_NORMAL && - status != MCA_ADAPTER_DISABLED) - continue; - - node = create_proc_read_entry(mca_dev->procname, 0, proc_mca, - mca_read_proc, (void *)mca_dev); - - if(node == NULL) { - printk("Failed to allocate memory for MCA proc-entries!"); - return; - } - } - -} /* mca_do_proc_init() */ - -/** - * mca_set_adapter_procfn - Set the /proc callback - * @slot: slot to configure - * @procfn: callback function to call for /proc - * @dev: device information passed to the callback - * - * This sets up an information callback for /proc/mca/slot?. The - * function is called with the buffer, slot, and device pointer (or - * some equally informative context information, or nothing, if you - * prefer), and is expected to put useful information into the - * buffer. The adapter name, ID, and POS registers get printed - * before this is called though, so don't do it again. - * - * This should be called with a %NULL @procfn when a module - * unregisters, thus preventing kernel crashes and other such - * nastiness. - */ - -void mca_set_adapter_procfn(int slot, MCA_ProcFn procfn, void* proc_dev) -{ - struct mca_device *mca_dev = mca_find_device_by_slot(slot); - - if(!mca_dev) - return; - - mca_dev->procfn = procfn; - mca_dev->proc_dev = proc_dev; -} -EXPORT_SYMBOL(mca_set_adapter_procfn); diff --git a/drivers/message/i2o/i2o_proc.c b/drivers/message/i2o/i2o_proc.c index 6d115c7208ab..506c36f6e1db 100644 --- a/drivers/message/i2o/i2o_proc.c +++ b/drivers/message/i2o/i2o_proc.c @@ -283,7 +283,6 @@ static char *bus_strings[] = { "Local Bus", "ISA", "EISA", - "MCA", "PCI", "PCMCIA", "NUBUS", @@ -351,18 +350,6 @@ static int i2o_seq_show_hrt(struct seq_file *seq, void *v) EisaSlotNumber); break; - case I2O_BUS_MCA: - seq_printf(seq, " IOBase: %0#6x,", - hrt->hrt_entry[i].bus.mca_bus. 
- McaBaseIOPort); - seq_printf(seq, " MemoryBase: %0#10x,", - hrt->hrt_entry[i].bus.mca_bus. - McaBaseMemoryAddress); - seq_printf(seq, " Slot: %0#4x,", - hrt->hrt_entry[i].bus.mca_bus. - McaSlotNumber); - break; - case I2O_BUS_PCI: seq_printf(seq, " Bus: %0#4x", hrt->hrt_entry[i].bus.pci_bus. diff --git a/include/linux/i2o-dev.h b/include/linux/i2o-dev.h index a0b23dd45239..a8093bfec3a6 100644 --- a/include/linux/i2o-dev.h +++ b/include/linux/i2o-dev.h @@ -124,7 +124,7 @@ typedef struct i2o_sg_io_hdr { #define I2O_BUS_LOCAL 0 #define I2O_BUS_ISA 1 #define I2O_BUS_EISA 2 -#define I2O_BUS_MCA 3 +/* was I2O_BUS_MCA 3 */ #define I2O_BUS_PCI 4 #define I2O_BUS_PCMCIA 5 #define I2O_BUS_NUBUS 6 diff --git a/include/linux/mca-legacy.h b/include/linux/mca-legacy.h deleted file mode 100644 index 7a3aea845902..000000000000 --- a/include/linux/mca-legacy.h +++ /dev/null @@ -1,66 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8 -*- */ - -/* This is the function prototypes for the old legacy MCA interface - * - * Please move your driver to the new sysfs based one instead */ - -#ifndef _LINUX_MCA_LEGACY_H -#define _LINUX_MCA_LEGACY_H - -#include - -#warning "MCA legacy - please move your driver to the new sysfs api" - -/* MCA_NOTFOUND is an error condition. The other two indicate - * motherboard POS registers contain the adapter. They might be - * returned by the mca_find_adapter() function, and can be used as - * arguments to mca_read_stored_pos(). I'm not going to allow direct - * access to the motherboard registers until we run across an adapter - * that requires it. We don't know enough about them to know if it's - * safe. - * - * See Documentation/mca.txt or one of the existing drivers for - * more information. - */ -#define MCA_NOTFOUND (-1) - - - -/* Returns the slot of the first enabled adapter matching id. User can - * specify a starting slot beyond zero, to deal with detecting multiple - * devices. Returns MCA_NOTFOUND if id not found. Also checks the - * integrated adapters. 
- */ -extern int mca_find_adapter(int id, int start); -extern int mca_find_unused_adapter(int id, int start); - -extern int mca_mark_as_used(int slot); -extern void mca_mark_as_unused(int slot); - -/* gets a byte out of POS register (stored in memory) */ -extern unsigned char mca_read_stored_pos(int slot, int reg); - -/* This can be expanded later. Right now, it gives us a way of - * getting meaningful information into the MCA_info structure, - * so we can have a more interesting /proc/mca. - */ -extern void mca_set_adapter_name(int slot, char* name); - -/* These routines actually mess with the hardware POS registers. They - * temporarily disable the device (and interrupts), so make sure you know - * what you're doing if you use them. Furthermore, writing to a POS may - * result in two devices trying to share a resource, which in turn can - * result in multiple devices sharing memory spaces, IRQs, or even trashing - * hardware. YOU HAVE BEEN WARNED. - * - * You can only access slots with this. Motherboard registers are off - * limits. - */ - -/* read a byte from the specified POS register. */ -extern unsigned char mca_read_pos(int slot, int reg); - -/* write a byte to the specified POS register. */ -extern void mca_write_pos(int slot, int reg, unsigned char byte); - -#endif diff --git a/include/linux/mca.h b/include/linux/mca.h deleted file mode 100644 index 37972704617f..000000000000 --- a/include/linux/mca.h +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Header for Microchannel Architecture Bus - * Written by Martin Kolinek, February 1996 - */ - -#ifndef _LINUX_MCA_H -#define _LINUX_MCA_H - -#include - -#ifdef CONFIG_MCA -#include - -extern int MCA_bus; -#else -#define MCA_bus 0 -#endif - -/* This sets up an information callback for /proc/mca/slot?. The - * function is called with the buffer, slot, and device pointer (or - * some equally informative context information, or nothing, if you - * prefer), and is expected to put useful information into the - * buffer. 
The adapter name, id, and POS registers get printed - * before this is called though, so don't do it again. - * - * This should be called with a NULL procfn when a module - * unregisters, thus preventing kernel crashes and other such - * nastiness. - */ -typedef int (*MCA_ProcFn)(char* buf, int slot, void* dev); - -/* Should only be called by the NMI interrupt handler, this will do some - * fancy stuff to figure out what might have generated a NMI. - */ -extern void mca_handle_nmi(void); - -enum MCA_AdapterStatus { - MCA_ADAPTER_NORMAL = 0, - MCA_ADAPTER_NONE = 1, - MCA_ADAPTER_DISABLED = 2, - MCA_ADAPTER_ERROR = 3 -}; - -struct mca_device { - u64 dma_mask; - int pos_id; - int slot; - - /* index into id_table, set by the bus match routine */ - int index; - - /* is there a driver installed? 0 - No, 1 - Yes */ - int driver_loaded; - /* POS registers */ - unsigned char pos[8]; - /* if a pseudo adapter of the motherboard, this is the motherboard - * register value to use for setup cycles */ - short pos_register; - - enum MCA_AdapterStatus status; -#ifdef CONFIG_MCA_PROC_FS - /* name of the proc/mca file */ - char procname[8]; - /* /proc info callback */ - MCA_ProcFn procfn; - /* device/context info for proc callback */ - void *proc_dev; -#endif - struct device dev; - char name[32]; -}; -#define to_mca_device(mdev) container_of(mdev, struct mca_device, dev) - -struct mca_bus_accessor_functions { - unsigned char (*mca_read_pos)(struct mca_device *, int reg); - void (*mca_write_pos)(struct mca_device *, int reg, - unsigned char byte); - int (*mca_transform_irq)(struct mca_device *, int irq); - int (*mca_transform_ioport)(struct mca_device *, - int region); - void * (*mca_transform_memory)(struct mca_device *, - void *memory); -}; - -struct mca_bus { - u64 default_dma_mask; - int number; - struct mca_bus_accessor_functions f; - struct device dev; - char name[32]; -}; -#define to_mca_bus(mdev) container_of(mdev, struct mca_bus, dev) - -struct mca_driver { - const short 
*id_table; - void *driver_data; - int integrated_id; - struct device_driver driver; -}; -#define to_mca_driver(mdriver) container_of(mdriver, struct mca_driver, driver) - -/* Ongoing supported API functions */ -extern struct mca_device *mca_find_device_by_slot(int slot); -extern int mca_system_init(void); -extern struct mca_bus *mca_attach_bus(int); - -extern unsigned char mca_device_read_stored_pos(struct mca_device *mca_dev, - int reg); -extern unsigned char mca_device_read_pos(struct mca_device *mca_dev, int reg); -extern void mca_device_write_pos(struct mca_device *mca_dev, int reg, - unsigned char byte); -extern int mca_device_transform_irq(struct mca_device *mca_dev, int irq); -extern int mca_device_transform_ioport(struct mca_device *mca_dev, int port); -extern void *mca_device_transform_memory(struct mca_device *mca_dev, - void *mem); -extern int mca_device_claimed(struct mca_device *mca_dev); -extern void mca_device_set_claim(struct mca_device *mca_dev, int val); -extern void mca_device_set_name(struct mca_device *mca_dev, const char *name); -static inline char *mca_device_get_name(struct mca_device *mca_dev) -{ - return mca_dev ? 
mca_dev->name : NULL; -} - -extern enum MCA_AdapterStatus mca_device_status(struct mca_device *mca_dev); - -extern struct bus_type mca_bus_type; - -extern int mca_register_driver(struct mca_driver *drv); -extern int mca_register_driver_integrated(struct mca_driver *, int); -extern void mca_unregister_driver(struct mca_driver *drv); - -/* WARNING: only called by the boot time device setup */ -extern int mca_register_device(int bus, struct mca_device *mca_dev); - -#ifdef CONFIG_MCA_PROC_FS -extern void mca_do_proc_init(void); -extern void mca_set_adapter_procfn(int slot, MCA_ProcFn, void* dev); -#else -static inline void mca_do_proc_init(void) -{ -} - -static inline void mca_set_adapter_procfn(int slot, MCA_ProcFn fn, void* dev) -{ -} -#endif - -#endif /* _LINUX_MCA_H */ diff --git a/scripts/kconfig/mconf.c b/scripts/kconfig/mconf.c index 2c6286c0bc1a..f606738d421d 100644 --- a/scripts/kconfig/mconf.c +++ b/scripts/kconfig/mconf.c @@ -240,7 +240,7 @@ search_help[] = N_( "Defined at drivers/pci/Kconfig:47\n" "Depends on: X86_LOCAL_APIC && X86_IO_APIC || IA64\n" "Location:\n" - " -> Bus options (PCI, PCMCIA, EISA, MCA, ISA)\n" + " -> Bus options (PCI, PCMCIA, EISA, ISA)\n" " -> PCI support (PCI [=y])\n" " -> PCI access mode ( [=y])\n" "Selects: LIBCRC32\n" diff --git a/scripts/kconfig/nconf.c b/scripts/kconfig/nconf.c index 73070cb0b6de..8c0eb65978c9 100644 --- a/scripts/kconfig/nconf.c +++ b/scripts/kconfig/nconf.c @@ -223,7 +223,7 @@ search_help[] = N_( "Defined at drivers/pci/Kconfig:47\n" "Depends on: X86_LOCAL_APIC && X86_IO_APIC || IA64\n" "Location:\n" -" -> Bus options (PCI, PCMCIA, EISA, MCA, ISA)\n" +" -> Bus options (PCI, PCMCIA, EISA, ISA)\n" " -> PCI support (PCI [ = y])\n" " -> PCI access mode ( [ = y])\n" "Selects: LIBCRC32\n" -- cgit v1.2.3 From bea3f8781e30d0abc0bd0da80aa528d44c71959e Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Fri, 18 May 2012 00:24:09 -0700 Subject: x86, relocs: Workaround for binutils 2.22.52.0.1 section bug GNU ld 2.22.52.0.1 has a bug that it blindly changes symbols from section-relative to absolute if they are in a section of zero length. This turns the symbols __init_begin and __init_end into absolute symbols. Let the relocs program know that those should be treated as relative symbols. Reported-by: Ingo Molnar Signed-off-by: H. Peter Anvin Cc: H.J. Lu --- arch/x86/tools/relocs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 74e16bb15dc4..4df285450e8c 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -56,7 +56,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { * as absolute (typically defined outside any section in the linker script.) */ [S_REL] = - "^_end$", + "^(__init_begin|__init_end|_end)$" }; -- cgit v1.2.3 From 87e4baacaeba098e217b14e9d2f6af428fe3f657 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 18 May 2012 09:34:45 +0200 Subject: x86/xen/apic: Add missing #include This file depends on , but the dependency was hidden due to: -> -> -> With the removal of , this exposed the missing Cc: Len Brown Cc: Konrad Rzeszutek Wilk Cc: Jeremy Fitzhardinge Cc: Jarkko Sakkinen Cc: H. 
Peter Anvin Link: http://lkml.kernel.org/n/tip-7ccybvue6mw6wje3uxzzcglj@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/xen/apic.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 1913bf2d2a9c..ec57bd3818a4 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -1,9 +1,12 @@ #include + #include #include -#include #include +#include +#include + unsigned int xen_io_apic_read(unsigned apic, unsigned reg) { struct physdev_apic apic_op; -- cgit v1.2.3 From c8f64bf7df9e4858c051b6c1c09582359b97677d Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 16 May 2012 19:03:25 +0300 Subject: x86/apic: Fix typo EIO_ACK -> EOI_ACK and document it Fix typo in the macro name and document the reason it has this value. Update users. Signed-off-by: Michael S. Tsirkin Cc: Avi Kivity Cc: Marcelo Tosatti Cc: gleb@redhat.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/37867b31b9330690af2e60a2a7c4cb4b1b070caf.1337184153.git.mst@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apicdef.h | 2 +- arch/x86/platform/visws/visws_quirks.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 134bba00df09..c46bb99d5fb2 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -37,7 +37,7 @@ #define APIC_ARBPRI_MASK 0xFFu #define APIC_PROCPRI 0xA0 #define APIC_EOI 0xB0 -#define APIC_EIO_ACK 0x0 +#define APIC_EOI_ACK 0x0 /* Docs say 0 for future compat. 
*/ #define APIC_RRR 0xC0 #define APIC_LDR 0xD0 #define APIC_LDR_MASK (0xFFu << 24) diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c index c7abf13a213f..94d8a39332ec 100644 --- a/arch/x86/platform/visws/visws_quirks.c +++ b/arch/x86/platform/visws/visws_quirks.c @@ -445,7 +445,7 @@ static void ack_cobalt_irq(struct irq_data *data) spin_lock_irqsave(&cobalt_lock, flags); disable_cobalt_irq(data); - apic_write(APIC_EOI, APIC_EIO_ACK); + apic_write(APIC_EOI, APIC_EOI_ACK); spin_unlock_irqrestore(&cobalt_lock, flags); } -- cgit v1.2.3 From 4ebcc243901c48ee3baba6bdf179c7315fa8806f Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 16 May 2012 19:03:44 +0300 Subject: x86/apic: Use symbolic APIC_EOI_ACK Use the symbol instead of hard-coded numbers, now that the reason for the value is documented where the constant is defined we don't need to duplicate this explanation in code. Signed-off-by: Michael S. Tsirkin Cc: Avi Kivity Cc: Marcelo Tosatti Cc: gleb@redhat.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/ecbe4c79d69c172378e47e5a587ff5cd10293c9f.1337184153.git.mst@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index d85410171260..a09e9ab0bbdf 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -463,9 +463,7 @@ static inline void ack_APIC_irq(void) * ack_APIC_irq() actually gets compiled as a single instruction * ... yummie. */ - - /* Docs say use 0 for future compatibility */ - apic_write(APIC_EOI, 0); + apic_write(APIC_EOI, APIC_EOI_ACK); } static inline unsigned default_get_apic_id(unsigned long x) -- cgit v1.2.3 From 2a43195d831997551da93e6b3c22c965e93fe9cc Mon Sep 17 00:00:00 2001 From: "Michael S. 
Tsirkin" Date: Wed, 16 May 2012 19:03:52 +0300 Subject: x86/apic: Add apic->eoi_write() callback Add eoi_write callback so that kvm can override eoi accesses without touching the rest of the apic. As a side-effect, this will enable a micro-optimization for apics using msr. Signed-off-by: Michael S. Tsirkin Cc: Avi Kivity Cc: Marcelo Tosatti Cc: gleb@redhat.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/0df425d746c49ac2ecc405174df87752869629d2.1337184153.git.mst@redhat.com [ tidied it up a bit ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 16 +++++++++++++++- arch/x86/kernel/apic/apic_flat_64.c | 2 ++ arch/x86/kernel/apic/apic_noop.c | 1 + arch/x86/kernel/apic/apic_numachip.c | 1 + arch/x86/kernel/apic/bigsmp_32.c | 1 + arch/x86/kernel/apic/es7000_32.c | 2 ++ arch/x86/kernel/apic/numaq_32.c | 1 + arch/x86/kernel/apic/probe_32.c | 1 + arch/x86/kernel/apic/summit_32.c | 1 + arch/x86/kernel/apic/x2apic_cluster.c | 1 + arch/x86/kernel/apic/x2apic_phys.c | 1 + arch/x86/kernel/apic/x2apic_uv_x.c | 1 + 12 files changed, 28 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index a09e9ab0bbdf..bf8d065dd977 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -351,6 +351,14 @@ struct apic { /* apic ops */ u32 (*read)(u32 reg); void (*write)(u32 reg, u32 v); + /* + * ->eoi_write() has the same signature as ->write(). + * + * Drivers can support both ->eoi_write() and ->write() by passing the same + * callback value. Kernel can override ->eoi_write() and fall back + * on write for EOI. 
+ */ + void (*eoi_write)(u32 reg, u32 v); u64 (*icr_read)(void); void (*icr_write)(u32 low, u32 high); void (*wait_icr_idle)(void); @@ -426,6 +434,11 @@ static inline void apic_write(u32 reg, u32 val) apic->write(reg, val); } +static inline void apic_eoi(void) +{ + apic->eoi_write(APIC_EOI, APIC_EOI_ACK); +} + static inline u64 apic_icr_read(void) { return apic->icr_read(); @@ -450,6 +463,7 @@ static inline u32 safe_apic_wait_icr_idle(void) static inline u32 apic_read(u32 reg) { return 0; } static inline void apic_write(u32 reg, u32 val) { } +static inline void apic_eoi(void) { } static inline u64 apic_icr_read(void) { return 0; } static inline void apic_icr_write(u32 low, u32 high) { } static inline void apic_wait_icr_idle(void) { } @@ -463,7 +477,7 @@ static inline void ack_APIC_irq(void) * ack_APIC_irq() actually gets compiled as a single instruction * ... yummie. */ - apic_write(APIC_EOI, APIC_EOI_ACK); + apic_eoi(); } static inline unsigned default_get_apic_id(unsigned long x) diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 359b6899a36c..0e881c46e8c8 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -227,6 +227,7 @@ static struct apic apic_flat = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, @@ -386,6 +387,7 @@ static struct apic apic_physflat = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 634ae6cdd5c9..a6e4c6e06c08 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -181,6 +181,7 @@ struct apic apic_noop = { .read = 
noop_apic_read, .write = noop_apic_write, + .eoi_write = noop_apic_write, .icr_read = noop_apic_icr_read, .icr_write = noop_apic_icr_write, .wait_icr_idle = noop_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 23e75422e013..6ec6d5d297c3 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -295,6 +295,7 @@ static struct apic apic_numachip __refconst = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 0cdec7065aff..31fbdbfbf960 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -248,6 +248,7 @@ static struct apic apic_bigsmp = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index e42d1d3b9134..db4ab1be3c79 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -678,6 +678,7 @@ static struct apic __refdata apic_es7000_cluster = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, @@ -742,6 +743,7 @@ static struct apic __refdata apic_es7000 = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 
00d2422ca7c9..f00a68cca37a 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -530,6 +530,7 @@ static struct apic __refdata apic_numaq = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index ff2c1b9aac4d..1b291da09e60 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -142,6 +142,7 @@ static struct apic apic_default = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index fea000b27f07..659897c00755 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -546,6 +546,7 @@ static struct apic apic_summit = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 48f3103b3c93..a5baa785a251 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -260,6 +260,7 @@ static struct apic apic_x2apic_cluster = { .read = native_apic_msr_read, .write = native_apic_msr_write, + .eoi_write = native_apic_msr_write, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, .wait_icr_idle = native_x2apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 991e315f4227..834035666b8d 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ 
b/arch/x86/kernel/apic/x2apic_phys.c @@ -172,6 +172,7 @@ static struct apic apic_x2apic_phys = { .read = native_apic_msr_read, .write = native_apic_msr_write, + .eoi_write = native_apic_msr_write, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, .wait_icr_idle = native_x2apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 87bfa69e216e..5b0e3d0a3d2d 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -404,6 +404,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .read = native_apic_msr_read, .write = native_apic_msr_write, + .eoi_write = native_apic_msr_write, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, .wait_icr_idle = native_x2apic_wait_icr_idle, -- cgit v1.2.3 From 0ab711ae6ab0db7696b43c74f9ba9de4d7fc1deb Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 16 May 2012 19:03:58 +0300 Subject: x86/apic: Implement EOI micro-optimization We know both register and value for eoi beforehand, so there's no need to check it and no need to do math to calculate the msr. Saves instructions/branches on each EOI when using x2apic. I looked at the objdump output to verify that the generated code looks right and actually is shorter. The real improvements will be on the KVM guest side though, those come in a later patch. Signed-off-by: Michael S. 
Tsirkin Cc: Avi Kivity Cc: Marcelo Tosatti Cc: gleb@redhat.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/e019d1a125316f10d3e3a4b2f6bda41473f4fb72.1337184153.git.mst@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 5 +++++ arch/x86/kernel/apic/x2apic_cluster.c | 2 +- arch/x86/kernel/apic/x2apic_phys.c | 2 +- arch/x86/kernel/apic/x2apic_uv_x.c | 2 +- 4 files changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index bf8d065dd977..eaff4790ed96 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -138,6 +138,11 @@ static inline void native_apic_msr_write(u32 reg, u32 v) wrmsr(APIC_BASE_MSR + (reg >> 4), v, 0); } +static inline void native_apic_msr_eoi_write(u32 reg, u32 v) +{ + wrmsr(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0); +} + static inline u32 native_apic_msr_read(u32 reg) { u64 msr; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index a5baa785a251..ff35cff0e1a7 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -260,7 +260,7 @@ static struct apic apic_x2apic_cluster = { .read = native_apic_msr_read, .write = native_apic_msr_write, - .eoi_write = native_apic_msr_write, + .eoi_write = native_apic_msr_eoi_write, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, .wait_icr_idle = native_x2apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 834035666b8d..c17e982db275 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -172,7 +172,7 @@ static struct apic apic_x2apic_phys = { .read = native_apic_msr_read, .write = native_apic_msr_write, - .eoi_write = native_apic_msr_write, + .eoi_write = native_apic_msr_eoi_write, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, .wait_icr_idle = 
native_x2apic_wait_icr_idle, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 5b0e3d0a3d2d..c6d03f7a4401 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -404,7 +404,7 @@ static struct apic __refdata apic_x2apic_uv_x = { .read = native_apic_msr_read, .write = native_apic_msr_write, - .eoi_write = native_apic_msr_write, + .eoi_write = native_apic_msr_eoi_write, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, .wait_icr_idle = native_x2apic_wait_icr_idle, -- cgit v1.2.3 From 3e7f3db001de6133db1c385c92eec944409a8b4f Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 10 May 2012 18:01:59 +0800 Subject: x86/tlb: Clean up and unify TLB_FLUSH_ALL definition Since sizeof(long) is 4 in x86_32 mode, and it's 8 in x86_64 mode, sizeof(long long) is also 8 bytes in x86_64 mode. Using long can fit the TLB_FLUSH_ALL definition here in both 32- and 64-bit modes. Signed-off-by: Alex Shi Link: http://lkml.kernel.org/n/tip-evv5bekiipi2pmyzdsy8lkkw@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 169be8938b96..63af9098e6a5 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -62,11 +62,7 @@ static inline void __flush_tlb_one(unsigned long addr) __flush_tlb(); } -#ifdef CONFIG_X86_32 -# define TLB_FLUSH_ALL 0xffffffff -#else -# define TLB_FLUSH_ALL -1ULL -#endif +#define TLB_FLUSH_ALL -1UL /* * TLB flushing: -- cgit v1.2.3 From 20167d3421a089a1bf1bd680b150dc69c9506810 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 16 May 2012 14:06:26 +0100 Subject: x86-64: Fix accounting in kernel_physical_mapping_init() When finding a present and acceptable 2M/1G mapping, the number of pages mapped this way shouldn't be incremented (as it was already incremented
when the earlier part of the mapping was established). Instead, last_map_addr needs to be updated in this case. Further, address increments were wrong in one place each in both phys_pmd_init() and phys_pud_init() (lacking the aligning down to the respective page boundary). As we're now doing the same calculation several times, fold it into a single instance using a local variable (matching how kernel_physical_mapping_init() itself does it at the PGD level). Observed during code inspection, not because of an actual problem. Signed-off-by: Jan Beulich Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/4FB3C27202000078000841A0@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 436a0309db33..f9476a0f8cb6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -408,12 +408,12 @@ static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, unsigned long page_size_mask, pgprot_t prot) { - unsigned long pages = 0; + unsigned long pages = 0, next; unsigned long last_map_addr = end; int i = pmd_index(address); - for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { + for (; i < PTRS_PER_PMD; i++, address = next) { unsigned long pte_phys; pmd_t *pmd = pmd_page + pmd_index(address); pte_t *pte; @@ -427,6 +427,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, break; } + next = (address & PMD_MASK) + PMD_SIZE; + if (pmd_val(*pmd)) { if (!pmd_large(*pmd)) { spin_lock(&init_mm.page_table_lock); @@ -450,7 +452,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, * attributes. 
*/ if (page_size_mask & (1 << PG_LEVEL_2M)) { - pages++; + last_map_addr = next; continue; } new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); @@ -463,7 +465,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, pfn_pte(address >> PAGE_SHIFT, __pgprot(pgprot_val(prot) | _PAGE_PSE))); spin_unlock(&init_mm.page_table_lock); - last_map_addr = (address & PMD_MASK) + PMD_SIZE; + last_map_addr = next; continue; } @@ -483,11 +485,11 @@ static unsigned long __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, unsigned long page_size_mask) { - unsigned long pages = 0; + unsigned long pages = 0, next; unsigned long last_map_addr = end; int i = pud_index(addr); - for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) { + for (; i < PTRS_PER_PUD; i++, addr = next) { unsigned long pmd_phys; pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; @@ -496,8 +498,9 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, if (addr >= end) break; - if (!after_bootmem && - !e820_any_mapped(addr, addr+PUD_SIZE, 0)) { + next = (addr & PUD_MASK) + PUD_SIZE; + + if (!after_bootmem && !e820_any_mapped(addr, next, 0)) { set_pud(pud, __pud(0)); continue; } @@ -524,7 +527,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, * attributes. 
*/ if (page_size_mask & (1 << PG_LEVEL_1G)) { - pages++; + last_map_addr = next; continue; } prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud)); @@ -536,7 +539,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, set_pte((pte_t *)pud, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); spin_unlock(&init_mm.page_table_lock); - last_map_addr = (addr & PUD_MASK) + PUD_SIZE; + last_map_addr = next; continue; } -- cgit v1.2.3 From 5bcdf5e4fee3c45e1281c25e4941f2163cb28c65 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 18 May 2012 12:40:42 +0200 Subject: perf/x86: Update event scheduling constraints for AMD family 15h models This update is for newer family 15h cpu models from 0x02 to 0x1f. Signed-off-by: Robert Richter Acked-by: Peter Zijlstra Cc: Stephane Eranian Cc: stable@vger.kernel.org # v2.6.39+ Link: http://lkml.kernel.org/r/1337337642-1621-1-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 95e7fe1c5f0b..9edc786aef89 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -493,6 +493,7 @@ static __initconst const struct x86_pmu amd_pmu = { * 0x023 DE PERF_CTL[2:0] * 0x02D LS PERF_CTL[3] * 0x02E LS PERF_CTL[3,0] + * 0x031 LS PERF_CTL[2:0] (**) * 0x043 CU PERF_CTL[2:0] * 0x045 CU PERF_CTL[2:0] * 0x046 CU PERF_CTL[2:0] @@ -506,10 +507,12 @@ static __initconst const struct x86_pmu amd_pmu = { * 0x0DD LS PERF_CTL[5:0] * 0x0DE LS PERF_CTL[5:0] * 0x0DF LS PERF_CTL[5:0] + * 0x1C0 EX PERF_CTL[5:3] * 0x1D6 EX PERF_CTL[5:0] * 0x1D8 EX PERF_CTL[5:0] * - * (*) depending on the umask all FPU counters may be used + * (*) depending on the umask all FPU counters may be used + * (**) only one unitmask enabled at a time */ static struct event_constraint amd_f15_PMC0 = 
EVENT_CONSTRAINT(0, 0x01, 0); @@ -559,6 +562,12 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev return &amd_f15_PMC3; case 0x02E: return &amd_f15_PMC30; + case 0x031: + if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) + return &amd_f15_PMC20; + return &emptyconstraint; + case 0x1C0: + return &amd_f15_PMC53; default: return &amd_f15_PMC50; } -- cgit v1.2.3 From c54a354c1835e7412a53458891b9ea05361b4e8a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 18 May 2012 08:31:44 -0700 Subject: x86, relocs: More relocations which may end up as absolute GNU ld 2.22.52.0.1 has a bug that it blindly changes symbols from section-relative to absolute if they are in a section of zero length. This turns the symbols __init_begin and __init_end into absolute symbols. Let the relocs program know that those should be treated as relative symbols. This bug is exposed by checkin 433de739bbc2 x86, realmode: 16-bit real-mode code support for relocs tool only in the sense that that checkin changes the relocs tool to report an error instead of silently generating a kernel which is broken if relocated. Reported-by: Ingo Molnar Signed-off-by: H. Peter Anvin Cc: H.J. Lu Cc: Jarkko Sakkinen --- arch/x86/tools/relocs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 4df285450e8c..b49c2119295e 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -56,7 +56,11 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { * as absolute (typically defined outside any section in the linker script.) */ [S_REL] = - "^(__init_begin|__init_end|_end)$" + "^(__init_(begin|end)|" + "__x86_cpu_dev_(start|end)|" + "(__parainstructions|__alt_instructions)(|_end)|" + "(__iommu_table|__apicdrivers|__smp_locks)(|_end)|" + "_end)$" }; -- cgit v1.2.3 From 8a3b947c40cb36100f316ac0d433f4ae554ee4cc Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Fri, 18 May 2012 09:52:01 -0700 Subject: x86, relocs: When printing an error, say relative or absolute When the relocs tool throws an error, let the error message say if it is an absolute or relative symbol. This should make it a lot more clear what action the programmer needs to take. Signed-off-by: H. Peter Anvin --- arch/x86/tools/relocs.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index b49c2119295e..dce982d4bc31 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -570,10 +570,14 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym), Elf32_Sym *sym; unsigned r_type; const char *symname; + int shn_abs; + rel = &sec->reltab[j]; sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; r_type = ELF32_R_TYPE(rel->r_info); + shn_abs = sym->st_shndx == SHN_ABS; + switch (r_type) { case R_386_NONE: case R_386_PC32: @@ -589,7 +593,7 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym), symname = sym_name(sym_strtab, sym); if (!use_real_mode) goto bad; - if (sym->st_shndx == SHN_ABS) { + if (shn_abs) { if (is_reloc(S_ABS, symname)) break; else if (!is_reloc(S_SEG, symname)) @@ -605,7 +609,7 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym), case R_386_32: symname = sym_name(sym_strtab, sym); - if (sym->st_shndx == SHN_ABS) { + if (shn_abs) { if (is_reloc(S_ABS, symname)) break; else if (!is_reloc(S_REL, symname)) @@ -623,7 +627,8 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym), break; bad: symname = sym_name(sym_strtab, sym); - die("Invalid %s relocation: %s\n", + die("Invalid %s %s relocation: %s\n", + shn_abs ? "absolute" : "relative", rel_type(r_type), symname); } } -- cgit v1.2.3 From 61f5446169046c217a5479517edac3a890c3bee7 Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Mon, 21 May 2012 00:02:45 -0700 Subject: x86, realmode: Move end signature into header.S The end signature was defined in wakeup_asm.S as it originally came from the ACPI wakeup code. However, we rely on the existence of the .signature section to expand .bss, otherwise we would have to include code to explicitly zero the .bss depending on the configuration. Since the expanded .bss is just in .init.data anyway, it's easier to always have it expanded. This fixes failures when compiled without CONFIG_ACPI_SLEEP. Reported-by: Ingo Molnar Signed-off-by: H. Peter Anvin Cc: Jarkko Sakkinen --- arch/x86/realmode/rm/header.S | 9 +++++++++ arch/x86/realmode/rm/realmode.h | 5 +++++ arch/x86/realmode/rm/wakeup.h | 1 - arch/x86/realmode/rm/wakeup_asm.S | 6 +----- 4 files changed, 15 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index 4612d5382791..fadf48378ada 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -7,6 +7,8 @@ #include #include +#include "realmode.h" + .section ".header", "a" .balign 16 @@ -30,3 +32,10 @@ GLOBAL(real_mode_header) .long pa_machine_real_restart_asm #endif END(real_mode_header) + + /* End signature, used to verify integrity */ + .section ".signature","a" + .balign 4 +GLOBAL(end_signature) + .long REALMODE_END_SIGNATURE +END(end_signature) diff --git a/arch/x86/realmode/rm/realmode.h b/arch/x86/realmode/rm/realmode.h index 15ab6335f843..d74cff6350ed 100644 --- a/arch/x86/realmode/rm/realmode.h +++ b/arch/x86/realmode/rm/realmode.h @@ -13,4 +13,9 @@ #endif /* __ASSEMBLY__ */ +/* + * Signature at the end of the realmode region + */ +#define REALMODE_END_SIGNATURE 0x65a22c82 + #endif /* ARCH_X86_REALMODE_RM_REALMODE_H */ diff --git a/arch/x86/realmode/rm/wakeup.h b/arch/x86/realmode/rm/wakeup.h index 2dfaf06b8af1..9317e0042f24 100644 --- a/arch/x86/realmode/rm/wakeup.h +++ b/arch/x86/realmode/rm/wakeup.h @@ -33,7 +33,6 @@ 
extern struct wakeup_header wakeup_header; #define WAKEUP_HEADER_OFFSET 8 #define WAKEUP_HEADER_SIGNATURE 0x51ee1111 -#define WAKEUP_END_SIGNATURE 0x65a22c82 /* Wakeup behavior bits */ #define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0 diff --git a/arch/x86/realmode/rm/wakeup_asm.S b/arch/x86/realmode/rm/wakeup_asm.S index 46108f05e04e..8905166b0bbb 100644 --- a/arch/x86/realmode/rm/wakeup_asm.S +++ b/arch/x86/realmode/rm/wakeup_asm.S @@ -85,7 +85,7 @@ ENTRY(wakeup_start) /* Check we really have everything... */ movl end_signature, %eax - cmpl $WAKEUP_END_SIGNATURE, %eax + cmpl $REALMODE_END_SIGNATURE, %eax jne bogus_real_magic /* Call the C code */ @@ -175,7 +175,3 @@ GLOBAL(wakeup_idt) .long 0 /* address */ .word 0 END(wakeup_idt) - - .section ".signature","a" -end_signature: - .long WAKEUP_END_SIGNATURE -- cgit v1.2.3 From 74bc491795420254f8b9c782ec654c9ba005d3ac Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Sun, 20 May 2012 17:24:28 -0600 Subject: x86/pci-calgary_64.c: Remove obsoleted simple_strtoul() usage Change calgary_parse_options() to call kstrtoul() instead of calling obsoleted simple_strtoul(). 
Signed-off-by: Shuah Khan Acked-by: Muli Ben-Yehuda Cc: jdmason@kudzu.us Link: http://lkml.kernel.org/r/1337556268.3126.5.camel@lorien2 Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-calgary_64.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 6ac5782f4d6b..dbbfb261e62c 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1479,8 +1479,9 @@ cleanup: static int __init calgary_parse_options(char *p) { unsigned int bridge; + unsigned long val; size_t len; - char* endp; + ssize_t ret; while (*p) { if (!strncmp(p, "64k", 3)) @@ -1511,10 +1512,11 @@ static int __init calgary_parse_options(char *p) ++p; if (*p == '\0') break; - bridge = simple_strtoul(p, &endp, 0); - if (p == endp) + ret = kstrtoul(p, 0, &val); + if (ret) break; + bridge = val; if (bridge < MAX_PHB_BUS_NUM) { printk(KERN_INFO "Calgary: disabling " "translation for PHB %#x\n", bridge); -- cgit v1.2.3 From bdebaf80a02b854381fe212e0dac13c8c8edac57 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 May 2012 16:45:44 +0000 Subject: x86: Use generic time config Signed-off-by: Thomas Gleixner Signed-off-by: Anna-Maria Gleixner Link: http://lkml.kernel.org/r/20120518163104.630579708@glx-um.de Cc: x86@kernel.org --- arch/x86/Kconfig | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c9866b0b77d8..3b0a9217836a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -82,6 +82,13 @@ config X86 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP select DCACHE_WORD_ACCESS + select GENERIC_CMOS_UPDATE + select CLOCKSOURCE_WATCHDOG + select GENERIC_CLOCKEVENTS + select ARCH_CLOCKSOURCE_DATA if X86_64 + select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) + select GENERIC_TIME_VSYSCALL if X86_64 + select 
KTIME_SCALAR if X86_32 config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) @@ -96,23 +103,6 @@ config ARCH_DEFCONFIG default "arch/x86/configs/i386_defconfig" if X86_32 default "arch/x86/configs/x86_64_defconfig" if X86_64 -config GENERIC_CMOS_UPDATE - def_bool y - -config CLOCKSOURCE_WATCHDOG - def_bool y - -config GENERIC_CLOCKEVENTS - def_bool y - -config ARCH_CLOCKSOURCE_DATA - def_bool y - depends on X86_64 - -config GENERIC_CLOCKEVENTS_BROADCAST - def_bool y - depends on X86_64 || (X86_32 && X86_LOCAL_APIC) - config LOCKDEP_SUPPORT def_bool y @@ -166,10 +156,6 @@ config ARCH_HAS_CPU_IDLE_WAIT config GENERIC_CALIBRATE_DELAY def_bool y -config GENERIC_TIME_VSYSCALL - bool - default X86_64 - config ARCH_HAS_CPU_RELAX def_bool y @@ -236,9 +222,6 @@ config ARCH_HWEIGHT_CFLAGS default "-fcall-saved-ecx -fcall-saved-edx" if X86_32 default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64 -config KTIME_SCALAR - def_bool X86_32 - config ARCH_CPU_PROBE_RELEASE def_bool y depends on HOTPLUG_CPU -- cgit v1.2.3 From 0a2b9a6ea93650b8a00f9fd5ee8fdd25671e2df6 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Thu, 29 Dec 2011 13:09:51 +0100 Subject: X86: integrate CMA with DMA-mapping subsystem This patch adds support for CMA to dma-mapping subsystem for x86 architecture that uses common pci-dma/pci-nommu implementation. This allows to test CMA on KVM/QEMU and a lot of common x86 boxes. 
Signed-off-by: Marek Szyprowski Signed-off-by: Kyungmin Park CC: Michal Nazarewicz Acked-by: Arnd Bergmann --- arch/x86/Kconfig | 1 + arch/x86/include/asm/dma-contiguous.h | 13 +++++++++++++ arch/x86/include/asm/dma-mapping.h | 5 +++++ arch/x86/kernel/pci-dma.c | 18 ++++++++++++++++-- arch/x86/kernel/pci-nommu.c | 8 +------- arch/x86/kernel/setup.c | 2 ++ 6 files changed, 38 insertions(+), 9 deletions(-) create mode 100644 arch/x86/include/asm/dma-contiguous.h (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c9866b0b77d8..7cbdfdac3c7c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -31,6 +31,7 @@ config X86 select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_WANT_FRAME_POINTERS select HAVE_DMA_ATTRS + select HAVE_DMA_CONTIGUOUS if !SWIOTLB select HAVE_KRETPROBES select HAVE_OPTPROBES select HAVE_FTRACE_MCOUNT_RECORD diff --git a/arch/x86/include/asm/dma-contiguous.h b/arch/x86/include/asm/dma-contiguous.h new file mode 100644 index 000000000000..c09241659971 --- /dev/null +++ b/arch/x86/include/asm/dma-contiguous.h @@ -0,0 +1,13 @@ +#ifndef ASMX86_DMA_CONTIGUOUS_H +#define ASMX86_DMA_CONTIGUOUS_H + +#ifdef __KERNEL__ + +#include +#include + +static inline void +dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) { } + +#endif +#endif diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 4b4331d71935..7b9227b44b9b 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef CONFIG_ISA # define ISA_DMA_BIT_MASK DMA_BIT_MASK(24) @@ -62,6 +63,10 @@ extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag, struct dma_attrs *attrs); +extern void dma_generic_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_addr, + struct dma_attrs *attrs); + static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { if 
(!dev->dma_mask) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 3003250ac51d..62c9457ccd2f 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -100,14 +100,18 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, struct dma_attrs *attrs) { unsigned long dma_mask; - struct page *page; + struct page *page = NULL; + unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; dma_addr_t addr; dma_mask = dma_alloc_coherent_mask(dev, flag); flag |= __GFP_ZERO; again: - page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); + if (!(flag & GFP_ATOMIC)) + page = dma_alloc_from_contiguous(dev, count, get_order(size)); + if (!page) + page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); if (!page) return NULL; @@ -127,6 +131,16 @@ again: return page_address(page); } +void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr, struct dma_attrs *attrs) +{ + unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; + struct page *page = virt_to_page(vaddr); + + if (!dma_release_from_contiguous(dev, page, count)) + free_pages((unsigned long)vaddr, get_order(size)); +} + /* * See for the iommu kernel * parameter documentation. 
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index f96050685b46..871be4a84c7d 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -74,12 +74,6 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, return nents; } -static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr, struct dma_attrs *attrs) -{ - free_pages((unsigned long)vaddr, get_order(size)); -} - static void nommu_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) @@ -97,7 +91,7 @@ static void nommu_sync_sg_for_device(struct device *dev, struct dma_map_ops nommu_dma_ops = { .alloc = dma_generic_alloc_coherent, - .free = nommu_free_coherent, + .free = dma_generic_free_coherent, .map_sg = nommu_map_sg, .map_page = nommu_map_page, .sync_single_for_device = nommu_sync_single_for_device, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1a2901562059..d6c956e674cc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -934,6 +935,7 @@ void __init setup_arch(char **cmdline_p) } #endif memblock.current_limit = get_max_mapped(); + dma_contiguous_reserve(0); /* * NOTE: On x86-32, only from this point on, fixmaps are ready for use. -- cgit v1.2.3 From 2f1bd67d544d3c086fb5101513f4b6c8f4291b43 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 21 May 2012 09:19:38 -0400 Subject: xen/smp: unbind irqworkX when unplugging vCPUs. The git commit 1ff2b0c303698e486f1e0886b4d9876200ef8ca5 "xen: implement IRQ_WORK_VECTOR handler" added the functionality to have a per-cpu "irqworkX" for the IPI APIC functionality. 
However it missed the unbind when a vCPU is unplugged resulting in an orphaned per-cpu interrupt line for unplugged vCPU: 30: 216 0 xen-dyn-event hvc_console 31: 810 4 xen-dyn-event eth0 32: 29 0 xen-dyn-event blkif - 36: 0 0 xen-percpu-ipi irqwork2 - 37: 287 0 xen-dyn-event xenbus + 36: 287 0 xen-dyn-event xenbus NMI: 0 0 Non-maskable interrupts LOC: 0 0 Local timer interrupts SPU: 0 0 Spurious interrupts Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/smp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 3ec3f8eb19fc..ce9e98b1e69c 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -419,6 +419,7 @@ static void xen_cpu_die(unsigned int cpu) unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL); xen_uninit_lock_cpu(cpu); xen_teardown_timer(cpu); -- cgit v1.2.3 From 29d679ffd850ea37a303bb930142be14982611e4 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 8 May 2012 17:56:12 +0200 Subject: x86, printk: Add missing KERN_CONT to NMI selftest Fix this behaviour: ---------------- | NMI testsuite: -------------------- remote IPI: ok | local IPI: ok | Revealed due to a new modification to printk(). Signed-off-by: Sasha Levin Link: http://lkml.kernel.org/r/1336492573-17530-3-git-send-email-levinsasha928@gmail.com Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/nmi_selftest.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 2c39dcd510fa..9f11dd3b6577 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -117,15 +117,15 @@ static void __init dotest(void (*testcase_fn)(void), int expected) unexpected_testcase_failures++; if (nmi_fail == FAILURE) - printk("FAILED |"); + printk(KERN_CONT "FAILED |"); else if (nmi_fail == TIMEOUT) - printk("TIMEOUT|"); + printk(KERN_CONT "TIMEOUT|"); else - printk("ERROR |"); + printk(KERN_CONT "ERROR |"); dump_stack(); } else { testcase_successes++; - printk(" ok |"); + printk(KERN_CONT " ok |"); } testcase_total++; @@ -150,10 +150,10 @@ void __init nmi_selftest(void) print_testname("remote IPI"); dotest(remote_ipi, SUCCESS); - printk("\n"); + printk(KERN_CONT "\n"); print_testname("local IPI"); dotest(local_ipi, SUCCESS); - printk("\n"); + printk(KERN_CONT "\n"); cleanup_nmi_testsuite(); -- cgit v1.2.3 From 68c2c39a76b094e9b2773e5846424ea674bf2c46 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 21 May 2012 16:54:10 +0100 Subject: xen: do not map the same GSI twice in PVHVM guests. PV on HVM guests map GSIs into event channels. At restore time the event channels are resumed by restore_pirqs. Device drivers might try to register the same GSI again through ACPI at restore time, but the GSI has already been mapped and bound by restore_pirqs. This patch detects these situations and avoids mapping the same GSI multiple times. Without this patch we get: (XEN) irq.c:2235: dom4: pirq 23 or emuirq 28 already mapped and waste a pirq. 
CC: stable@kernel.org Signed-off-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/pci/xen.c | 4 ++++ drivers/xen/events.c | 5 +++-- include/xen/events.h | 3 +++ 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 7415aa927913..56ab74989cf1 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -64,6 +64,10 @@ static int xen_register_pirq(u32 gsi, int gsi_override, int triggering, int shareable = 0; char *name; + irq = xen_irq_from_gsi(gsi); + if (irq > 0) + return irq; + if (set_pirq) pirq = gsi; diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 4b33acd8ed4e..faae2f910ad2 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -611,7 +611,7 @@ static void disable_pirq(struct irq_data *data) disable_dynirq(data); } -static int find_irq_by_gsi(unsigned gsi) +int xen_irq_from_gsi(unsigned gsi) { struct irq_info *info; @@ -625,6 +625,7 @@ static int find_irq_by_gsi(unsigned gsi) return -1; } +EXPORT_SYMBOL_GPL(xen_irq_from_gsi); /* * Do not make any assumptions regarding the relationship between the @@ -644,7 +645,7 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, mutex_lock(&irq_mapping_update_lock); - irq = find_irq_by_gsi(gsi); + irq = xen_irq_from_gsi(gsi); if (irq != -1) { printk(KERN_INFO "xen_map_pirq_gsi: returning irq %d for gsi %u\n", irq, gsi); diff --git a/include/xen/events.h b/include/xen/events.h index 0f773708e02c..04399b28e821 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -103,6 +103,9 @@ int xen_irq_from_pirq(unsigned pirq); /* Return the pirq allocated to the irq. */ int xen_pirq_from_irq(unsigned irq); +/* Return the irq allocated to the gsi */ +int xen_irq_from_gsi(unsigned gsi); + /* Determine whether to ignore this IRQ if it is passed to a guest. 
*/ int xen_test_irq_shared(int irq); -- cgit v1.2.3 From 3b7d15bde54be81e3edd773724d85d20ae42a4da Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 22 Apr 2012 03:27:28 -0400 Subject: um: ->restart_block.fn needs to be reset on sigreturn Signed-off-by: Al Viro --- arch/um/kernel/signal.c | 3 --- arch/x86/um/signal.c | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c index fb12f4c5e649..0dfcef92ec91 100644 --- a/arch/um/kernel/signal.c +++ b/arch/um/kernel/signal.c @@ -29,9 +29,6 @@ static int handle_signal(struct pt_regs *regs, unsigned long signr, unsigned long sp; int err; - /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; - /* Did we come from a system call? */ if (PT_REGS_SYSCALL_NR(regs) >= 0) { /* If so, check system call restarting.. */ diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 4883b9546016..72eafa6c6a52 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -156,6 +156,9 @@ static int copy_sc_from_user(struct pt_regs *regs, struct sigcontext sc; int err, pid; + /* Always make any pending restarted system calls return -EINTR */ + current_thread_info()->restart_block.fn = do_no_restart_syscall; + err = copy_from_user(&sc, from, sizeof(sc)); if (err) return err; -- cgit v1.2.3 From b2d668da9307c4c163dd603d2bb3cadb10f9fd37 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Mon, 21 May 2012 20:51:24 +0300 Subject: x86, relocs: Build clean fix relocs was not cleaned up when "make clean" is issued. This patch fixes the issue. Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1337622684-6834-1-git-send-email-jarkko.sakkinen@intel.com Signed-off-by: H. 
Peter Anvin Cc: v3.4 --- arch/x86/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 94e91e401da9..b1c611e6da67 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -206,6 +206,7 @@ archclean: $(Q)rm -rf $(objtree)/arch/i386 $(Q)rm -rf $(objtree)/arch/x86_64 $(Q)$(MAKE) $(clean)=$(boot) + $(Q)$(MAKE) $(clean)=arch/x86/tools define archhelp echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' -- cgit v1.2.3 From e47b65b032f2997aa0a7392ecdf656c86d4d7561 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Mon, 21 May 2012 20:45:37 +0200 Subject: net: drop NET dependency from HAVE_BPF_JIT There is no point having the NET dependency on the select target, as it forces all users to depend on NET to tell they support BPF_JIT. Move the config option to the bottom of the file - this could be a nice place also for future "selectable" config symbols. Fix up all users to drop the dependency on NET now that it is not required to suppress warnings for non-NET builds.
Reported-by: Linus Torvalds Signed-off-by: Sam Ravnborg Acked-by: David Miller Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 2 +- arch/powerpc/Kconfig | 2 +- arch/sparc/Kconfig | 2 +- arch/x86/Kconfig | 2 +- net/Kconfig | 7 ++++--- 5 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 36586dba6fa6..e19ed3fd3089 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -33,7 +33,7 @@ config ARM select GENERIC_IRQ_SHOW select CPU_PM if (SUSPEND || CPU_IDLE) select GENERIC_PCI_IOMAP - select HAVE_BPF_JIT if NET + select HAVE_BPF_JIT help The ARM series is a line of low-power-consumption RISC chip designs licensed by ARM Ltd and targeted at embedded applications and diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index feab3bad6d0f..73ec03945717 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -141,7 +141,7 @@ config PPC select IRQ_FORCED_THREADING select HAVE_RCU_TABLE_FREE if SMP select HAVE_SYSCALL_TRACEPOINTS - select HAVE_BPF_JIT if (PPC64 && NET) + select HAVE_BPF_JIT if PPC64 select HAVE_ARCH_JUMP_LABEL select ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 6c49ed2ee786..d176c03274c5 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -30,7 +30,7 @@ config SPARC select USE_GENERIC_SMP_HELPERS if SMP select GENERIC_PCI_IOMAP select HAVE_NMI_WATCHDOG if SPARC64 - select HAVE_BPF_JIT if NET + select HAVE_BPF_JIT config SPARC32 def_bool !64BIT diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c9866b0b77d8..25f87bccbf8f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -77,7 +77,7 @@ config X86 select GENERIC_CLOCKEVENTS_MIN_ADJUST select IRQ_FORCED_THREADING select USE_GENERIC_SMP_HELPERS if SMP - select HAVE_BPF_JIT if (X86_64 && NET) + select HAVE_BPF_JIT if X86_64 select CLKEVT_I8253 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP diff --git a/net/Kconfig b/net/Kconfig index 
1e47bd03dde3..245831bec09a 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -246,9 +246,6 @@ config BQL select DQL default y -config HAVE_BPF_JIT - bool - config BPF_JIT bool "enable BPF Just In Time compiler" depends on HAVE_BPF_JIT @@ -340,3 +337,7 @@ source "net/nfc/Kconfig" endif # if NET + +# Used by archs to tell that they support BPF_JIT +config HAVE_BPF_JIT + bool -- cgit v1.2.3 From 243412be9cecfc7fddebb912a277b76119fd4ecd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 20 May 2012 00:05:58 -0400 Subject: um/x86: merge (and trim) 32- and 64-bit variants of ptrace.h Signed-off-by: Al Viro --- arch/um/kernel/process.c | 2 +- arch/um/kernel/skas/syscall.c | 2 +- arch/x86/um/asm/elf.h | 42 +++++++------- arch/x86/um/asm/ptrace.h | 34 ++++++++++++ arch/x86/um/asm/ptrace_32.h | 23 -------- arch/x86/um/asm/ptrace_64.h | 26 --------- arch/x86/um/shared/sysdep/ptrace.h | 67 ++++++++++++++++++++-- arch/x86/um/shared/sysdep/ptrace_32.h | 92 +++---------------------------- arch/x86/um/shared/sysdep/ptrace_64.h | 101 ++-------------------------------- arch/x86/um/signal.c | 22 ++++---- arch/x86/um/sysrq_32.c | 8 +-- arch/x86/um/sysrq_64.c | 8 +-- arch/x86/um/tls_32.c | 2 +- 13 files changed, 150 insertions(+), 279 deletions(-) (limited to 'arch/x86') diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 4d9af3172d9f..3a2235e0abc3 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -196,7 +196,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, if (current->thread.forking) { memcpy(&p->thread.regs.regs, &regs->regs, sizeof(p->thread.regs.regs)); - REGS_SET_SYSCALL_RETURN(p->thread.regs.regs.gp, 0); + UPT_SET_SYSCALL_RETURN(&p->thread.regs.regs, 0); if (sp != 0) REGS_SP(p->thread.regs.regs.gp) = sp; diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c index f5173e1ec3ac..05fbeb480e0b 100644 --- a/arch/um/kernel/skas/syscall.c +++ b/arch/um/kernel/skas/syscall.c @@ -34,7 +34,7 @@ void 
handle_syscall(struct uml_pt_regs *r) result = -ENOSYS; else result = EXECUTE_SYSCALL(syscall, regs); - REGS_SET_SYSCALL_RETURN(r->gp, result); + UPT_SET_SYSCALL_RETURN(r, result); syscall_trace(r, 1); } diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h index f3b0633b69a1..0e07adc8cbe4 100644 --- a/arch/x86/um/asm/elf.h +++ b/arch/x86/um/asm/elf.h @@ -34,25 +34,25 @@ #define ELF_ARCH EM_386 #define ELF_PLAT_INIT(regs, load_addr) do { \ - PT_REGS_EBX(regs) = 0; \ - PT_REGS_ECX(regs) = 0; \ - PT_REGS_EDX(regs) = 0; \ - PT_REGS_ESI(regs) = 0; \ - PT_REGS_EDI(regs) = 0; \ - PT_REGS_EBP(regs) = 0; \ - PT_REGS_EAX(regs) = 0; \ + PT_REGS_BX(regs) = 0; \ + PT_REGS_CX(regs) = 0; \ + PT_REGS_DX(regs) = 0; \ + PT_REGS_SI(regs) = 0; \ + PT_REGS_DI(regs) = 0; \ + PT_REGS_BP(regs) = 0; \ + PT_REGS_AX(regs) = 0; \ } while (0) /* Shamelessly stolen from include/asm-i386/elf.h */ #define ELF_CORE_COPY_REGS(pr_reg, regs) do { \ - pr_reg[0] = PT_REGS_EBX(regs); \ - pr_reg[1] = PT_REGS_ECX(regs); \ - pr_reg[2] = PT_REGS_EDX(regs); \ - pr_reg[3] = PT_REGS_ESI(regs); \ - pr_reg[4] = PT_REGS_EDI(regs); \ - pr_reg[5] = PT_REGS_EBP(regs); \ - pr_reg[6] = PT_REGS_EAX(regs); \ + pr_reg[0] = PT_REGS_BX(regs); \ + pr_reg[1] = PT_REGS_CX(regs); \ + pr_reg[2] = PT_REGS_DX(regs); \ + pr_reg[3] = PT_REGS_SI(regs); \ + pr_reg[4] = PT_REGS_DI(regs); \ + pr_reg[5] = PT_REGS_BP(regs); \ + pr_reg[6] = PT_REGS_AX(regs); \ pr_reg[7] = PT_REGS_DS(regs); \ pr_reg[8] = PT_REGS_ES(regs); \ /* fake once used fs and gs selectors? 
*/ \ @@ -130,13 +130,13 @@ do { \ #define ELF_ARCH EM_X86_64 #define ELF_PLAT_INIT(regs, load_addr) do { \ - PT_REGS_RBX(regs) = 0; \ - PT_REGS_RCX(regs) = 0; \ - PT_REGS_RDX(regs) = 0; \ - PT_REGS_RSI(regs) = 0; \ - PT_REGS_RDI(regs) = 0; \ - PT_REGS_RBP(regs) = 0; \ - PT_REGS_RAX(regs) = 0; \ + PT_REGS_BX(regs) = 0; \ + PT_REGS_CX(regs) = 0; \ + PT_REGS_DX(regs) = 0; \ + PT_REGS_SI(regs) = 0; \ + PT_REGS_DI(regs) = 0; \ + PT_REGS_BP(regs) = 0; \ + PT_REGS_AX(regs) = 0; \ PT_REGS_R8(regs) = 0; \ PT_REGS_R9(regs) = 0; \ PT_REGS_R10(regs) = 0; \ diff --git a/arch/x86/um/asm/ptrace.h b/arch/x86/um/asm/ptrace.h index c8aca8c501b0..950dfb7b8417 100644 --- a/arch/x86/um/asm/ptrace.h +++ b/arch/x86/um/asm/ptrace.h @@ -1,5 +1,39 @@ +#ifndef __UM_X86_PTRACE_H +#define __UM_X86_PTRACE_H + #ifdef CONFIG_X86_32 # include "ptrace_32.h" #else # include "ptrace_64.h" #endif + +#define PT_REGS_AX(r) UPT_AX(&(r)->regs) +#define PT_REGS_BX(r) UPT_BX(&(r)->regs) +#define PT_REGS_CX(r) UPT_CX(&(r)->regs) +#define PT_REGS_DX(r) UPT_DX(&(r)->regs) + +#define PT_REGS_SI(r) UPT_SI(&(r)->regs) +#define PT_REGS_DI(r) UPT_DI(&(r)->regs) +#define PT_REGS_BP(r) UPT_BP(&(r)->regs) +#define PT_REGS_EFLAGS(r) UPT_EFLAGS(&(r)->regs) + +#define PT_REGS_CS(r) UPT_CS(&(r)->regs) +#define PT_REGS_SS(r) UPT_SS(&(r)->regs) +#define PT_REGS_DS(r) UPT_DS(&(r)->regs) +#define PT_REGS_ES(r) UPT_ES(&(r)->regs) + +#define PT_REGS_ORIG_SYSCALL(r) PT_REGS_AX(r) +#define PT_REGS_SYSCALL_RET(r) PT_REGS_AX(r) + +#define PT_FIX_EXEC_STACK(sp) do ; while(0) + +#define profile_pc(regs) PT_REGS_IP(regs) + +#define UPT_RESTART_SYSCALL(r) (UPT_IP(r) -= 2) +#define UPT_SET_SYSCALL_RETURN(r, res) (UPT_AX(r) = (res)) + +static inline long regs_return_value(struct uml_pt_regs *regs) +{ + return UPT_AX(regs); +} +#endif /* __UM_X86_PTRACE_H */ diff --git a/arch/x86/um/asm/ptrace_32.h b/arch/x86/um/asm/ptrace_32.h index 5d2a59112537..2cf225351b65 100644 --- a/arch/x86/um/asm/ptrace_32.h +++ b/arch/x86/um/asm/ptrace_32.h @@ 
-11,29 +11,6 @@ #include "linux/compiler.h" #include "asm/ptrace-generic.h" -#define PT_REGS_EAX(r) UPT_EAX(&(r)->regs) -#define PT_REGS_EBX(r) UPT_EBX(&(r)->regs) -#define PT_REGS_ECX(r) UPT_ECX(&(r)->regs) -#define PT_REGS_EDX(r) UPT_EDX(&(r)->regs) -#define PT_REGS_ESI(r) UPT_ESI(&(r)->regs) -#define PT_REGS_EDI(r) UPT_EDI(&(r)->regs) -#define PT_REGS_EBP(r) UPT_EBP(&(r)->regs) - -#define PT_REGS_CS(r) UPT_CS(&(r)->regs) -#define PT_REGS_SS(r) UPT_SS(&(r)->regs) -#define PT_REGS_DS(r) UPT_DS(&(r)->regs) -#define PT_REGS_ES(r) UPT_ES(&(r)->regs) -#define PT_REGS_FS(r) UPT_FS(&(r)->regs) -#define PT_REGS_GS(r) UPT_GS(&(r)->regs) - -#define PT_REGS_EFLAGS(r) UPT_EFLAGS(&(r)->regs) - -#define PT_REGS_ORIG_SYSCALL(r) PT_REGS_EAX(r) -#define PT_REGS_SYSCALL_RET(r) PT_REGS_EAX(r) -#define PT_FIX_EXEC_STACK(sp) do ; while(0) - -#define profile_pc(regs) PT_REGS_IP(regs) - #define user_mode(r) UPT_IS_USER(&(r)->regs) /* diff --git a/arch/x86/um/asm/ptrace_64.h b/arch/x86/um/asm/ptrace_64.h index 706a0d80545c..ea7bff394320 100644 --- a/arch/x86/um/asm/ptrace_64.h +++ b/arch/x86/um/asm/ptrace_64.h @@ -15,13 +15,6 @@ #define HOST_AUDIT_ARCH AUDIT_ARCH_X86_64 -#define PT_REGS_RBX(r) UPT_RBX(&(r)->regs) -#define PT_REGS_RCX(r) UPT_RCX(&(r)->regs) -#define PT_REGS_RDX(r) UPT_RDX(&(r)->regs) -#define PT_REGS_RSI(r) UPT_RSI(&(r)->regs) -#define PT_REGS_RDI(r) UPT_RDI(&(r)->regs) -#define PT_REGS_RBP(r) UPT_RBP(&(r)->regs) -#define PT_REGS_RAX(r) UPT_RAX(&(r)->regs) #define PT_REGS_R8(r) UPT_R8(&(r)->regs) #define PT_REGS_R9(r) UPT_R9(&(r)->regs) #define PT_REGS_R10(r) UPT_R10(&(r)->regs) @@ -31,27 +24,8 @@ #define PT_REGS_R14(r) UPT_R14(&(r)->regs) #define PT_REGS_R15(r) UPT_R15(&(r)->regs) -#define PT_REGS_FS(r) UPT_FS(&(r)->regs) -#define PT_REGS_GS(r) UPT_GS(&(r)->regs) -#define PT_REGS_DS(r) UPT_DS(&(r)->regs) -#define PT_REGS_ES(r) UPT_ES(&(r)->regs) -#define PT_REGS_SS(r) UPT_SS(&(r)->regs) -#define PT_REGS_CS(r) UPT_CS(&(r)->regs) - -#define PT_REGS_ORIG_RAX(r) 
UPT_ORIG_RAX(&(r)->regs) -#define PT_REGS_RIP(r) UPT_IP(&(r)->regs) -#define PT_REGS_SP(r) UPT_SP(&(r)->regs) - -#define PT_REGS_EFLAGS(r) UPT_EFLAGS(&(r)->regs) - /* XXX */ #define user_mode(r) UPT_IS_USER(&(r)->regs) -#define PT_REGS_ORIG_SYSCALL(r) PT_REGS_RAX(r) -#define PT_REGS_SYSCALL_RET(r) PT_REGS_RAX(r) - -#define PT_FIX_EXEC_STACK(sp) do ; while(0) - -#define profile_pc(regs) PT_REGS_IP(regs) struct user_desc; diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h index 2bbe1ec2d96a..6ce2d76eb908 100644 --- a/arch/x86/um/shared/sysdep/ptrace.h +++ b/arch/x86/um/shared/sysdep/ptrace.h @@ -1,15 +1,74 @@ #ifndef __SYSDEP_X86_PTRACE_H #define __SYSDEP_X86_PTRACE_H +#include +#include "sysdep/faultinfo.h" + +#define MAX_REG_OFFSET (UM_FRAME_SIZE) +#define MAX_REG_NR ((MAX_REG_OFFSET) / sizeof(unsigned long)) + +#define REGS_IP(r) ((r)[HOST_IP]) +#define REGS_SP(r) ((r)[HOST_SP]) +#define REGS_EFLAGS(r) ((r)[HOST_EFLAGS]) +#define REGS_AX(r) ((r)[HOST_AX]) +#define REGS_BX(r) ((r)[HOST_BX]) +#define REGS_CX(r) ((r)[HOST_CX]) +#define REGS_DX(r) ((r)[HOST_DX]) +#define REGS_SI(r) ((r)[HOST_SI]) +#define REGS_DI(r) ((r)[HOST_DI]) +#define REGS_BP(r) ((r)[HOST_BP]) +#define REGS_CS(r) ((r)[HOST_CS]) +#define REGS_SS(r) ((r)[HOST_SS]) +#define REGS_DS(r) ((r)[HOST_DS]) +#define REGS_ES(r) ((r)[HOST_ES]) + +#define UPT_IP(r) REGS_IP((r)->gp) +#define UPT_SP(r) REGS_SP((r)->gp) +#define UPT_EFLAGS(r) REGS_EFLAGS((r)->gp) +#define UPT_AX(r) REGS_AX((r)->gp) +#define UPT_BX(r) REGS_BX((r)->gp) +#define UPT_CX(r) REGS_CX((r)->gp) +#define UPT_DX(r) REGS_DX((r)->gp) +#define UPT_SI(r) REGS_SI((r)->gp) +#define UPT_DI(r) REGS_DI((r)->gp) +#define UPT_BP(r) REGS_BP((r)->gp) +#define UPT_CS(r) REGS_CS((r)->gp) +#define UPT_SS(r) REGS_SS((r)->gp) +#define UPT_DS(r) REGS_DS((r)->gp) +#define UPT_ES(r) REGS_ES((r)->gp) + #ifdef __i386__ #include "ptrace_32.h" #else #include "ptrace_64.h" #endif -static inline long regs_return_value(struct 
uml_pt_regs *regs) -{ - return UPT_SYSCALL_RET(regs); -} +struct syscall_args { + unsigned long args[6]; +}; + +#define SYSCALL_ARGS(r) ((struct syscall_args) \ + { .args = { UPT_SYSCALL_ARG1(r), \ + UPT_SYSCALL_ARG2(r), \ + UPT_SYSCALL_ARG3(r), \ + UPT_SYSCALL_ARG4(r), \ + UPT_SYSCALL_ARG5(r), \ + UPT_SYSCALL_ARG6(r) } } ) + +struct uml_pt_regs { + unsigned long gp[MAX_REG_NR]; + unsigned long fp[MAX_FP_NR]; + struct faultinfo faultinfo; + long syscall; + int is_user; +}; + +#define EMPTY_UML_PT_REGS { } + +#define UPT_SYSCALL_NR(r) ((r)->syscall) +#define UPT_FAULTINFO(r) (&(r)->faultinfo) +#define UPT_IS_USER(r) ((r)->is_user) + +extern int user_context(unsigned long sp); #endif /* __SYSDEP_X86_PTRACE_H */ diff --git a/arch/x86/um/shared/sysdep/ptrace_32.h b/arch/x86/um/shared/sysdep/ptrace_32.h index befd1df32ed0..b94a108de1dc 100644 --- a/arch/x86/um/shared/sysdep/ptrace_32.h +++ b/arch/x86/um/shared/sysdep/ptrace_32.h @@ -6,11 +6,7 @@ #ifndef __SYSDEP_I386_PTRACE_H #define __SYSDEP_I386_PTRACE_H -#include -#include "sysdep/faultinfo.h" - -#define MAX_REG_NR (UM_FRAME_SIZE / sizeof(unsigned long)) -#define MAX_REG_OFFSET (UM_FRAME_SIZE) +#define MAX_FP_NR HOST_FPX_SIZE static inline void update_debugregs(int seq) {} @@ -24,90 +20,16 @@ void set_using_sysemu(int value); int get_using_sysemu(void); extern int sysemu_supported; -#define REGS_IP(r) ((r)[HOST_IP]) -#define REGS_SP(r) ((r)[HOST_SP]) -#define REGS_EFLAGS(r) ((r)[HOST_EFLAGS]) -#define REGS_EAX(r) ((r)[HOST_AX]) -#define REGS_EBX(r) ((r)[HOST_BX]) -#define REGS_ECX(r) ((r)[HOST_CX]) -#define REGS_EDX(r) ((r)[HOST_DX]) -#define REGS_ESI(r) ((r)[HOST_SI]) -#define REGS_EDI(r) ((r)[HOST_DI]) -#define REGS_EBP(r) ((r)[HOST_BP]) -#define REGS_CS(r) ((r)[HOST_CS]) -#define REGS_SS(r) ((r)[HOST_SS]) -#define REGS_DS(r) ((r)[HOST_DS]) -#define REGS_ES(r) ((r)[HOST_ES]) -#define REGS_FS(r) ((r)[HOST_FS]) -#define REGS_GS(r) ((r)[HOST_GS]) - -#define REGS_SET_SYSCALL_RETURN(r, res) REGS_EAX(r) = (res) - 
-#define IP_RESTART_SYSCALL(ip) ((ip) -= 2) -#define REGS_RESTART_SYSCALL(r) IP_RESTART_SYSCALL(REGS_IP(r)) - #ifndef PTRACE_SYSEMU_SINGLESTEP #define PTRACE_SYSEMU_SINGLESTEP 32 #endif -struct uml_pt_regs { - unsigned long gp[MAX_REG_NR]; - unsigned long fp[HOST_FPX_SIZE]; - struct faultinfo faultinfo; - long syscall; - int is_user; -}; - -#define EMPTY_UML_PT_REGS { } - -#define UPT_IP(r) REGS_IP((r)->gp) -#define UPT_SP(r) REGS_SP((r)->gp) -#define UPT_EFLAGS(r) REGS_EFLAGS((r)->gp) -#define UPT_EAX(r) REGS_EAX((r)->gp) -#define UPT_EBX(r) REGS_EBX((r)->gp) -#define UPT_ECX(r) REGS_ECX((r)->gp) -#define UPT_EDX(r) REGS_EDX((r)->gp) -#define UPT_ESI(r) REGS_ESI((r)->gp) -#define UPT_EDI(r) REGS_EDI((r)->gp) -#define UPT_EBP(r) REGS_EBP((r)->gp) -#define UPT_ORIG_EAX(r) ((r)->syscall) -#define UPT_CS(r) REGS_CS((r)->gp) -#define UPT_SS(r) REGS_SS((r)->gp) -#define UPT_DS(r) REGS_DS((r)->gp) -#define UPT_ES(r) REGS_ES((r)->gp) -#define UPT_FS(r) REGS_FS((r)->gp) -#define UPT_GS(r) REGS_GS((r)->gp) - -#define UPT_SYSCALL_ARG1(r) UPT_EBX(r) -#define UPT_SYSCALL_ARG2(r) UPT_ECX(r) -#define UPT_SYSCALL_ARG3(r) UPT_EDX(r) -#define UPT_SYSCALL_ARG4(r) UPT_ESI(r) -#define UPT_SYSCALL_ARG5(r) UPT_EDI(r) -#define UPT_SYSCALL_ARG6(r) UPT_EBP(r) - -extern int user_context(unsigned long sp); - -#define UPT_IS_USER(r) ((r)->is_user) - -struct syscall_args { - unsigned long args[6]; -}; - -#define SYSCALL_ARGS(r) ((struct syscall_args) \ - { .args = { UPT_SYSCALL_ARG1(r), \ - UPT_SYSCALL_ARG2(r), \ - UPT_SYSCALL_ARG3(r), \ - UPT_SYSCALL_ARG4(r), \ - UPT_SYSCALL_ARG5(r), \ - UPT_SYSCALL_ARG6(r) } } ) - -#define UPT_RESTART_SYSCALL(r) REGS_RESTART_SYSCALL((r)->gp) - -#define UPT_ORIG_SYSCALL(r) UPT_EAX(r) -#define UPT_SYSCALL_NR(r) UPT_ORIG_EAX(r) -#define UPT_SYSCALL_RET(r) UPT_EAX(r) - -#define UPT_FAULTINFO(r) (&(r)->faultinfo) +#define UPT_SYSCALL_ARG1(r) UPT_BX(r) +#define UPT_SYSCALL_ARG2(r) UPT_CX(r) +#define UPT_SYSCALL_ARG3(r) UPT_DX(r) +#define UPT_SYSCALL_ARG4(r) 
UPT_SI(r) +#define UPT_SYSCALL_ARG5(r) UPT_DI(r) +#define UPT_SYSCALL_ARG6(r) UPT_BP(r) extern void arch_init_registers(int pid); diff --git a/arch/x86/um/shared/sysdep/ptrace_64.h b/arch/x86/um/shared/sysdep/ptrace_64.h index 031edc53ac57..919789f1071e 100644 --- a/arch/x86/um/shared/sysdep/ptrace_64.h +++ b/arch/x86/um/shared/sysdep/ptrace_64.h @@ -8,22 +8,8 @@ #ifndef __SYSDEP_X86_64_PTRACE_H #define __SYSDEP_X86_64_PTRACE_H -#include -#include "sysdep/faultinfo.h" +#define MAX_FP_NR HOST_FP_SIZE -#define MAX_REG_OFFSET (UM_FRAME_SIZE) -#define MAX_REG_NR ((MAX_REG_OFFSET) / sizeof(unsigned long)) - -#define REGS_IP(r) ((r)[HOST_IP]) -#define REGS_SP(r) ((r)[HOST_SP]) - -#define REGS_RBX(r) ((r)[HOST_BX]) -#define REGS_RCX(r) ((r)[HOST_CX]) -#define REGS_RDX(r) ((r)[HOST_DX]) -#define REGS_RSI(r) ((r)[HOST_SI]) -#define REGS_RDI(r) ((r)[HOST_DI]) -#define REGS_RBP(r) ((r)[HOST_BP]) -#define REGS_RAX(r) ((r)[HOST_AX]) #define REGS_R8(r) ((r)[HOST_R8]) #define REGS_R9(r) ((r)[HOST_R9]) #define REGS_R10(r) ((r)[HOST_R10]) @@ -32,9 +18,6 @@ #define REGS_R13(r) ((r)[HOST_R13]) #define REGS_R14(r) ((r)[HOST_R14]) #define REGS_R15(r) ((r)[HOST_R15]) -#define REGS_CS(r) ((r)[HOST_CS]) -#define REGS_EFLAGS(r) ((r)[HOST_EFLAGS]) -#define REGS_SS(r) ((r)[HOST_SS]) #define HOST_FS_BASE 21 #define HOST_GS_BASE 22 @@ -58,45 +41,6 @@ #define GS (HOST_GS * sizeof(long)) #endif -#define REGS_FS_BASE(r) ((r)[HOST_FS_BASE]) -#define REGS_GS_BASE(r) ((r)[HOST_GS_BASE]) -#define REGS_DS(r) ((r)[HOST_DS]) -#define REGS_ES(r) ((r)[HOST_ES]) -#define REGS_FS(r) ((r)[HOST_FS]) -#define REGS_GS(r) ((r)[HOST_GS]) - -#define REGS_ORIG_RAX(r) ((r)[HOST_ORIG_AX]) - -#define REGS_SET_SYSCALL_RETURN(r, res) REGS_RAX(r) = (res) - -#define IP_RESTART_SYSCALL(ip) ((ip) -= 2) -#define REGS_RESTART_SYSCALL(r) IP_RESTART_SYSCALL(REGS_IP(r)) - -#define REGS_FAULT_ADDR(r) ((r)->fault_addr) - -#define REGS_FAULT_WRITE(r) FAULT_WRITE((r)->fault_type) - -#define REGS_TRAP(r) ((r)->trap_type) - -#define 
REGS_ERR(r) ((r)->fault_type) - -struct uml_pt_regs { - unsigned long gp[MAX_REG_NR]; - unsigned long fp[HOST_FP_SIZE]; - struct faultinfo faultinfo; - long syscall; - int is_user; -}; - -#define EMPTY_UML_PT_REGS { } - -#define UPT_RBX(r) REGS_RBX((r)->gp) -#define UPT_RCX(r) REGS_RCX((r)->gp) -#define UPT_RDX(r) REGS_RDX((r)->gp) -#define UPT_RSI(r) REGS_RSI((r)->gp) -#define UPT_RDI(r) REGS_RDI((r)->gp) -#define UPT_RBP(r) REGS_RBP((r)->gp) -#define UPT_RAX(r) REGS_RAX((r)->gp) #define UPT_R8(r) REGS_R8((r)->gp) #define UPT_R9(r) REGS_R9((r)->gp) #define UPT_R10(r) REGS_R10((r)->gp) @@ -105,51 +49,14 @@ struct uml_pt_regs { #define UPT_R13(r) REGS_R13((r)->gp) #define UPT_R14(r) REGS_R14((r)->gp) #define UPT_R15(r) REGS_R15((r)->gp) -#define UPT_CS(r) REGS_CS((r)->gp) -#define UPT_FS_BASE(r) REGS_FS_BASE((r)->gp) -#define UPT_FS(r) REGS_FS((r)->gp) -#define UPT_GS_BASE(r) REGS_GS_BASE((r)->gp) -#define UPT_GS(r) REGS_GS((r)->gp) -#define UPT_DS(r) REGS_DS((r)->gp) -#define UPT_ES(r) REGS_ES((r)->gp) -#define UPT_CS(r) REGS_CS((r)->gp) -#define UPT_SS(r) REGS_SS((r)->gp) -#define UPT_ORIG_RAX(r) REGS_ORIG_RAX((r)->gp) - -#define UPT_IP(r) REGS_IP((r)->gp) -#define UPT_SP(r) REGS_SP((r)->gp) - -#define UPT_EFLAGS(r) REGS_EFLAGS((r)->gp) -#define UPT_SYSCALL_NR(r) ((r)->syscall) -#define UPT_SYSCALL_RET(r) UPT_RAX(r) - -extern int user_context(unsigned long sp); -#define UPT_IS_USER(r) ((r)->is_user) - -#define UPT_SYSCALL_ARG1(r) UPT_RDI(r) -#define UPT_SYSCALL_ARG2(r) UPT_RSI(r) -#define UPT_SYSCALL_ARG3(r) UPT_RDX(r) +#define UPT_SYSCALL_ARG1(r) UPT_DI(r) +#define UPT_SYSCALL_ARG2(r) UPT_SI(r) +#define UPT_SYSCALL_ARG3(r) UPT_DX(r) #define UPT_SYSCALL_ARG4(r) UPT_R10(r) #define UPT_SYSCALL_ARG5(r) UPT_R8(r) #define UPT_SYSCALL_ARG6(r) UPT_R9(r) -struct syscall_args { - unsigned long args[6]; -}; - -#define SYSCALL_ARGS(r) ((struct syscall_args) \ - { .args = { UPT_SYSCALL_ARG1(r), \ - UPT_SYSCALL_ARG2(r), \ - UPT_SYSCALL_ARG3(r), \ - UPT_SYSCALL_ARG4(r), \ - 
UPT_SYSCALL_ARG5(r), \ - UPT_SYSCALL_ARG6(r) } } ) - -#define UPT_RESTART_SYSCALL(r) REGS_RESTART_SYSCALL((r)->gp) - -#define UPT_FAULTINFO(r) (&(r)->faultinfo) - static inline void arch_init_registers(int pid) { } diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 72eafa6c6a52..35b283d3df0c 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -413,9 +413,9 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig, PT_REGS_SP(regs) = (unsigned long) frame; PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; - PT_REGS_EAX(regs) = (unsigned long) sig; - PT_REGS_EDX(regs) = (unsigned long) 0; - PT_REGS_ECX(regs) = (unsigned long) 0; + PT_REGS_AX(regs) = (unsigned long) sig; + PT_REGS_DX(regs) = (unsigned long) 0; + PT_REGS_CX(regs) = (unsigned long) 0; if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) ptrace_notify(SIGTRAP); @@ -463,9 +463,9 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, PT_REGS_SP(regs) = (unsigned long) frame; PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; - PT_REGS_EAX(regs) = (unsigned long) sig; - PT_REGS_EDX(regs) = (unsigned long) &frame->info; - PT_REGS_ECX(regs) = (unsigned long) &frame->uc; + PT_REGS_AX(regs) = (unsigned long) sig; + PT_REGS_DX(regs) = (unsigned long) &frame->info; + PT_REGS_CX(regs) = (unsigned long) &frame->uc; if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) ptrace_notify(SIGTRAP); @@ -573,17 +573,17 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, } PT_REGS_SP(regs) = (unsigned long) frame; - PT_REGS_RDI(regs) = sig; + PT_REGS_DI(regs) = sig; /* In case the signal handler was declared without prototypes */ - PT_REGS_RAX(regs) = 0; + PT_REGS_AX(regs) = 0; /* * This also works for non SA_SIGINFO handlers because they expect the * next argument after the signal number on the stack. 
*/ - PT_REGS_RSI(regs) = (unsigned long) &frame->info; - PT_REGS_RDX(regs) = (unsigned long) &frame->uc; - PT_REGS_RIP(regs) = (unsigned long) ka->sa.sa_handler; + PT_REGS_SI(regs) = (unsigned long) &frame->info; + PT_REGS_DX(regs) = (unsigned long) &frame->uc; + PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; out: return err; } diff --git a/arch/x86/um/sysrq_32.c b/arch/x86/um/sysrq_32.c index 171b3e9dc867..2d5cc51e9bef 100644 --- a/arch/x86/um/sysrq_32.c +++ b/arch/x86/um/sysrq_32.c @@ -23,12 +23,10 @@ void show_regs(struct pt_regs *regs) printk(" EFLAGS: %08lx\n %s\n", PT_REGS_EFLAGS(regs), print_tainted()); printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", - PT_REGS_EAX(regs), PT_REGS_EBX(regs), - PT_REGS_ECX(regs), - PT_REGS_EDX(regs)); + PT_REGS_AX(regs), PT_REGS_BX(regs), + PT_REGS_CX(regs), PT_REGS_DX(regs)); printk("ESI: %08lx EDI: %08lx EBP: %08lx", - PT_REGS_ESI(regs), PT_REGS_EDI(regs), - PT_REGS_EBP(regs)); + PT_REGS_SI(regs), PT_REGS_DI(regs), PT_REGS_BP(regs)); printk(" DS: %04lx ES: %04lx\n", 0xffff & PT_REGS_DS(regs), 0xffff & PT_REGS_ES(regs)); diff --git a/arch/x86/um/sysrq_64.c b/arch/x86/um/sysrq_64.c index e8913436d7dc..08258f179969 100644 --- a/arch/x86/um/sysrq_64.c +++ b/arch/x86/um/sysrq_64.c @@ -19,15 +19,15 @@ void __show_regs(struct pt_regs *regs) printk(KERN_INFO "Pid: %d, comm: %.20s %s %s\n", task_pid_nr(current), current->comm, print_tainted(), init_utsname()->release); printk(KERN_INFO "RIP: %04lx:[<%016lx>]\n", PT_REGS_CS(regs) & 0xffff, - PT_REGS_RIP(regs)); + PT_REGS_IP(regs)); printk(KERN_INFO "RSP: %016lx EFLAGS: %08lx\n", PT_REGS_SP(regs), PT_REGS_EFLAGS(regs)); printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", - PT_REGS_RAX(regs), PT_REGS_RBX(regs), PT_REGS_RCX(regs)); + PT_REGS_AX(regs), PT_REGS_BX(regs), PT_REGS_CX(regs)); printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", - PT_REGS_RDX(regs), PT_REGS_RSI(regs), PT_REGS_RDI(regs)); + PT_REGS_DX(regs), PT_REGS_SI(regs), PT_REGS_DI(regs)); 
printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", - PT_REGS_RBP(regs), PT_REGS_R8(regs), PT_REGS_R9(regs)); + PT_REGS_BP(regs), PT_REGS_R8(regs), PT_REGS_R9(regs)); printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", PT_REGS_R10(regs), PT_REGS_R11(regs), PT_REGS_R12(regs)); printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c index c6c7131e563b..baba84f8ecb8 100644 --- a/arch/x86/um/tls_32.c +++ b/arch/x86/um/tls_32.c @@ -219,7 +219,7 @@ int arch_copy_tls(struct task_struct *new) int idx, ret = -EFAULT; if (copy_from_user(&info, - (void __user *) UPT_ESI(&new->thread.regs.regs), + (void __user *) UPT_SI(&new->thread.regs.regs), sizeof(info))) goto out; -- cgit v1.2.3 From 0088b6ec8fa4773dd56b861bfc1630f4c3c069db Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 22 Apr 2012 03:28:20 -0400 Subject: um: stub_rt_sigsuspend isn't needed these days anymore Signed-off-by: Al Viro --- arch/x86/um/sys_call_table_64.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 9924776f4265..170bd926a69c 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -31,7 +31,6 @@ #define stub_fork sys_fork #define stub_vfork sys_vfork #define stub_execve sys_execve -#define stub_rt_sigsuspend sys_rt_sigsuspend #define stub_sigaltstack sys_sigaltstack #define stub_rt_sigreturn sys_rt_sigreturn -- cgit v1.2.3 From ffc51be82b17e1c515fdb2dd5b92605798216b30 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 22 Apr 2012 16:34:27 -0400 Subject: um: missing checks of __put_user()/__get_user() return values Signed-off-by: Al Viro --- arch/x86/um/signal.c | 4 ++-- arch/x86/um/syscalls_32.c | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 35b283d3df0c..bb0fb03b9f85 100644 --- a/arch/x86/um/signal.c +++ 
b/arch/x86/um/signal.c @@ -544,8 +544,8 @@ int setup_signal_stack_si(unsigned long stack_top, int sig, set->sig[0]); err |= __put_user(&frame->fpstate, &frame->uc.uc_mcontext.fpstate); if (sizeof(*set) == 16) { - __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); - __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); + err |= __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); + err |= __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); } else err |= __copy_to_user(&frame->uc.uc_sigmask, set, diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c index 70ca357393b8..b853e8600b9d 100644 --- a/arch/x86/um/syscalls_32.c +++ b/arch/x86/um/syscalls_32.c @@ -44,10 +44,10 @@ long sys_sigaction(int sig, const struct old_sigaction __user *act, old_sigset_t mask; if (!access_ok(VERIFY_READ, act, sizeof(*act)) || __get_user(new_ka.sa.sa_handler, &act->sa_handler) || - __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) + __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) || + __get_user(new_ka.sa.sa_flags, &act->sa_flags) || + __get_user(mask, &act->sa_mask)) return -EFAULT; - __get_user(new_ka.sa.sa_flags, &act->sa_flags); - __get_user(mask, &act->sa_mask); siginitset(&new_ka.sa.sa_mask, mask); } @@ -56,10 +56,10 @@ long sys_sigaction(int sig, const struct old_sigaction __user *act, if (!ret && oact) { if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || - __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) + __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) || + __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || + __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) return -EFAULT; - __put_user(old_ka.sa.sa_flags, &oact->sa_flags); - __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); } return ret; -- cgit v1.2.3 From 764e0da14fd7ac2d259d98d34ece0a87d32306c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 21 May 2012 23:16:18 +0200 Subject: timers: Fixup the Kconfig 
consolidation fallout Sigh, I failed to check which architecture Kconfig files actually include the core Kconfig file. There are a few which did not. So we broke them. Instead of adding the includes to those, we are better off moving the include to init/Kconfig like we did already with irqs and others. This does not change anything for the architectures using the old style periodic timer mode. It just solves the build wreckage there. For those architectures which use the clock events infrastructure it moves the include of the core Kconfig file to "General setup" which is a way more logical place than having it at random locations specified by the architecture specific Kconfigs. Reported-by: Ingo Molnar Cc: Anna-Maria Gleixner Signed-off-by: Thomas Gleixner --- arch/arm/Kconfig | 2 -- arch/avr32/Kconfig | 2 -- arch/blackfin/Kconfig | 2 -- arch/c6x/Kconfig | 1 - arch/h8300/Kconfig.cpu | 2 -- arch/hexagon/Kconfig | 1 - arch/m68k/Kconfig | 4 --- arch/microblaze/Kconfig | 2 -- arch/mips/Kconfig | 2 -- arch/mn10300/Kconfig | 1 - arch/openrisc/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/s390/Kconfig | 2 -- arch/score/Kconfig | 1 - arch/sh/Kconfig | 3 -- arch/sparc/Kconfig | 2 -- arch/tile/Kconfig | 2 -- arch/um/Kconfig.um | 1 - arch/unicore32/Kconfig | 2 -- arch/x86/Kconfig | 2 -- init/Kconfig | 1 + kernel/time/Kconfig | 73 +++++++++++++++++++++++++++---------------------- 22 files changed, 42 insertions(+), 68 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index feccc1d37ecf..c1e5f07fab93 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1459,8 +1459,6 @@ endmenu menu "Kernel Features" -source "kernel/time/Kconfig" - config HAVE_SMP bool help diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index 0bd13ab9f43b..f8bc2d27d148 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -61,8 +61,6 @@ source "kernel/Kconfig.freezer" menu "System Type and features" -source "kernel/time/Kconfig" - config SUBARCH_AVR32B bool 
config MMU diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index bc21de2e8fed..f7897eefa630 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -631,8 +631,6 @@ config GPTMR0_CLOCKSOURCE depends on !TICKSOURCE_GPTMR0 endmenu -source kernel/time/Kconfig - comment "Misc" choice diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index 30c04c658b9e..9d446eff2c04 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -132,7 +132,6 @@ source "mm/Kconfig" source "kernel/Kconfig.preempt" source "kernel/Kconfig.hz" -source "kernel/time/Kconfig" endmenu diff --git a/arch/h8300/Kconfig.cpu b/arch/h8300/Kconfig.cpu index 15c22286ae79..321f3922728b 100644 --- a/arch/h8300/Kconfig.cpu +++ b/arch/h8300/Kconfig.cpu @@ -1,7 +1,5 @@ menu "Processor type and features" -source "kernel/time/Kconfig" - choice prompt "H8/300 platform" default H8300H_GENERIC diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 7727ed9d2bf3..35f6c32d040c 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -183,7 +183,6 @@ endchoice source "mm/Kconfig" source "kernel/Kconfig.hz" -source "kernel/time/Kconfig" config GENERIC_GPIO bool "Generic GPIO support" diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 2f4b0f0610d6..cac5b6be572a 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -106,10 +106,6 @@ if COLDFIRE source "kernel/Kconfig.preempt" endif -if !MMU || COLDFIRE -source "kernel/time/Kconfig" -endif - source "mm/Kconfig" endmenu diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 3e786ac9a655..83460468998d 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -74,8 +74,6 @@ source "arch/microblaze/platform/Kconfig.platform" menu "Processor type and features" -source "kernel/time/Kconfig" - source "kernel/Kconfig.preempt" source "kernel/Kconfig.hz" diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index c9c330bc4e76..b65a730cba75 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2205,8 +2205,6 @@ 
config NR_CPUS performance should round up your number of processors to the next power of two. -source "kernel/time/Kconfig" - # # Timer Interrupt Frequency Configuration # diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index 7f78057af2f5..687f9b4a2ed6 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -226,7 +226,6 @@ config MN10300_USING_JTAG single-stepping, which are taken over completely by the JTAG unit. source "kernel/Kconfig.hz" -source "kernel/time/Kconfig" config MN10300_RTC bool "Using MN10300 RTC" diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index be04485431fe..70653039e79b 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -106,7 +106,6 @@ config OPENRISC_HAVE_INST_DIV endmenu -source "kernel/time/Kconfig" source kernel/Kconfig.hz source kernel/Kconfig.preempt source "mm/Kconfig" diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 901215f7a2f2..d47cf7ffa792 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -278,7 +278,6 @@ config HIGHMEM bool "High memory support" depends on PPC32 -source kernel/time/Kconfig source kernel/Kconfig.hz source kernel/Kconfig.preempt source "fs/Kconfig.binfmt" diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index f9edb9303a7e..d0325d9ae21f 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -131,8 +131,6 @@ menu "Base setup" comment "Processor type and features" -source "kernel/time/Kconfig" - config 64BIT def_bool y prompt "64 bit kernel" diff --git a/arch/score/Kconfig b/arch/score/Kconfig index f5d3b3237419..ba0f412920be 100644 --- a/arch/score/Kconfig +++ b/arch/score/Kconfig @@ -66,7 +66,6 @@ config MEMORY_START hex default 0xa0000000 -source "kernel/time/Kconfig" source "kernel/Kconfig.hz" source "kernel/Kconfig.preempt" diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index cffd8b0082d5..820dfe3c7b69 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -577,9 +577,6 @@ config SH_CLK_CPG_LEGACY depends on SH_CLK_CPG def_bool y if 
!CPU_SUBTYPE_SH7785 && !ARCH_SHMOBILE && \ !CPU_SHX3 && !CPU_SUBTYPE_SH7757 - -source "kernel/time/Kconfig" - endmenu menu "CPU Frequency scaling" diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 33399d3d90bc..b5a035a5c53a 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -266,8 +266,6 @@ config HOTPLUG_CPU can be controlled through /sys/devices/system/cpu/cpu#. Say N if you want to disable CPU hotplug. -source "kernel/time/Kconfig" - if SPARC64 source "drivers/cpufreq/Kconfig" diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index b56772cac5d2..4eec3a1a72c0 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -136,8 +136,6 @@ config NR_CPUS smaller kernel memory footprint results from using a smaller value on chips with fewer tiles. -source "kernel/time/Kconfig" - source "kernel/Kconfig.hz" config KEXEC diff --git a/arch/um/Kconfig.um b/arch/um/Kconfig.um index 70fd690964e4..bf87f25eb2de 100644 --- a/arch/um/Kconfig.um +++ b/arch/um/Kconfig.um @@ -10,7 +10,6 @@ config STATIC_LINK 2.75G) for UML. source "mm/Kconfig" -source "kernel/time/Kconfig" config LD_SCRIPT_STATIC bool diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index a25ca7606bea..47ad5210606f 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -143,8 +143,6 @@ endmenu menu "Kernel Features" -source "kernel/time/Kconfig" - source "kernel/Kconfig.preempt" source "kernel/Kconfig.hz" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3b0a9217836a..1b1e0493ef7f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -241,8 +241,6 @@ config ZONE_DMA If unsure, say Y. -source "kernel/time/Kconfig" - config SMP bool "Symmetric multi-processing support" ---help--- diff --git a/init/Kconfig b/init/Kconfig index 6cfd71d06463..528a0c4111cc 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -387,6 +387,7 @@ config AUDIT_LOGINUID_IMMUTABLE but may not be backwards compatible with older init systems. 
source "kernel/irq/Kconfig" +source "kernel/time/Kconfig" menu "RCU Subsystem" diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f6ebc4ff702a..fd42bd452b75 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -2,38 +2,6 @@ # Timer subsystem related configuration options # -# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is -# only related to the tick functionality. Oneshot clockevent devices -# are supported independ of this. -config TICK_ONESHOT - bool - -config NO_HZ - bool "Tickless System (Dynamic Ticks)" - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS - select TICK_ONESHOT - help - This option enables a tickless system: timer interrupts will - only trigger on an as-needed basis both when the system is - busy and when the system is idle. - -config HIGH_RES_TIMERS - bool "High Resolution Timer Support" - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS - select TICK_ONESHOT - help - This option enables high resolution timer support. If your - hardware is not capable then this option only increases - the size of the kernel image. - -config GENERIC_CLOCKEVENTS_BUILD - bool - default y - depends on GENERIC_CLOCKEVENTS - -config GENERIC_CLOCKEVENTS_MIN_ADJUST - bool - # Options selectable by arch Kconfig # Watchdog function for clocksources to detect instabilities @@ -60,11 +28,52 @@ config ARCH_USES_GETTIMEOFFSET config GENERIC_CLOCKEVENTS bool +# Migration helper. Builds, but does not invoke +config GENERIC_CLOCKEVENTS_BUILD + bool + default y + depends on GENERIC_CLOCKEVENTS + # Clockevents broadcasting infrastructure config GENERIC_CLOCKEVENTS_BROADCAST bool depends on GENERIC_CLOCKEVENTS +# Automatically adjust the min. reprogramming time for +# clock event device +config GENERIC_CLOCKEVENTS_MIN_ADJUST + bool + # Generic update of CMOS clock config GENERIC_CMOS_UPDATE bool + +if GENERIC_CLOCKEVENTS +menu "Timers subsystem" + +# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. 
This is +# only related to the tick functionality. Oneshot clockevent devices +# are supported independ of this. +config TICK_ONESHOT + bool + +config NO_HZ + bool "Tickless System (Dynamic Ticks)" + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + help + This option enables a tickless system: timer interrupts will + only trigger on an as-needed basis both when the system is + busy and when the system is idle. + +config HIGH_RES_TIMERS + bool "High Resolution Timer Support" + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + help + This option enables high resolution timer support. If your + hardware is not capable then this option only increases + the size of the kernel image. + +endmenu +endif -- cgit v1.2.3 From 68f3f16d9ad0f1e28ab3fd0001ab5798c41f15a3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 21 May 2012 21:42:32 -0400 Subject: new helper: sigsuspend() guts of saved_sigmask-based sigsuspend/rt_sigsuspend. Takes kernel sigset_t *. Open-coded instances replaced with calling it. 
Signed-off-by: Al Viro --- arch/alpha/kernel/signal.c | 11 +---------- arch/arm/kernel/signal.c | 11 +---------- arch/cris/arch-v10/kernel/signal.c | 16 ++++------------ arch/cris/arch-v32/kernel/signal.c | 16 ++++------------ arch/frv/kernel/signal.c | 14 +++----------- arch/m68k/kernel/signal.c | 15 +++------------ arch/mips/kernel/signal.c | 20 ++------------------ arch/mips/kernel/signal32.c | 20 ++------------------ arch/mips/kernel/signal_n32.c | 10 +--------- arch/mn10300/kernel/signal.c | 14 +++----------- arch/powerpc/kernel/signal_32.c | 11 +---------- arch/s390/kernel/signal.c | 9 +-------- arch/sh/kernel/signal_32.c | 12 +----------- arch/sparc/kernel/signal_32.c | 12 +----------- arch/sparc/kernel/signal_64.c | 13 +------------ arch/um/kernel/signal.c | 9 +-------- arch/x86/ia32/ia32_signal.c | 12 +----------- arch/x86/kernel/signal.c | 12 +----------- include/linux/signal.h | 1 + kernel/compat.c | 10 +--------- kernel/signal.c | 25 ++++++++++++++++--------- 21 files changed, 50 insertions(+), 223 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index 35f2ef44de12..74b05e6ed441 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -121,17 +121,8 @@ SYSCALL_DEFINE5(rt_sigaction, int, sig, const struct sigaction __user *, act, SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask) { sigset_t blocked; - - current->saved_sigmask = current->blocked; - - mask &= _BLOCKABLE; siginitset(&blocked, mask); - set_current_blocked(&blocked); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_thread_flag(TIF_RESTORE_SIGMASK); - return -ERESTARTNOHAND; + return sigsuspend(&blocked); } asmlinkage int diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 73d9a420850d..4e5fdd9bd9e3 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -67,17 +67,8 @@ const unsigned long syscall_restart_code[2] = { asmlinkage int sys_sigsuspend(int restart, unsigned long 
oldmask, old_sigset_t mask) { sigset_t blocked; - - current->saved_sigmask = current->blocked; - - mask &= _BLOCKABLE; siginitset(&blocked, mask); - set_current_blocked(&blocked); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_restore_sigmask(); - return -ERESTARTNOHAND; + return sigsuspend(&blocked); } asmlinkage int diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c index 289c584ba499..170f4970d590 100644 --- a/arch/cris/arch-v10/kernel/signal.c +++ b/arch/cris/arch-v10/kernel/signal.c @@ -48,19 +48,11 @@ void do_signal(int canrestart, struct pt_regs *regs); * dummy arguments to be able to reach the regs argument. (Note that this * arrangement relies on old_sigset_t occupying one register.) */ -int sys_sigsuspend(old_sigset_t mask, long r11, long r12, long r13, long mof, - long srp, struct pt_regs *regs) +int sys_sigsuspend(old_sigset_t mask) { - mask &= _BLOCKABLE; - spin_lock_irq(&current->sighand->siglock); - current->saved_sigmask = current->blocked; - siginitset(&current->blocked, mask); - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_thread_flag(TIF_RESTORE_SIGMASK); - return -ERESTARTNOHAND; + sigset_t blocked; + siginitset(&blocked, mask); + return sigsuspend(&blocked); } int sys_sigaction(int sig, const struct old_sigaction __user *act, diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c index ce4ab1a5552c..e09083208cb6 100644 --- a/arch/cris/arch-v32/kernel/signal.c +++ b/arch/cris/arch-v32/kernel/signal.c @@ -59,19 +59,11 @@ void keep_debug_flags(unsigned long oldccs, unsigned long oldspc, * dummy arguments to be able to reach the regs argument.
*/ int -sys_sigsuspend(old_sigset_t mask, long r11, long r12, long r13, long mof, - long srp, struct pt_regs *regs) +sys_sigsuspend(old_sigset_t mask) { - mask &= _BLOCKABLE; - spin_lock_irq(&current->sighand->siglock); - current->saved_sigmask = current->blocked; - siginitset(&current->blocked, mask); - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_thread_flag(TIF_RESTORE_SIGMASK); - return -ERESTARTNOHAND; + sigset_t blocked; + siginitset(&blocked, mask); + return sigsuspend(&blocked); } int diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c index bab01298b58e..df957c7ba387 100644 --- a/arch/frv/kernel/signal.c +++ b/arch/frv/kernel/signal.c @@ -40,17 +40,9 @@ struct fdpic_func_descriptor { */ asmlinkage int sys_sigsuspend(int history0, int history1, old_sigset_t mask) { - mask &= _BLOCKABLE; - spin_lock_irq(&current->sighand->siglock); - current->saved_sigmask = current->blocked; - siginitset(&current->blocked, mask); - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_thread_flag(TIF_RESTORE_SIGMASK); - return -ERESTARTNOHAND; + sigset_t blocked; + siginitset(&blocked, mask); + return sigsuspend(&blocked); } asmlinkage int sys_sigaction(int sig, diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 1747c7030a33..8186982fb320 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -230,18 +230,9 @@ static inline void push_cache(unsigned long vaddr) asmlinkage int sys_sigsuspend(int unused0, int unused1, old_sigset_t mask) { - mask &= _BLOCKABLE; - spin_lock_irq(&current->sighand->siglock); - current->saved_sigmask = current->blocked; - siginitset(&current->blocked, mask); - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_restore_sigmask(); - - return -ERESTARTNOHAND; + sigset_t blocked; + siginitset(&blocked, mask); + return
sigsuspend(&blocked); } asmlinkage int diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index d5a338a1739c..17f6ee30ad0d 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -255,15 +255,7 @@ asmlinkage int sys_sigsuspend(nabi_no_regargs struct pt_regs regs) uset = (sigset_t __user *) regs.regs[4]; if (copy_from_user(&newset, uset, sizeof(sigset_t))) return -EFAULT; - sigdelsetmask(&newset, ~_BLOCKABLE); - - current->saved_sigmask = current->blocked; - set_current_blocked(&newset); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_thread_flag(TIF_RESTORE_SIGMASK); - return -ERESTARTNOHAND; + return sigsuspend(&newset); } #endif @@ -281,15 +273,7 @@ asmlinkage int sys_rt_sigsuspend(nabi_no_regargs struct pt_regs regs) unewset = (sigset_t __user *) regs.regs[4]; if (copy_from_user(&newset, unewset, sizeof(newset))) return -EFAULT; - sigdelsetmask(&newset, ~_BLOCKABLE); - - current->saved_sigmask = current->blocked; - set_current_blocked(&newset); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_thread_flag(TIF_RESTORE_SIGMASK); - return -ERESTARTNOHAND; + return sigsuspend(&newset); } #ifdef CONFIG_TRAD_SIGNALS diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index ac3b8d89aae5..b4fe2eacbd5d 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -288,15 +288,7 @@ asmlinkage int sys32_sigsuspend(nabi_no_regargs struct pt_regs regs) uset = (compat_sigset_t __user *) regs.regs[4]; if (get_sigset(&newset, uset)) return -EFAULT; - sigdelsetmask(&newset, ~_BLOCKABLE); - - current->saved_sigmask = current->blocked; - set_current_blocked(&newset); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_thread_flag(TIF_RESTORE_SIGMASK); - return -ERESTARTNOHAND; + return sigsuspend(&newset); } asmlinkage int sys32_rt_sigsuspend(nabi_no_regargs struct pt_regs regs) @@ -313,15 +305,7 @@ asmlinkage int sys32_rt_sigsuspend(nabi_no_regargs struct pt_regs regs) uset = 
(compat_sigset_t __user *) regs.regs[4]; if (get_sigset(&newset, uset)) return -EFAULT; - sigdelsetmask(&newset, ~_BLOCKABLE); - - current->saved_sigmask = current->blocked; - set_current_blocked(&newset); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_thread_flag(TIF_RESTORE_SIGMASK); - return -ERESTARTNOHAND; + return sigsuspend(&newset); } SYSCALL_DEFINE3(32_sigaction, long, sig, const struct sigaction32 __user *, act, diff --git a/arch/mips/kernel/signal_n32.c b/arch/mips/kernel/signal_n32.c index 86eb4b04631c..63ffac9af7c5 100644 --- a/arch/mips/kernel/signal_n32.c +++ b/arch/mips/kernel/signal_n32.c @@ -91,15 +91,7 @@ asmlinkage int sysn32_rt_sigsuspend(nabi_no_regargs struct pt_regs regs) if (copy_from_user(&uset, unewset, sizeof(uset))) return -EFAULT; sigset_from_compat(&newset, &uset); - sigdelsetmask(&newset, ~_BLOCKABLE); - - current->saved_sigmask = current->blocked; - set_current_blocked(&newset); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_thread_flag(TIF_RESTORE_SIGMASK); - return -ERESTARTNOHAND; + return sigsuspend(&newset); } asmlinkage void sysn32_rt_sigreturn(nabi_no_regargs struct pt_regs regs) diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c index 690f4e9507d7..50eb94a05826 100644 --- a/arch/mn10300/kernel/signal.c +++ b/arch/mn10300/kernel/signal.c @@ -38,17 +38,9 @@ */ asmlinkage long sys_sigsuspend(int history0, int history1, old_sigset_t mask) { - mask &= _BLOCKABLE; - spin_lock_irq(&current->sighand->siglock); - current->saved_sigmask = current->blocked; - siginitset(&current->blocked, mask); - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_thread_flag(TIF_RESTORE_SIGMASK); - return -ERESTARTNOHAND; + sigset_t blocked; + siginitset(&blocked, mask); + return sigsuspend(&blocked); } /* diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 45eb998557f8..ac1f96027bf5 100644 ---
a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -244,17 +244,8 @@ static inline int restore_general_regs(struct pt_regs *regs, long sys_sigsuspend(old_sigset_t mask) { sigset_t blocked; - - current->saved_sigmask = current->blocked; - - mask &= _BLOCKABLE; siginitset(&blocked, mask); - set_current_blocked(&blocked); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_restore_sigmask(); - return -ERESTARTNOHAND; + return sigsuspend(&blocked); } long sys_sigaction(int sig, struct old_sigaction __user *act, diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 8a4e2b760d56..f626232e216c 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -59,15 +59,8 @@ typedef struct SYSCALL_DEFINE3(sigsuspend, int, history0, int, history1, old_sigset_t, mask) { sigset_t blocked; - - current->saved_sigmask = current->blocked; - mask &= _BLOCKABLE; siginitset(&blocked, mask); - set_current_blocked(&blocked); - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - set_restore_sigmask(); - return -ERESTARTNOHAND; + return sigsuspend(&blocked); } SYSCALL_DEFINE3(sigaction, int, sig, const struct old_sigaction __user *, act, diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c index 5901fba3176e..46c9f9b00b14 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -58,18 +58,8 @@ sys_sigsuspend(old_sigset_t mask, struct pt_regs __regs) { sigset_t blocked; - - current->saved_sigmask = current->blocked; - - mask &= _BLOCKABLE; siginitset(&blocked, mask); - set_current_blocked(&blocked); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_restore_sigmask(); - - return -ERESTARTNOHAND; + return sigsuspend(&blocked); } asmlinkage int diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index ac8e66b50f07..2b7e849f7c65 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -64,18 +64,8 @@ struct rt_signal_frame { static int 
_sigpause_common(old_sigset_t set) { sigset_t blocked; - - current->saved_sigmask = current->blocked; - - set &= _BLOCKABLE; siginitset(&blocked, set); - set_current_blocked(&blocked); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_thread_flag(TIF_RESTORE_SIGMASK); - - return -ERESTARTNOHAND; + return sigsuspend(&blocked); } asmlinkage int sys_sigsuspend(old_sigset_t set) diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index 48b0f57b65f7..eafaab486b2d 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -242,19 +242,8 @@ struct rt_signal_frame { static long _sigpause_common(old_sigset_t set) { sigset_t blocked; - - current->saved_sigmask = current->blocked; - - set &= _BLOCKABLE; siginitset(&blocked, set); - set_current_blocked(&blocked); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - - set_restore_sigmask(); - - return -ERESTARTNOHAND; + return sigsuspend(&blocked); } asmlinkage long sys_sigpause(unsigned int set) diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c index fb12f4c5e649..b9b75b3bd5c9 100644 --- a/arch/um/kernel/signal.c +++ b/arch/um/kernel/signal.c @@ -152,15 +152,8 @@ int do_signal(void) long sys_sigsuspend(int history0, int history1, old_sigset_t mask) { sigset_t blocked; - - mask &= _BLOCKABLE; siginitset(&blocked, mask); - set_current_blocked(&blocked); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_thread_flag(TIF_RESTORE_SIGMASK); - return -ERESTARTNOHAND; + return sigsuspend(&blocked); } long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index a69245ba27e3..fa54410c6666 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -127,18 +127,8 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) asmlinkage long sys32_sigsuspend(int history0, int history1, old_sigset_t mask) { sigset_t blocked; - - 
current->saved_sigmask = current->blocked; - - mask &= _BLOCKABLE; siginitset(&blocked, mask); - set_current_blocked(&blocked); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - - set_restore_sigmask(); - return -ERESTARTNOHAND; + return sigsuspend(&blocked); } asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr, diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 115eac431483..b68ccadd2ff4 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -478,18 +478,8 @@ asmlinkage int sys_sigsuspend(int history0, int history1, old_sigset_t mask) { sigset_t blocked; - - current->saved_sigmask = current->blocked; - - mask &= _BLOCKABLE; siginitset(&blocked, mask); - set_current_blocked(&blocked); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - - set_restore_sigmask(); - return -ERESTARTNOHAND; + return sigsuspend(&blocked); } asmlinkage int diff --git a/include/linux/signal.h b/include/linux/signal.h index 7987ce74874b..17046cc484bc 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -252,6 +252,7 @@ extern int do_sigtimedwait(const sigset_t *, siginfo_t *, extern int sigprocmask(int, sigset_t *, sigset_t *); extern void set_current_blocked(const sigset_t *); extern int show_unhandled_signals; +extern int sigsuspend(sigset_t *); extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie); extern void block_sigmask(struct k_sigaction *ka, int signr); diff --git a/kernel/compat.c b/kernel/compat.c index d2c67aa49ae6..c28a306ae05c 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -1073,15 +1073,7 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) return -EFAULT; sigset_from_compat(&newset, &newset32); - sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); - - current->saved_sigmask = current->blocked; - set_current_blocked(&newset); 
- - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_restore_sigmask(); - return -ERESTARTNOHAND; + return sigsuspend(&newset); } #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ diff --git a/kernel/signal.c b/kernel/signal.c index 17afcaf582d0..3ad220a81619 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3236,6 +3236,21 @@ SYSCALL_DEFINE0(pause) #endif +#ifdef HAVE_SET_RESTORE_SIGMASK +int sigsuspend(sigset_t *set) +{ + sigdelsetmask(set, sigmask(SIGKILL)|sigmask(SIGSTOP)); + + current->saved_sigmask = current->blocked; + set_current_blocked(set); + + current->state = TASK_INTERRUPTIBLE; + schedule(); + set_restore_sigmask(); + return -ERESTARTNOHAND; +} +#endif + #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND /** * sys_rt_sigsuspend - replace the signal mask for a value with the @@ -3253,15 +3268,7 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) if (copy_from_user(&newset, unewset, sizeof(newset))) return -EFAULT; - sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); - - current->saved_sigmask = current->blocked; - set_current_blocked(&newset); - - current->state = TASK_INTERRUPTIBLE; - schedule(); - set_restore_sigmask(); - return -ERESTARTNOHAND; + return sigsuspend(&newset); } #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ -- cgit v1.2.3 From ea4d26ae24e58fbd2c61de9242adab053cb982d8 Mon Sep 17 00:00:00 2001 From: Jim Kukunas Date: Tue, 22 May 2012 13:54:04 +1000 Subject: raid5: add AVX optimized RAID5 checksumming Optimize RAID5 xor checksumming by taking advantage of 256-bit YMM registers introduced in AVX. 
Signed-off-by: Jim Kukunas Signed-off-by: NeilBrown --- arch/x86/Makefile | 5 +- arch/x86/include/asm/xor_32.h | 6 +- arch/x86/include/asm/xor_64.h | 8 +- arch/x86/include/asm/xor_avx.h | 214 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 229 insertions(+), 4 deletions(-) create mode 100644 arch/x86/include/asm/xor_avx.h (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 41a7237606a3..7a1cc9ee5c8a 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -115,9 +115,10 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI # does binutils support specific instructions? asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) +avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) -KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) -KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) +KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) +KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) LDFLAGS := -m elf_$(UTS_MACHINE) diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h index 133b40a0f495..454570891bdc 100644 --- a/arch/x86/include/asm/xor_32.h +++ b/arch/x86/include/asm/xor_32.h @@ -861,6 +861,9 @@ static struct xor_block_template xor_block_pIII_sse = { .do_5 = xor_sse_5, }; +/* Also try the AVX routines */ +#include "xor_avx.h" + /* Also try the generic routines. */ #include @@ -871,6 +874,7 @@ do { \ xor_speed(&xor_block_8regs_p); \ xor_speed(&xor_block_32regs); \ xor_speed(&xor_block_32regs_p); \ + AVX_XOR_SPEED; \ if (cpu_has_xmm) \ xor_speed(&xor_block_pIII_sse); \ if (cpu_has_mmx) { \ @@ -883,6 +887,6 @@ do { \ We may also be able to load into the L1 only depending on how the cpu deals with a load to a line that is being prefetched. */ #define XOR_SELECT_TEMPLATE(FASTEST) \ - (cpu_has_xmm ? 
&xor_block_pIII_sse : FASTEST) + AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST) #endif /* _ASM_X86_XOR_32_H */ diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h index 1549b5e261f6..b9b2323e90fe 100644 --- a/arch/x86/include/asm/xor_64.h +++ b/arch/x86/include/asm/xor_64.h @@ -347,15 +347,21 @@ static struct xor_block_template xor_block_sse = { .do_5 = xor_sse_5, }; + +/* Also try the AVX routines */ +#include "xor_avx.h" + #undef XOR_TRY_TEMPLATES #define XOR_TRY_TEMPLATES \ do { \ + AVX_XOR_SPEED; \ xor_speed(&xor_block_sse); \ } while (0) /* We force the use of the SSE xor block because it can write around L2. We may also be able to load into the L1 only depending on how the cpu deals with a load to a line that is being prefetched. */ -#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse) +#define XOR_SELECT_TEMPLATE(FASTEST) \ + AVX_SELECT(&xor_block_sse) #endif /* _ASM_X86_XOR_64_H */ diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h new file mode 100644 index 000000000000..2510d35f480e --- /dev/null +++ b/arch/x86/include/asm/xor_avx.h @@ -0,0 +1,214 @@ +#ifndef _ASM_X86_XOR_AVX_H +#define _ASM_X86_XOR_AVX_H + +/* + * Optimized RAID-5 checksumming functions for AVX + * + * Copyright (C) 2012 Intel Corporation + * Author: Jim Kukunas + * + * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */ + +#ifdef CONFIG_AS_AVX + +#include +#include + +#define ALIGN32 __aligned(32) + +#define YMM_SAVED_REGS 4 + +#define YMMS_SAVE \ +do { \ + preempt_disable(); \ + cr0 = read_cr0(); \ + clts(); \ + asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \ + asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \ + asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \ + asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \ +} while (0); + +#define YMMS_RESTORE \ +do { \ + asm volatile("sfence" : : : "memory"); \ + asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \ + asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \ + asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \ + asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \ + write_cr0(cr0); \ + preempt_enable(); \ +} while (0); + +#define BLOCK4(i) \ + BLOCK(32 * i, 0) \ + BLOCK(32 * (i + 1), 1) \ + BLOCK(32 * (i + 2), 2) \ + BLOCK(32 * (i + 3), 3) + +#define BLOCK16() \ + BLOCK4(0) \ + BLOCK4(4) \ + BLOCK4(8) \ + BLOCK4(12) + +static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1) +{ + unsigned long cr0, lines = bytes >> 9; + char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; + + YMMS_SAVE + + while (lines--) { +#undef BLOCK +#define BLOCK(i, reg) \ +do { \ + asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p0[i / sizeof(*p0)])); \ + asm volatile("vmovdqa %%ymm" #reg ", %0" : \ + "=m" (p0[i / sizeof(*p0)])); \ +} while (0); + + BLOCK16() + + p0 = (unsigned long *)((uintptr_t)p0 + 512); + p1 = (unsigned long *)((uintptr_t)p1 + 512); + } + + YMMS_RESTORE +} + +static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1, + unsigned long *p2) +{ + unsigned long cr0, lines = bytes >> 9; + char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; + + YMMS_SAVE + + while (lines--) { +#undef BLOCK 
+#define BLOCK(i, reg) \ +do { \ + asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p1[i / sizeof(*p1)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p0[i / sizeof(*p0)])); \ + asm volatile("vmovdqa %%ymm" #reg ", %0" : \ + "=m" (p0[i / sizeof(*p0)])); \ +} while (0); + + BLOCK16() + + p0 = (unsigned long *)((uintptr_t)p0 + 512); + p1 = (unsigned long *)((uintptr_t)p1 + 512); + p2 = (unsigned long *)((uintptr_t)p2 + 512); + } + + YMMS_RESTORE +} + +static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1, + unsigned long *p2, unsigned long *p3) +{ + unsigned long cr0, lines = bytes >> 9; + char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; + + YMMS_SAVE + + while (lines--) { +#undef BLOCK +#define BLOCK(i, reg) \ +do { \ + asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p2[i / sizeof(*p2)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p1[i / sizeof(*p1)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p0[i / sizeof(*p0)])); \ + asm volatile("vmovdqa %%ymm" #reg ", %0" : \ + "=m" (p0[i / sizeof(*p0)])); \ +} while (0); + + BLOCK16(); + + p0 = (unsigned long *)((uintptr_t)p0 + 512); + p1 = (unsigned long *)((uintptr_t)p1 + 512); + p2 = (unsigned long *)((uintptr_t)p2 + 512); + p3 = (unsigned long *)((uintptr_t)p3 + 512); + } + + YMMS_RESTORE +} + +static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1, + unsigned long *p2, unsigned long *p3, unsigned long *p4) +{ + unsigned long cr0, lines = bytes >> 9; + char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; + + YMMS_SAVE + + while (lines--) { +#undef BLOCK +#define BLOCK(i, reg) \ +do { \ + asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p3[i / 
sizeof(*p3)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p2[i / sizeof(*p2)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p1[i / sizeof(*p1)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p0[i / sizeof(*p0)])); \ + asm volatile("vmovdqa %%ymm" #reg ", %0" : \ + "=m" (p0[i / sizeof(*p0)])); \ +} while (0); + + BLOCK16() + + p0 = (unsigned long *)((uintptr_t)p0 + 512); + p1 = (unsigned long *)((uintptr_t)p1 + 512); + p2 = (unsigned long *)((uintptr_t)p2 + 512); + p3 = (unsigned long *)((uintptr_t)p3 + 512); + p4 = (unsigned long *)((uintptr_t)p4 + 512); + } + + YMMS_RESTORE +} + +static struct xor_block_template xor_block_avx = { + .name = "avx", + .do_2 = xor_avx_2, + .do_3 = xor_avx_3, + .do_4 = xor_avx_4, + .do_5 = xor_avx_5, +}; + +#define AVX_XOR_SPEED \ +do { \ + if (cpu_has_avx) \ + xor_speed(&xor_block_avx); \ +} while (0) + +#define AVX_SELECT(FASTEST) \ + (cpu_has_avx ? &xor_block_avx : FASTEST) + +#else + +#define AVX_XOR_SPEED {} + +#define AVX_SELECT(FASTEST) (FASTEST) + +#endif +#endif -- cgit v1.2.3 From e8f380e00840f694599e6ab42806639f7de26f11 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 22 May 2012 12:53:45 +0200 Subject: x86/bitops: Move BIT_64() for a wider use Needed for shifting 64-bit values on 32-bit, like MSR values, for example. 
Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Frank Arnold Link: http://lkml.kernel.org/r/1337684026-19740-1-git-send-email-bp@amd64.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/bitops.h | 2 ++ drivers/edac/mce_amd.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index b97596e2b68c..a6983b277220 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -15,6 +15,8 @@ #include #include +#define BIT_64(n) (U64_C(1) << (n)) + /* * These have to be done with inline assembly: that way the bit-setting * is guaranteed to be atomic. All bit operations return 0 if the bit diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index c6074c5cd1ef..8c87a5e87057 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h @@ -5,8 +5,6 @@ #include -#define BIT_64(n) (U64_C(1) << (n)) - #define EC(x) ((x) & 0xffff) #define XEC(x, mask) (((x) >> 16) & mask) -- cgit v1.2.3 From 80f033610fb968e75f5d470233d8d0260d7a72ed Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 22 May 2012 12:53:46 +0200 Subject: x86/mce: Fix 32-bit build Got bitten again by the BIT() macro: arch/x86/kernel/cpu/mcheck/mce.c: In function '__mcheck_cpu_apply_quirks': arch/x86/kernel/cpu/mcheck/mce.c:1453:6: warning: left shift count >= width of type arch/x86/kernel/cpu/mcheck/mce.c:1454:7: warning: left shift count >= width of type Fix it already. 
Signed-off-by: Borislav Petkov Cc: Frank Arnold Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1337684026-19740-2-git-send-email-bp@amd64.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 888fbf9d0adf..0456b9a08086 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1450,9 +1450,9 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) rdmsrl(msrs[i], val); /* CntP bit set? */ - if (val & BIT(62)) { - val &= ~BIT(62); - wrmsrl(msrs[i], val); + if (val & BIT_64(62)) { + val &= ~BIT_64(62); + wrmsrl(msrs[i], val); } } -- cgit v1.2.3 From fd952815307f0f272bf49fd364a7fd2f9992bc42 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 23 May 2012 14:02:34 -0700 Subject: x86-32, relocs: Whitelist more symbols for ld bug workaround As noted in checkin: a3e854d95 x86, relocs: Workaround for binutils 2.22.52.0.1 section bug ld version 2.22.52.0.[12] can incorrectly promote relative symbols to absolute, if the output section they appear in is otherwise empty. Since checkin: 6520fe55 x86, realmode: 16-bit real-mode code support for relocs tool we actually check for this and error out rather than silently creating a kernel which will malfunction if relocated. Ingo found a configuration in which __start_builtin_fw triggered the warning. Go through the linker script sources and look for more symbols that could plausibly get bogusly promoted to absolute, and add them to the whitelist. In general, if the following error triggers: Invalid absolute R_386_32 relocation: ... then we should verify that is really meant to be relocated, and add it and any related symbols manually to the S_REL regexp. 
Please note that 6520fe55 does not introduce the error, only the check for the error -- without 6520fe55 this version of ld will simply produce a corrupt kernel if CONFIG_RELOCATABLE is set on x86-32. Reported-by: Ingo Molnar Signed-off-by: H. Peter Anvin Cc: v3.4 --- arch/x86/tools/relocs.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index b43cfcd9bf40..b8f7c65fc40c 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -60,6 +60,17 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { "__x86_cpu_dev_(start|end)|" "(__parainstructions|__alt_instructions)(|_end)|" "(__iommu_table|__apicdrivers|__smp_locks)(|_end)|" + "__(start|end)_pci_.*|" + "__(start|end)_builtin_fw|" + "__(start|stop)___ksymtab(|_gpl|_unused|_unused_gpl|_gpl_future)|" + "__(start|stop)___kcrctab(|_gpl|_unused|_unused_gpl|_gpl_future)|" + "__(start|stop)___param|" + "__(start|stop)___modver|" + "__(start|stop)___bug_table|" + "__tracedata_(start|end)|" + "__(start|stop)_notes|" + "__end_rodata|" + "__initramfs_start|" "_end)$" }; -- cgit v1.2.3 From a129a7c84582629741e5fa6f40026efcd7a65bd4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 19 Nov 2010 13:16:22 +0100 Subject: MCE: Fix vm86 handling for 32bit mce handler When running on 32bit the mce handler could misinterpret vm86 mode as ring 0. This can affect whether it does recovery or not; it was possible to panic when recovery was actually possible. Fix this by always forcing vm86 to look like ring 3. 
Signed-off-by: Andi Kleen Cc: Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 66e1c51be084..5f793e6c854b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -437,6 +437,14 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { m->ip = regs->ip; m->cs = regs->cs; + + /* + * When in VM86 mode make the cs look like ring 3 + * always. This is a lie, but it's better than passing + * the additional vm86 bit around everywhere. + */ + if (v8086_mode(regs)) + m->cs |= 3; } /* Use accurate RIP reporting if available. */ if (rip_msr) -- cgit v1.2.3 From 875e26648cf9b6db9d8dc07b7959d7c61fb3f49c Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 23 May 2012 14:14:22 -0700 Subject: x86/mce: Fix check for processor context when machine check was taken. Linus pointed out that there was no value in checking whether m->ip was zero - because zero is a legitimate value. If we have a reliable (or faked in the VM86 case) "m->cs" we can use it to tell whether we were in user mode or kernel when the machine check hit. Reported-by: Linus Torvalds Cc: Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 0c82091b1652..1ccd453903d8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -165,15 +165,19 @@ static struct severity { }; /* - * If the EIPV bit is set, it means the saved IP is the - * instruction which caused the MCE.
+ * If mcgstatus indicated that ip/cs on the stack were + * no good, then "m->cs" will be zero and we will have + * to assume the worst case (IN_KERNEL) as we actually + * have no idea what we were executing when the machine + * check hit. + * If we do have a good "m->cs" (or a faked one in the + * case we were executing in VM86 mode) we can use it to + * distinguish an exception taken in user from from one + * taken in the kernel. */ static int error_context(struct mce *m) { - if (m->mcgstatus & MCG_STATUS_EIPV) - return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL; - /* Unknown, assume kernel */ - return IN_KERNEL; + return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; } int mce_severity(struct mce *m, int tolerant, char **msg) -- cgit v1.2.3 From 37c3459b67dd5a396a968e819cf4a86d24ac9ace Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 10 May 2012 11:12:14 -0700 Subject: x86/mce: Add instruction recovery signatures to mce-severity table Instruction recovery cases are very similar to the data recovery one we already have. Just trade out for a new MCACOD value. 
Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 1ccd453903d8..413c2ced887c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -126,6 +126,16 @@ static struct severity { SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), USER ), + MCESEV( + KEEP, "HT thread notices Action required: instruction fetch error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + MCGMASK(MCG_STATUS_EIPV, 0) + ), + MCESEV( + AR, "Action required: instruction fetch error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + USER + ), #endif MCESEV( PANIC, "Action required: unknown MCACOD", -- cgit v1.2.3 From a42c6ded827dbd396d2efde7530620be029a72d1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 23 May 2012 14:44:37 -0400 Subject: move key_repace_session_keyring() into tracehook_notify_resume() Signed-off-by: Al Viro --- arch/alpha/kernel/signal.c | 2 -- arch/arm/kernel/signal.c | 2 -- arch/avr32/kernel/signal.c | 2 -- arch/blackfin/kernel/signal.c | 2 -- arch/c6x/kernel/signal.c | 2 -- arch/cris/kernel/ptrace.c | 2 -- arch/frv/kernel/signal.c | 2 -- arch/h8300/kernel/signal.c | 2 -- arch/hexagon/kernel/signal.c | 2 -- arch/ia64/kernel/process.c | 2 -- arch/m32r/kernel/signal.c | 2 -- arch/m68k/kernel/signal.c | 5 +---- arch/microblaze/kernel/signal.c | 5 +---- arch/mips/kernel/signal.c | 2 -- arch/mn10300/kernel/signal.c | 2 -- arch/openrisc/kernel/signal.c | 2 -- arch/parisc/kernel/signal.c | 2 -- arch/powerpc/kernel/signal.c | 2 -- arch/s390/kernel/signal.c | 2 -- arch/score/kernel/signal.c | 2 -- arch/sh/kernel/signal_32.c | 2 -- arch/sh/kernel/signal_64.c | 2 -- arch/sparc/kernel/signal_32.c | 2 -- 
arch/sparc/kernel/signal_64.c | 2 -- arch/tile/kernel/process.c | 2 -- arch/um/kernel/process.c | 5 +---- arch/unicore32/kernel/signal.c | 2 -- arch/x86/kernel/signal.c | 2 -- arch/xtensa/kernel/signal.c | 5 +---- include/linux/tracehook.h | 2 ++ 30 files changed, 6 insertions(+), 66 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index 10ab2d74ecbb..f6db3032ddf0 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -590,7 +590,5 @@ do_notify_resume(struct pt_regs *regs, struct switch_stack *sw, if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } } diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 4e5fdd9bd9e3..ec640412aed0 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -728,7 +728,5 @@ do_notify_resume(struct pt_regs *regs, unsigned int thread_flags, int syscall) if (thread_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } } diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c index ae386c304bee..e7595ef74f51 100644 --- a/arch/avr32/kernel/signal.c +++ b/arch/avr32/kernel/signal.c @@ -321,7 +321,5 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, struct thread_info *ti) if (ti->flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } } diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c index e5bbc1a5edc2..fc9ecce8b6ce 100644 --- a/arch/blackfin/kernel/signal.c +++ b/arch/blackfin/kernel/signal.c @@ -336,8 +336,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs) if (test_thread_flag(TIF_NOTIFY_RESUME)) 
{ clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } } diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c index cf37478c1169..9493f0bbf0a6 100644 --- a/arch/c6x/kernel/signal.c +++ b/arch/c6x/kernel/signal.c @@ -364,7 +364,5 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags, if (thread_info_flags & (1 << TIF_NOTIFY_RESUME)) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } } diff --git a/arch/cris/kernel/ptrace.c b/arch/cris/kernel/ptrace.c index d114ad3da9b1..58d44ee1a71f 100644 --- a/arch/cris/kernel/ptrace.c +++ b/arch/cris/kernel/ptrace.c @@ -40,7 +40,5 @@ void do_notify_resume(int canrestart, struct pt_regs *regs, if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } } diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c index 8cf5dca01758..595bf1e5a5dc 100644 --- a/arch/frv/kernel/signal.c +++ b/arch/frv/kernel/signal.c @@ -562,8 +562,6 @@ asmlinkage void do_notify_resume(__u32 thread_info_flags) if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(__frame); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } } /* end do_notify_resume() */ diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index d4b0555d2904..e58992ad789e 100644 --- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -513,7 +513,5 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags) if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - 
key_replace_session_keyring(); } } diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c index 434866eb0f1c..21a3018cb9bf 100644 --- a/arch/hexagon/kernel/signal.c +++ b/arch/hexagon/kernel/signal.c @@ -273,8 +273,6 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags) if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } } diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 5e0e86ddb12f..dd6fc1449741 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -199,8 +199,6 @@ do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall) if (test_thread_flag(TIF_NOTIFY_RESUME)) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(&scr->pt); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } /* copy user rbs to kernel rbs */ diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c index f54d96993ea1..64804f1f5141 100644 --- a/arch/m32r/kernel/signal.c +++ b/arch/m32r/kernel/signal.c @@ -383,8 +383,6 @@ void do_notify_resume(struct pt_regs *regs, __u32 thread_info_flags) if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } clear_thread_flag(TIF_IRET); diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index d9f3d1900eed..973eec60cad4 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -1193,9 +1193,6 @@ void do_notify_resume(struct pt_regs *regs) if (test_thread_flag(TIF_SIGPENDING)) do_signal(regs); - if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) { + if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - 
key_replace_session_keyring(); - } } diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c index 7f4c7bef1642..5d796e32786e 100644 --- a/arch/microblaze/kernel/signal.c +++ b/arch/microblaze/kernel/signal.c @@ -401,9 +401,6 @@ void do_notify_resume(struct pt_regs *regs, int in_syscall) if (test_thread_flag(TIF_SIGPENDING)) do_signal(regs, in_syscall); - if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) { + if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); - } } diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index 17f6ee30ad0d..8a6e6d116ab0 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -636,8 +636,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused, if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } } diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c index 890cf91767cc..b8b6aa1a6837 100644 --- a/arch/mn10300/kernel/signal.c +++ b/arch/mn10300/kernel/signal.c @@ -554,7 +554,5 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags) if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(current_frame()); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } } diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c index e970743251ae..9ae611522953 100644 --- a/arch/openrisc/kernel/signal.c +++ b/arch/openrisc/kernel/signal.c @@ -376,7 +376,5 @@ asmlinkage void do_notify_resume(struct pt_regs *regs) if (current_thread_info()->flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - 
key_replace_session_keyring(); } } diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index 4b9cb0d546d1..e7a7cd3e1120 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -638,7 +638,5 @@ void do_notify_resume(struct pt_regs *regs, long in_syscall) if (test_thread_flag(TIF_NOTIFY_RESUME)) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } } diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index 651c5963662b..bfc3ec1382fb 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -193,8 +193,6 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags) if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } } diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index f626232e216c..42a6e8b47f06 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -494,6 +494,4 @@ void do_notify_resume(struct pt_regs *regs) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } diff --git a/arch/score/kernel/signal.c b/arch/score/kernel/signal.c index d4a49011c48a..302838d3acf6 100644 --- a/arch/score/kernel/signal.c +++ b/arch/score/kernel/signal.c @@ -356,7 +356,5 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused, if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } } diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c index cb4172c8af7d..9d7bfd66f189 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -626,7 
+626,5 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned int save_r0, if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } } diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c index b589a354c069..aa6428430842 100644 --- a/arch/sh/kernel/signal_64.c +++ b/arch/sh/kernel/signal_64.c @@ -685,7 +685,5 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned long thread_info if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } } diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index 2b7e849f7c65..6b42e8622d12 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -590,8 +590,6 @@ void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } } diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index eafaab486b2d..c82cf1cc3965 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -607,8 +607,6 @@ void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, unsigned long if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } } diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index f572c19c4082..32817ab6062a 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -569,8 +569,6 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags) if (thread_info_flags & 
_TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); return 1; } if (thread_info_flags & _TIF_SINGLESTEP) { diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 3a2235e0abc3..ccb9a9d283f1 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -117,11 +117,8 @@ void interrupt_end(void) schedule(); if (test_thread_flag(TIF_SIGPENDING)) do_signal(); - if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) { + if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) tracehook_notify_resume(&current->thread.regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); - } } void exit_thread(void) diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c index 7754df6ef7d4..28782ad47b93 100644 --- a/arch/unicore32/kernel/signal.c +++ b/arch/unicore32/kernel/signal.c @@ -464,8 +464,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, if (thread_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b68ccadd2ff4..9363b58b967c 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -821,8 +821,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); } if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) fire_user_return_notifiers(); diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index c5e4ec0598d2..ea7e17778a75 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -548,9 +548,6 @@ void do_notify_resume(struct pt_regs *regs) if
(test_thread_flag(TIF_SIGPENDING)) do_signal(regs); - if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) { + if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) tracehook_notify_resume(regs); - if (current->replacement_session_keyring) - key_replace_session_keyring(); - } } diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 8a2a3fc9bd05..b9ca903bb553 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -183,6 +183,8 @@ static inline void set_notify_resume(struct task_struct *task) */ static inline void tracehook_notify_resume(struct pt_regs *regs) { + if (current->replacement_session_keyring) + key_replace_session_keyring(); } #endif /* */ -- cgit v1.2.3 From ea17e7414bc62e8d3bde8d08e3df1d921c518c17 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 24 May 2012 07:01:38 -0700 Subject: x86, relocs: Add jiffies and jiffies_64 to the relative whitelist The symbol jiffies is created in the linker script as an alias to jiffies_64. Unfortunately this is done outside any section, and apparently GNU ld 2.21 doesn't carry the section with it, so we end up with an absolute symbol and therefore a broken kernel. Add jiffies and jiffies_64 to the whitelist. The most disturbing bit with this discovery is that it shows that we have had multiple linker bugs in this area crossing multiple generations, and have been silently building bad kernels for some time. Link: http://lkml.kernel.org/r/20120524171604.0d98284f3affc643e9714470@canb.auug.org.au Reported-by: Stephen Rothwell Signed-off-by: H. 
Peter Anvin Cc: v3.4 --- arch/x86/tools/relocs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index b8f7c65fc40c..b685296d4464 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -71,6 +71,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = { "__(start|stop)_notes|" "__end_rodata|" "__initramfs_start|" + "(jiffies|jiffies_64)|" "_end)$" }; -- cgit v1.2.3 From 446969084d33a4064a39d280806da642c54ba4ac Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 23 May 2012 20:12:50 -0700 Subject: kernel: Move REPEAT_BYTE definition into linux/kernel.h And make sure that everything using it explicitly includes that header file. Signed-off-by: David S. Miller --- arch/sparc/lib/usercopy.c | 3 +-- arch/x86/include/asm/word-at-a-time.h | 4 ++-- fs/namei.c | 1 + include/linux/kernel.h | 2 ++ 4 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/sparc/lib/usercopy.c b/arch/sparc/lib/usercopy.c index f61ed820cb61..0b12e91d6ccc 100644 --- a/arch/sparc/lib/usercopy.c +++ b/arch/sparc/lib/usercopy.c @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -11,8 +12,6 @@ void copy_from_user_overflow(void) } EXPORT_SYMBOL(copy_from_user_overflow); -#define REPEAT_BYTE(x) ((~0ul / 0xff) * (x)) - static inline long find_zero(unsigned long mask) { long byte = 0; diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h index e58f03b206c3..ae03facfadd6 100644 --- a/arch/x86/include/asm/word-at-a-time.h +++ b/arch/x86/include/asm/word-at-a-time.h @@ -1,6 +1,8 @@ #ifndef _ASM_WORD_AT_A_TIME_H #define _ASM_WORD_AT_A_TIME_H +#include + /* * This is largely generic for little-endian machines, but the * optimal byte mask counting is probably going to be something @@ -35,8 +37,6 @@ static inline long count_masked_bytes(long mask) #endif -#define REPEAT_BYTE(x) ((~0ul / 0xff) * (x)) - /* Return the high bit 
set in the first byte that is a zero */ static inline unsigned long has_zero(unsigned long a) { diff --git a/fs/namei.c b/fs/namei.c index f9e883c1b856..8d2ba420e42f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 645231c373c8..fbe9bfacb8db 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -38,6 +38,8 @@ #define STACK_MAGIC 0xdeadbeef +#define REPEAT_BYTE(x) ((~0ul / 0xff) * (x)) + #define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) #define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask)) #define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a))) -- cgit v1.2.3 From 1b38a3a10f2ad96a3c0130f63b7f3610bab7090d Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 25 May 2012 11:40:09 +0100 Subject: x86: hpet: Fix copy-and-paste mistake in earlier change This fixes an oversight in 396e2c6fed4ff13b53ce0e573105531cf53b0cad ("x86: Clear HPET configuration registers on startup"), noticed by Thomas Gleixner. 
Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4FBF7DA902000078000861EE@nat28.tlf.novell.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 9cc7b4392f7c..1460a5df92f7 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -870,7 +870,7 @@ int __init hpet_enable(void) else pr_warn("HPET initial state will not be saved\n"); cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); - hpet_writel(cfg, HPET_Tn_CFG(i)); + hpet_writel(cfg, HPET_CFG); if (cfg) pr_warn("HPET: Unrecognized bits %#x set in global cfg\n", cfg); -- cgit v1.2.3 From 4ae73f2d53255c388d50bf83c1681112a6f9cba1 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 26 May 2012 10:14:39 -0700 Subject: x86: use generic strncpy_from_user routine The generic strncpy_from_user() is not really optimal, since it is designed to work on both little-endian and big-endian. And on little-endian you can simplify much of the logic to find the first zero byte, since little-endian arithmetic doesn't have to worry about the carry bit propagating into earlier bytes (only later bytes, which we don't care about). But I have patches to make the generic routines use the architecture- specific infrastructure, so that we can regain the little-endian optimizations. But before we do that, switch over to the generic routines to make the patches each do just one well-defined thing. 
Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/include/asm/uaccess.h | 1 + arch/x86/lib/usercopy.c | 97 ------------------------------------------ 3 files changed, 2 insertions(+), 97 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 81c3e8be789a..3220d44e24d0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -93,6 +93,7 @@ config X86 select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) select GENERIC_TIME_VSYSCALL if X86_64 select KTIME_SCALAR if X86_32 + select GENERIC_STRNCPY_FROM_USER config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS || UPROBES) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 851fe0dc13bc..1354facd8f63 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -32,6 +32,7 @@ #define segment_eq(a, b) ((a).seg == (b).seg) +#define user_addr_max() (current_thread_info()->addr_limit.seg) #define __addr_ok(addr) \ ((unsigned long __force)(addr) < \ (current_thread_info()->addr_limit.seg)) diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index 2e4e4b02c37a..f61ee67ec00f 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -43,100 +43,3 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) return len; } EXPORT_SYMBOL_GPL(copy_from_user_nmi); - -/* - * Do a strncpy, return length of string without final '\0'. - * 'count' is the user-supplied count (return 'count' if we - * hit it), 'max' is the address space maximum (and we return - * -EFAULT if we hit it). 
- */ -static inline long do_strncpy_from_user(char *dst, const char __user *src, long count, unsigned long max) -{ - long res = 0; - - /* - * Truncate 'max' to the user-specified limit, so that - * we only have one limit we need to check in the loop - */ - if (max > count) - max = count; - - while (max >= sizeof(unsigned long)) { - unsigned long c, mask; - - /* Fall back to byte-at-a-time if we get a page fault */ - if (unlikely(__get_user(c,(unsigned long __user *)(src+res)))) - break; - mask = has_zero(c); - if (mask) { - mask = (mask - 1) & ~mask; - mask >>= 7; - *(unsigned long *)(dst+res) = c & mask; - return res + count_masked_bytes(mask); - } - *(unsigned long *)(dst+res) = c; - res += sizeof(unsigned long); - max -= sizeof(unsigned long); - } - - while (max) { - char c; - - if (unlikely(__get_user(c,src+res))) - return -EFAULT; - dst[res] = c; - if (!c) - return res; - res++; - max--; - } - - /* - * Uhhuh. We hit 'max'. But was that the user-specified maximum - * too? If so, that's ok - we got as much as the user asked for. - */ - if (res >= count) - return res; - - /* - * Nope: we hit the address space limit, and we still had more - * characters the caller would have wanted. That's an EFAULT. - */ - return -EFAULT; -} - -/** - * strncpy_from_user: - Copy a NUL terminated string from userspace. - * @dst: Destination address, in kernel space. This buffer must be at - * least @count bytes long. - * @src: Source address, in user space. - * @count: Maximum number of bytes to copy, including the trailing NUL. - * - * Copies a NUL-terminated string from userspace to kernel space. - * - * On success, returns the length of the string (not including the trailing - * NUL). - * - * If access to userspace fails, returns -EFAULT (some data may have been - * copied). - * - * If @count is smaller than the length of the string, copies @count bytes - * and returns @count. 
- */ -long -strncpy_from_user(char *dst, const char __user *src, long count) -{ - unsigned long max_addr, src_addr; - - if (unlikely(count <= 0)) - return 0; - - max_addr = current_thread_info()->addr_limit.seg; - src_addr = (unsigned long)src; - if (likely(src_addr < max_addr)) { - unsigned long max = max_addr - src_addr; - return do_strncpy_from_user(dst, src, count, max); - } - return -EFAULT; -} -EXPORT_SYMBOL(strncpy_from_user); -- cgit v1.2.3 From 36126f8f2ed8168eb13aa0662b9b9585cba100a9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 26 May 2012 10:43:17 -0700 Subject: word-at-a-time: make the interfaces truly generic This changes the interfaces in <asm/word-at-a-time.h> to be a bit more complicated, but a lot more generic. In particular, it allows us to really do the operations efficiently on both little-endian and big-endian machines, pretty much regardless of machine details. For example, if you can rely on a fast population count instruction on your architecture, this will allow you to make your optimized <asm/word-at-a-time.h> file with that. NOTE! The "generic" version in include/asm-generic/word-at-a-time.h is not truly generic, it actually only works on big-endian. Why? Because on little-endian the generic algorithms are wasteful, since you can inevitably do better. The x86 implementation is an example of that. (The only truly non-generic part of the asm-generic implementation is the "find_zero()" function, and you could make a little-endian version of it. And if the Kbuild infrastructure allowed us to pick a particular header file, that would be lovely) The functions are as follows: - WORD_AT_A_TIME_CONSTANTS: specific constants that the algorithm uses. - has_zero(): take a word, and determine if it has a zero byte in it. It gets the word, the pointer to the constant pool, and a pointer to an intermediate "data" field it can set. This is the "quick-and-dirty" zero tester: it's what is run inside the hot loops.
- "prep_zero_mask()": take the word, the data that has_zero() produced, and the constant pool, and generate an *exact* mask of which byte had the first zero. This is run directly *outside* the loop, and allows the "has_zero()" function to answer the "is there a zero byte" question without necessarily getting exactly *which* byte is the first one to contain a zero. If you do multiple byte lookups concurrently (eg "hash_name()", which looks for both NUL and '/' bytes), after you've done the prep_zero_mask() phase, the result of those can be or'ed together to get the "either or" case. - The result from "prep_zero_mask()" can then be fed into "find_zero()" (to find the byte offset of the first byte that was zero) or into "zero_bytemask()" (to find the bytemask of the bytes preceding the zero byte). The existence of zero_bytemask() is optional, and is not necessary for the normal string routines. But dentry name hashing needs it, so if you enable DENTRY_WORD_AT_A_TIME you need to expose it. This changes the generic strncpy_from_user() function and the dentry hashing functions to use these modified word-at-a-time interfaces. This gets us back to the optimized state of the x86 strncpy that we lost in the previous commit when moving over to the generic version. 
Signed-off-by: Linus Torvalds --- arch/openrisc/include/asm/Kbuild | 1 + arch/sparc/include/asm/Kbuild | 1 + arch/x86/include/asm/word-at-a-time.h | 32 +++++++++++++++++++-- fs/namei.c | 22 ++++++++------- include/asm-generic/word-at-a-time.h | 52 +++++++++++++++++++++++++++++++++++ lib/strncpy_from_user.c | 47 +++++-------------------------- 6 files changed, 102 insertions(+), 53 deletions(-) create mode 100644 include/asm-generic/word-at-a-time.h (limited to 'arch/x86') diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index c936483bc8e2..3f35c38d7b64 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild @@ -66,3 +66,4 @@ generic-y += topology.h generic-y += types.h generic-y += ucontext.h generic-y += user.h +generic-y += word-at-a-time.h diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index 2c2e38821f60..67f83e0a0d68 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -21,3 +21,4 @@ generic-y += div64.h generic-y += local64.h generic-y += irq_regs.h generic-y += local.h +generic-y += word-at-a-time.h diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h index ae03facfadd6..5b238981542a 100644 --- a/arch/x86/include/asm/word-at-a-time.h +++ b/arch/x86/include/asm/word-at-a-time.h @@ -10,6 +10,11 @@ * bit count instruction, that might be better than the multiply * and shift, for example. 
*/ +struct word_at_a_time { + const unsigned long one_bits, high_bits; +}; + +#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) } #ifdef CONFIG_64BIT @@ -37,10 +42,31 @@ static inline long count_masked_bytes(long mask) #endif -/* Return the high bit set in the first byte that is a zero */ -static inline unsigned long has_zero(unsigned long a) +/* Return nonzero if it has a zero */ +static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c) +{ + unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits; + *bits = mask; + return mask; +} + +static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c) +{ + return bits; +} + +static inline unsigned long create_zero_mask(unsigned long bits) +{ + bits = (bits - 1) & ~bits; + return bits >> 7; +} + +/* The mask we created is directly usable as a bytemask */ +#define zero_bytemask(mask) (mask) + +static inline unsigned long find_zero(unsigned long mask) { - return ((a - REPEAT_BYTE(0x01)) & ~a) & REPEAT_BYTE(0x80); + return count_masked_bytes(mask); } /* diff --git a/fs/namei.c b/fs/namei.c index 93ff12b1a1de..c651f02c9fec 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1452,7 +1452,8 @@ EXPORT_SYMBOL(full_name_hash); */ static inline unsigned long hash_name(const char *name, unsigned int *hashp) { - unsigned long a, mask, hash, len; + unsigned long a, b, adata, bdata, mask, hash, len; + const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; hash = a = 0; len = -sizeof(unsigned long); @@ -1460,17 +1461,18 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp) hash = (hash + a) * 9; len += sizeof(unsigned long); a = load_unaligned_zeropad(name+len); - /* Do we have any NUL or '/' bytes in this word? 
*/ - mask = has_zero(a) | has_zero(a ^ REPEAT_BYTE('/')); - } while (!mask); - - /* The mask *below* the first high bit set */ - mask = (mask - 1) & ~mask; - mask >>= 7; - hash += a & mask; + b = a ^ REPEAT_BYTE('/'); + } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))); + + adata = prep_zero_mask(a, adata, &constants); + bdata = prep_zero_mask(b, bdata, &constants); + + mask = create_zero_mask(adata | bdata); + + hash += a & zero_bytemask(mask); *hashp = fold_hash(hash); - return len + count_masked_bytes(mask); + return len + find_zero(mask); } #else diff --git a/include/asm-generic/word-at-a-time.h b/include/asm-generic/word-at-a-time.h new file mode 100644 index 000000000000..3f21f1b72e45 --- /dev/null +++ b/include/asm-generic/word-at-a-time.h @@ -0,0 +1,52 @@ +#ifndef _ASM_WORD_AT_A_TIME_H +#define _ASM_WORD_AT_A_TIME_H + +/* + * This says "generic", but it's actually big-endian only. + * Little-endian can use more efficient versions of these + * interfaces, see for example + * arch/x86/include/asm/word-at-a-time.h + * for those. + */ + +#include + +struct word_at_a_time { + const unsigned long high_bits, low_bits; +}; + +#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0xfe) + 1, REPEAT_BYTE(0x7f) } + +/* Bit set in the bytes that have a zero */ +static inline long prep_zero_mask(unsigned long val, unsigned long rhs, const struct word_at_a_time *c) +{ + unsigned long mask = (val & c->low_bits) + c->low_bits; + return ~(mask | rhs); +} + +#define create_zero_mask(mask) (mask) + +static inline long find_zero(unsigned long mask) +{ + long byte = 0; +#ifdef CONFIG_64BIT + if (mask >> 32) + mask >>= 32; + else + byte = 4; +#endif + if (mask >> 16) + mask >>= 16; + else + byte += 2; + return (mask >> 8) ? 
byte : byte + 1; +} + +static inline bool has_zero(unsigned long val, unsigned long *data, const struct word_at_a_time *c) +{ + unsigned long rhs = val | c->low_bits; + *data = rhs; + return (val + c->high_bits) & ~rhs; +} + +#endif /* _ASM_WORD_AT_A_TIME_H */ diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index c4c09b0e96ba..bb2b201d6ad0 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -4,37 +4,7 @@ #include #include - -static inline long find_zero(unsigned long mask) -{ - long byte = 0; - -#ifdef __BIG_ENDIAN -#ifdef CONFIG_64BIT - if (mask >> 32) - mask >>= 32; - else - byte = 4; -#endif - if (mask >> 16) - mask >>= 16; - else - byte += 2; - return (mask >> 8) ? byte : byte + 1; -#else -#ifdef CONFIG_64BIT - if (!((unsigned int) mask)) { - mask >>= 32; - byte = 4; - } -#endif - if (!(mask & 0xffff)) { - mask >>= 16; - byte += 2; - } - return (mask & 0xff) ? byte : byte + 1; -#endif -} +#include #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS #define IS_UNALIGNED(src, dst) 0 @@ -51,8 +21,7 @@ static inline long find_zero(unsigned long mask) */ static inline long do_strncpy_from_user(char *dst, const char __user *src, long count, unsigned long max) { - const unsigned long high_bits = REPEAT_BYTE(0xfe) + 1; - const unsigned long low_bits = REPEAT_BYTE(0x7f); + const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; long res = 0; /* @@ -66,18 +35,16 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long goto byte_at_a_time; while (max >= sizeof(unsigned long)) { - unsigned long c, v, rhs; + unsigned long c, data; /* Fall back to byte-at-a-time if we get a page fault */ if (unlikely(__get_user(c,(unsigned long __user *)(src+res)))) break; - rhs = c | low_bits; - v = (c + high_bits) & ~rhs; *(unsigned long *)(dst+res) = c; - if (v) { - v = (c & low_bits) + low_bits; - v = ~(v | rhs); - return res + find_zero(v); + if (has_zero(c, &data, &constants)) { + data = prep_zero_mask(c, data, &constants); 
+ data = create_zero_mask(data); + return res + find_zero(data); } res += sizeof(unsigned long); max -= sizeof(unsigned long); -- cgit v1.2.3 From 5723aa993d83803157c22327e90cd59e3dcbe879 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 26 May 2012 11:09:53 -0700 Subject: x86: use the new generic strnlen_user() function This throws away the old x86-specific functions in favor of the generic optimized version. Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/include/asm/uaccess.h | 3 +++ arch/x86/include/asm/uaccess_32.h | 17 -------------- arch/x86/include/asm/uaccess_64.h | 3 --- arch/x86/lib/usercopy_32.c | 41 --------------------------------- arch/x86/lib/usercopy_64.c | 48 --------------------------------------- 6 files changed, 4 insertions(+), 109 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3220d44e24d0..d700811785ea 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -94,6 +94,7 @@ config X86 select GENERIC_TIME_VSYSCALL if X86_64 select KTIME_SCALAR if X86_32 select GENERIC_STRNCPY_FROM_USER + select GENERIC_STRNLEN_USER config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS || UPROBES) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 1354facd8f63..04cd6882308e 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -566,6 +566,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n); extern __must_check long strncpy_from_user(char *dst, const char __user *src, long count); +extern __must_check long strlen_user(const char __user *str); +extern __must_check long strnlen_user(const char __user *str, long n); + /* * movsl can be slow when source and dest are not both 8-byte aligned */ diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 8084bc73b18c..576e39bca6ad 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -213,23 
+213,6 @@ static inline unsigned long __must_check copy_from_user(void *to, return n; } -/** - * strlen_user: - Get the size of a string in user space. - * @str: The string to measure. - * - * Context: User context only. This function may sleep. - * - * Get the size of a NUL-terminated string in user space. - * - * Returns the size of the string INCLUDING the terminating NUL. - * On exception, returns 0. - * - * If there is a limit on the length of a valid string, you may wish to - * consider using strnlen_user() instead. - */ -#define strlen_user(str) strnlen_user(str, LONG_MAX) - -long strnlen_user(const char __user *str, long n); unsigned long __must_check clear_user(void __user *mem, unsigned long len); unsigned long __must_check __clear_user(void __user *mem, unsigned long len); diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index fcd4b6f3ef02..8e796fbbf9c6 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -208,9 +208,6 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) } } -__must_check long strnlen_user(const char __user *str, long n); -__must_check long __strnlen_user(const char __user *str, long n); -__must_check long strlen_user(const char __user *str); __must_check unsigned long clear_user(void __user *mem, unsigned long len); __must_check unsigned long __clear_user(void __user *mem, unsigned long len); diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 883b216c60b2..1781b2f950e2 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -95,47 +95,6 @@ __clear_user(void __user *to, unsigned long n) } EXPORT_SYMBOL(__clear_user); -/** - * strnlen_user: - Get the size of a string in user space. - * @s: The string to measure. - * @n: The maximum valid length - * - * Get the size of a NUL-terminated string in user space. - * - * Returns the size of the string INCLUDING the terminating NUL. 
- * On exception, returns 0. - * If the string is too long, returns a value greater than @n. - */ -long strnlen_user(const char __user *s, long n) -{ - unsigned long mask = -__addr_ok(s); - unsigned long res, tmp; - - might_fault(); - - __asm__ __volatile__( - " testl %0, %0\n" - " jz 3f\n" - " andl %0,%%ecx\n" - "0: repne; scasb\n" - " setne %%al\n" - " subl %%ecx,%0\n" - " addl %0,%%eax\n" - "1:\n" - ".section .fixup,\"ax\"\n" - "2: xorl %%eax,%%eax\n" - " jmp 1b\n" - "3: movb $1,%%al\n" - " jmp 1b\n" - ".previous\n" - _ASM_EXTABLE(0b,2b) - :"=&r" (n), "=&D" (s), "=&a" (res), "=&c" (tmp) - :"0" (n), "1" (s), "2" (0), "3" (mask) - :"cc"); - return res & mask; -} -EXPORT_SYMBOL(strnlen_user); - #ifdef CONFIG_X86_INTEL_USERCOPY static unsigned long __copy_user_intel(void __user *to, const void *from, unsigned long size) diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 0d0326f388c0..e5b130bc2d0e 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -52,54 +52,6 @@ unsigned long clear_user(void __user *to, unsigned long n) } EXPORT_SYMBOL(clear_user); -/* - * Return the size of a string (including the ending 0) - * - * Return 0 on exception, a value greater than N if too long - */ - -long __strnlen_user(const char __user *s, long n) -{ - long res = 0; - char c; - - while (1) { - if (res>n) - return n+1; - if (__get_user(c, s)) - return 0; - if (!c) - return res+1; - res++; - s++; - } -} -EXPORT_SYMBOL(__strnlen_user); - -long strnlen_user(const char __user *s, long n) -{ - if (!access_ok(VERIFY_READ, s, 1)) - return 0; - return __strnlen_user(s, n); -} -EXPORT_SYMBOL(strnlen_user); - -long strlen_user(const char __user *s) -{ - long res = 0; - char c; - - for (;;) { - if (get_user(c, s)) - return 0; - if (!c) - return res+1; - res++; - s++; - } -} -EXPORT_SYMBOL(strlen_user); - unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len) { if (access_ok(VERIFY_WRITE, to, len) && 
access_ok(VERIFY_READ, from, len)) { -- cgit v1.2.3 From c35866678391861942b3836c219a8898a259255a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 28 May 2012 14:10:43 +0800 Subject: KVM: MMU: fix huge page adapted on non-PAE host The huge page size is 4M on non-PAE host, but 2M page size is used in transparent_hugepage_adjust(), so the page we get after adjust the mapping level is not the head page, the BUG_ON() will be triggered Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 72102e0ab7cb..be3cea4407ff 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2595,8 +2595,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, *gfnp = gfn; kvm_release_pfn_clean(pfn); pfn &= ~mask; - if (!get_page_unless_zero(pfn_to_page(pfn))) - BUG(); + kvm_get_pfn(pfn); *pfnp = pfn; } } -- cgit v1.2.3 From 91eb0f67c38c7104766faa49c5aaee2b4876511e Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 29 May 2012 15:06:28 -0700 Subject: x86: print e820 physical addresses consistently with other parts of kernel Print physical address info in a style consistent with the %pR style used elsewhere in the kernel. For example: -BIOS-provided physical RAM map: +e820: BIOS-provided physical RAM map: - BIOS-e820: 0000000000000100 - 000000000009e000 (usable) +BIOS-e820: [mem 0x0000000000000100-0x000000000009dfff] usable -Allocating PCI resources starting at 90000000 (gap: 90000000:6ed1c000) +e820: [mem 0x90000000-0xfed1bfff] available for PCI devices -reserve RAM buffer: 000000000009e000 - 000000000009ffff +e820: reserve RAM buffer [mem 0x0009e000-0x0009ffff] Signed-off-by: Bjorn Helgaas Cc: Yinghai Lu Cc: Konrad Rzeszutek Wilk Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/e820.c | 53 +++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 62d61e9976eb..41857970517f 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -113,7 +113,9 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, int x = e820x->nr_map; if (x >= ARRAY_SIZE(e820x->map)) { - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); + printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", + (unsigned long long) start, + (unsigned long long) (start + size - 1)); return; } @@ -133,19 +135,19 @@ static void __init e820_print_type(u32 type) switch (type) { case E820_RAM: case E820_RESERVED_KERN: - printk(KERN_CONT "(usable)"); + printk(KERN_CONT "usable"); break; case E820_RESERVED: - printk(KERN_CONT "(reserved)"); + printk(KERN_CONT "reserved"); break; case E820_ACPI: - printk(KERN_CONT "(ACPI data)"); + printk(KERN_CONT "ACPI data"); break; case E820_NVS: - printk(KERN_CONT "(ACPI NVS)"); + printk(KERN_CONT "ACPI NVS"); break; case E820_UNUSABLE: - printk(KERN_CONT "(unusable)"); + printk(KERN_CONT "unusable"); break; default: printk(KERN_CONT "type %u", type); @@ -158,10 +160,10 @@ void __init e820_print_map(char *who) int i; for (i = 0; i < e820.nr_map; i++) { - printk(KERN_INFO " %s: %016Lx - %016Lx ", who, + printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who, (unsigned long long) e820.map[i].addr, (unsigned long long) - (e820.map[i].addr + e820.map[i].size)); + (e820.map[i].addr + e820.map[i].size - 1)); e820_print_type(e820.map[i].type); printk(KERN_CONT "\n"); } @@ -428,9 +430,8 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, size = ULLONG_MAX - start; end = start + size; - printk(KERN_DEBUG "e820 update range: %016Lx 
- %016Lx ", - (unsigned long long) start, - (unsigned long long) end); + printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", + (unsigned long long) start, (unsigned long long) (end - 1)); e820_print_type(old_type); printk(KERN_CONT " ==> "); e820_print_type(new_type); @@ -509,9 +510,8 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, size = ULLONG_MAX - start; end = start + size; - printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ", - (unsigned long long) start, - (unsigned long long) end); + printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", + (unsigned long long) start, (unsigned long long) (end - 1)); if (checktype) e820_print_type(old_type); printk(KERN_CONT "\n"); @@ -567,7 +567,7 @@ void __init update_e820(void) if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) return; e820.nr_map = nr_map; - printk(KERN_INFO "modified physical RAM map:\n"); + printk(KERN_INFO "e820: modified physical RAM map:\n"); e820_print_map("modified"); } static void __init update_e820_saved(void) @@ -637,8 +637,8 @@ __init void e820_setup_gap(void) if (!found) { gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; printk(KERN_ERR - "PCI: Warning: Cannot find a gap in the 32bit address range\n" - "PCI: Unassigned devices with 32bit resource registers may break!\n"); + "e820: cannot find a gap in the 32bit address range\n" + "e820: PCI devices with unassigned 32bit BARs may break!\n"); } #endif @@ -648,8 +648,8 @@ __init void e820_setup_gap(void) pci_mem_start = gapstart; printk(KERN_INFO - "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", - pci_mem_start, gapstart, gapsize); + "e820: [mem %#010lx-%#010lx] available for PCI devices\n", + gapstart, gapstart + gapsize - 1); } /** @@ -667,7 +667,7 @@ void __init parse_e820_ext(struct setup_data *sdata) extmap = (struct e820entry *)(sdata->data); __append_e820_map(extmap, entries); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - printk(KERN_INFO "extended physical 
RAM map:\n"); + printk(KERN_INFO "e820: extended physical RAM map:\n"); e820_print_map("extended"); } @@ -734,7 +734,7 @@ u64 __init early_reserve_e820(u64 size, u64 align) addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); if (addr) { e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED); - printk(KERN_INFO "update e820_saved for early_reserve_e820\n"); + printk(KERN_INFO "e820: update e820_saved for early_reserve_e820\n"); update_e820_saved(); } @@ -784,7 +784,7 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) if (last_pfn > max_arch_pfn) last_pfn = max_arch_pfn; - printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n", + printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n", last_pfn, max_arch_pfn); return last_pfn; } @@ -888,7 +888,7 @@ void __init finish_e820_parsing(void) early_panic("Invalid user supplied memory map"); e820.nr_map = nr; - printk(KERN_INFO "user-defined physical RAM map:\n"); + printk(KERN_INFO "e820: user-defined physical RAM map:\n"); e820_print_map("user"); } } @@ -996,8 +996,9 @@ void __init e820_reserve_resources_late(void) end = MAX_RESOURCE_SIZE; if (start >= end) continue; - printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ", - start, end); + printk(KERN_DEBUG + "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", + start, end); reserve_region_with_split(&iomem_resource, start, end, "RAM buffer"); } @@ -1047,7 +1048,7 @@ void __init setup_memory_map(void) who = x86_init.resources.memory_setup(); memcpy(&e820_saved, &e820, sizeof(struct e820map)); - printk(KERN_INFO "BIOS-provided physical RAM map:\n"); + printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n"); e820_print_map(who); } -- cgit v1.2.3 From 365811d6f9bd98543bedc02b72d94f0f0faf3670 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 29 May 2012 15:06:29 -0700 Subject: x86: print physical addresses consistently with other parts of kernel Print physical address info in a style consistent 
with the %pR style used elsewhere in the kernel. For example: -found SMP MP-table at [ffff8800000fce90] fce90 +found SMP MP-table at [mem 0x000fce90-0x000fce9f] mapped at [ffff8800000fce90] -initial memory mapped : 0 - 20000000 +initial memory mapped: [mem 0x00000000-0x1fffffff] -Base memory trampoline at [ffff88000009c000] 9c000 size 8192 +Base memory trampoline [mem 0x0009c000-0x0009dfff] mapped at [ffff88000009c000] -SRAT: Node 0 PXM 0 0-80000000 +SRAT: Node 0 PXM 0 [mem 0x00000000-0x7fffffff] Signed-off-by: Bjorn Helgaas Cc: Yinghai Lu Cc: Konrad Rzeszutek Wilk Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/mpparse.c | 10 ++++++---- arch/x86/kernel/setup.c | 16 ++++++++-------- arch/x86/mm/init.c | 16 +++++++++------- arch/x86/mm/numa.c | 32 ++++++++++++++++---------------- arch/x86/mm/numa_emulation.c | 4 ++-- arch/x86/mm/pat.c | 42 +++++++++++++++++++----------------------- arch/x86/mm/srat.c | 5 +++-- 7 files changed, 63 insertions(+), 62 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index b02d4dd6b8a3..fbca2e6223bf 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -568,8 +568,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) struct mpf_intel *mpf; unsigned long mem; - apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", - bp, length); + apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n", + base, base + length - 1); BUILD_BUG_ON(sizeof(*mpf) != 16); while (length > 0) { @@ -584,8 +584,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) #endif mpf_found = mpf; - printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", - mpf, (u64)virt_to_phys(mpf)); + printk(KERN_INFO "found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n", + (unsigned long long) virt_to_phys(mpf), + (unsigned long long) 
virt_to_phys(mpf) + + sizeof(*mpf) - 1, mpf); mem = virt_to_phys(mpf); memblock_reserve(mem, sizeof(*mpf)); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f2afee6a19c1..982e44f960db 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -334,8 +334,8 @@ static void __init relocate_initrd(void) memblock_reserve(ramdisk_here, area_size); initrd_start = ramdisk_here + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; - printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", - ramdisk_here, ramdisk_here + ramdisk_size); + printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", + ramdisk_here, ramdisk_here + ramdisk_size - 1); q = (char *)initrd_start; @@ -366,8 +366,8 @@ static void __init relocate_initrd(void) /* high pages is not converted by early_res_to_bootmem */ ramdisk_image = boot_params.hdr.ramdisk_image; ramdisk_size = boot_params.hdr.ramdisk_size; - printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to" - " %08llx - %08llx\n", + printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" + " [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_image + ramdisk_size - 1, ramdisk_here, ramdisk_here + ramdisk_size - 1); } @@ -392,8 +392,8 @@ static void __init reserve_initrd(void) ramdisk_size, end_of_lowmem>>1); } - printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image, - ramdisk_end); + printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, + ramdisk_end - 1); if (ramdisk_end <= end_of_lowmem) { @@ -906,8 +906,8 @@ void __init setup_arch(char **cmdline_p) setup_bios_corruption_check(); #endif - printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", - max_pfn_mapped<> PAGE_SHIFT); - printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT); + printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n", + end - 1, pgt_buf_start << PAGE_SHIFT, + (pgt_buf_top << PAGE_SHIFT) - 
1); } void __init native_pagetable_reserve(u64 start, u64 end) @@ -132,7 +133,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, int nr_range, i; int use_pse, use_gbpages; - printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end); + printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n", + start, end - 1); #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK) /* @@ -251,8 +253,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, } for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG " %010lx - %010lx page %s\n", - mr[i].start, mr[i].end, + printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n", + mr[i].start, mr[i].end - 1, (mr[i].page_size_mask & (1<> PAGE_SHIFT); #else /* diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 19d3fa08b119..2d125be1bae9 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -141,8 +141,8 @@ static int __init numa_add_memblk_to(int nid, u64 start, u64 end, /* whine about and ignore invalid blks */ if (start > end || nid < 0 || nid >= MAX_NUMNODES) { - pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", - nid, start, end); + pr_warning("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", + nid, start, end - 1); return 0; } @@ -210,8 +210,8 @@ static void __init setup_node_data(int nid, u64 start, u64 end) start = roundup(start, ZONE_ALIGN); - printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n", - nid, start, end); + printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", + nid, start, end - 1); /* * Allocate node data. Try remap allocator first, node-local @@ -232,7 +232,7 @@ static void __init setup_node_data(int nid, u64 start, u64 end) } /* report and initialize */ - printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n", + printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]%s\n", nd_pa, nd_pa + nd_size - 1, remapped ? 
" (remapped)" : ""); tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); if (!remapped && tnid != nid) @@ -291,14 +291,14 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) */ if (bi->end > bj->start && bi->start < bj->end) { if (bi->nid != bj->nid) { - pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", - bi->nid, bi->start, bi->end, - bj->nid, bj->start, bj->end); + pr_err("NUMA: node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, + bj->nid, bj->start, bj->end - 1); return -EINVAL; } - pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", - bi->nid, bi->start, bi->end, - bj->start, bj->end); + pr_warning("NUMA: Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, + bj->start, bj->end - 1); } /* @@ -320,9 +320,9 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) } if (k < mi->nr_blks) continue; - printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n", - bi->nid, bi->start, bi->end, bj->start, bj->end, - start, end); + printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n", + bi->nid, bi->start, bi->end - 1, bj->start, + bj->end - 1, start, end - 1); bi->start = start; bi->end = end; numa_remove_memblk_from(j--, mi); @@ -616,8 +616,8 @@ static int __init dummy_numa_init(void) { printk(KERN_INFO "%s\n", numa_off ? 
"NUMA turned off" : "No NUMA configuration found"); - printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n", - 0LLU, PFN_PHYS(max_pfn)); + printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n", + 0LLU, PFN_PHYS(max_pfn) - 1); node_set(0, numa_nodes_parsed); numa_add_memblk(0, 0, PFN_PHYS(max_pfn)); diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 871dd8868170..dbbbb47260cc 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -68,8 +68,8 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei, numa_remove_memblk_from(phys_blk, pi); } - printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, - eb->start, eb->end, (eb->end - eb->start) >> 20); + printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n", + nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20); return 0; } diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index f6ff57b7efa5..f11729fd019c 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -209,9 +209,8 @@ static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, page = pfn_to_page(pfn); type = get_page_memtype(page); if (type != -1) { - printk(KERN_INFO "reserve_ram_pages_type failed " - "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n", - start, end, type, req_type); + printk(KERN_INFO "reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%lx, req 0x%lx\n", + start, end - 1, type, req_type); if (new_type) *new_type = type; @@ -314,9 +313,9 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, err = rbt_memtype_check_insert(new, new_type); if (err) { - printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, " - "track %s, req %s\n", - start, end, cattr_name(new->type), cattr_name(req_type)); + printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n", + start, end - 1, + cattr_name(new->type), cattr_name(req_type)); kfree(new); spin_unlock(&memtype_lock); @@ -325,8 +324,8 @@ int reserve_memtype(u64 
start, u64 end, unsigned long req_type, spin_unlock(&memtype_lock); - dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", - start, end, cattr_name(new->type), cattr_name(req_type), + dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n", + start, end - 1, cattr_name(new->type), cattr_name(req_type), new_type ? cattr_name(*new_type) : "-"); return err; @@ -360,14 +359,14 @@ int free_memtype(u64 start, u64 end) spin_unlock(&memtype_lock); if (!entry) { - printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n", - current->comm, current->pid, start, end); + printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", + current->comm, current->pid, start, end - 1); return -EINVAL; } kfree(entry); - dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); + dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1); return 0; } @@ -491,9 +490,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) while (cursor < to) { if (!devmem_is_allowed(pfn)) { - printk(KERN_INFO - "Program %s tried to access /dev/mem between %Lx->%Lx.\n", - current->comm, from, to); + printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n", + current->comm, from, to - 1); return 0; } cursor += PAGE_SIZE; @@ -554,12 +552,11 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) size; if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) { - printk(KERN_INFO - "%s:%d ioremap_change_attr failed %s " - "for %Lx-%Lx\n", + printk(KERN_INFO "%s:%d ioremap_change_attr failed %s " + "for [mem %#010Lx-%#010Lx]\n", current->comm, current->pid, cattr_name(flags), - base, (unsigned long long)(base + size)); + base, (unsigned long long)(base + size-1)); return -EINVAL; } return 0; @@ -591,12 +588,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, flags = lookup_memtype(paddr); if (want_flags != flags) { 
- printk(KERN_WARNING - "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n", + printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, cattr_name(want_flags), (unsigned long long)paddr, - (unsigned long long)(paddr + size), + (unsigned long long)(paddr + size - 1), cattr_name(flags)); *vma_prot = __pgprot((pgprot_val(*vma_prot) & (~_PAGE_CACHE_MASK)) | @@ -614,11 +610,11 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, !is_new_memtype_allowed(paddr, size, want_flags, flags)) { free_memtype(paddr, paddr + size); printk(KERN_ERR "%s:%d map pfn expected mapping type %s" - " for %Lx-%Lx, got %s\n", + " for [mem %#010Lx-%#010Lx], got %s\n", current->comm, current->pid, cattr_name(want_flags), (unsigned long long)paddr, - (unsigned long long)(paddr + size), + (unsigned long long)(paddr + size - 1), cattr_name(flags)); return -EINVAL; } diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index efb5b4b93711..732af3a96183 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -176,8 +176,9 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) return; } - printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, - start, end); + printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", + node, pxm, + (unsigned long long) start, (unsigned long long) end - 1); } void __init acpi_numa_arch_fixup(void) {} -- cgit v1.2.3 From 26c191788f18129af0eb32a358cdaea0c7479626 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 29 May 2012 15:06:49 -0700 Subject: mm: pmd_read_atomic: fix 32bit PAE pmd walk vs pmd_populate SMP race condition When holding the mmap_sem for reading, pmd_offset_map_lock should only run on a pmd_t that has been read atomically from the pmdp pointer, otherwise we may read only half of it leading to this crash. 
PID: 11679 TASK: f06e8000 CPU: 3 COMMAND: "do_race_2_panic" #0 [f06a9dd8] crash_kexec at c049b5ec #1 [f06a9e2c] oops_end at c083d1c2 #2 [f06a9e40] no_context at c0433ded #3 [f06a9e64] bad_area_nosemaphore at c043401a #4 [f06a9e6c] __do_page_fault at c0434493 #5 [f06a9eec] do_page_fault at c083eb45 #6 [f06a9f04] error_code (via page_fault) at c083c5d5 EAX: 01fb470c EBX: fff35000 ECX: 00000003 EDX: 00000100 EBP: 00000000 DS: 007b ESI: 9e201000 ES: 007b EDI: 01fb4700 GS: 00e0 CS: 0060 EIP: c083bc14 ERR: ffffffff EFLAGS: 00010246 #7 [f06a9f38] _spin_lock at c083bc14 #8 [f06a9f44] sys_mincore at c0507b7d #9 [f06a9fb0] system_call at c083becd start len EAX: ffffffda EBX: 9e200000 ECX: 00001000 EDX: 6228537f DS: 007b ESI: 00000000 ES: 007b EDI: 003d0f00 SS: 007b ESP: 62285354 EBP: 62285388 GS: 0033 CS: 0073 EIP: 00291416 ERR: 000000da EFLAGS: 00000286 This should be a longstanding bug affecting x86 32bit PAE without THP. Only archs with 64bit large pmd_t and 32bit unsigned long should be affected. With THP enabled the barrier() in pmd_none_or_trans_huge_or_clear_bad() would partly hide the bug when the pmd transitions from none to stable, by forcing a re-read of the *pmd in pmd_offset_map_lock, but when THP is enabled a new set of problems arises from the fact that the pmd could then transition freely between any of the none, pmd_trans_huge or pmd_trans_stable states. So making the barrier in pmd_none_or_trans_huge_or_clear_bad() unconditional isn't a good idea and it would be a flaky solution. This should be fully fixed by introducing a pmd_read_atomic that reads the pmd in order with THP disabled, or by reading the pmd atomically with cmpxchg8b with THP enabled. Luckily this new race condition only triggers in the places that must already be covered by pmd_none_or_trans_huge_or_clear_bad() so the fix is localized there but this bug is not related to THP.
NOTE: this can trigger on x86 32bit systems with PAE enabled with more than 4G of RAM; otherwise the high part of the pmd is never at risk of being truncated, because it is zero at all times, which in turn hides the SMP race. This bug was discovered and fully debugged by Ulrich, quote: ---- [..] pmd_none_or_trans_huge_or_clear_bad() loads the content of edx and eax. 496 static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) 497 { 498 /* depend on compiler for an atomic pmd read */ 499 pmd_t pmdval = *pmd; // edi = pmd pointer 0xc0507a74 : mov 0x8(%esp),%edi ... // edx = PTE page table high address 0xc0507a84 : mov 0x4(%edi),%edx ... // eax = PTE page table low address 0xc0507a8e : mov (%edi),%eax [..] Please note that the PMD is not read atomically. These are two "mov" instructions where the high order bits of the PMD entry are fetched first. Hence, the above machine code is prone to the following race. - The PMD entry {high|low} is 0x0000000000000000. The "mov" at 0xc0507a84 loads 0x00000000 into edx. - A page fault (on another CPU) sneaks in between the two "mov" instructions and instantiates the PMD. - The PMD entry {high|low} is now 0x00000003fda38067. The "mov" at 0xc0507a8e loads 0xfda38067 into eax.
---- Reported-by: Ulrich Obergfell Signed-off-by: Andrea Arcangeli Cc: Mel Gorman Cc: Hugh Dickins Cc: Larry Woodman Cc: Petr Matousek Cc: Rik van Riel Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable-3level.h | 50 +++++++++++++++++++++++++++++++++++ include/asm-generic/pgtable.h | 22 +++++++++++++-- 2 files changed, 70 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index effff47a3c82..43876f16caf1 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -31,6 +31,56 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte) ptep->pte_low = pte.pte_low; } +#define pmd_read_atomic pmd_read_atomic +/* + * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with + * a "*pmdp" dereference done by gcc. Problem is, in certain places + * where pte_offset_map_lock is called, concurrent page faults are + * allowed, if the mmap_sem is hold for reading. An example is mincore + * vs page faults vs MADV_DONTNEED. On the page fault side + * pmd_populate rightfully does a set_64bit, but if we're reading the + * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen + * because gcc will not read the 64bit of the pmd atomically. To fix + * this all places running pmd_offset_map_lock() while holding the + * mmap_sem in read mode, shall read the pmdp pointer using this + * function to know if the pmd is null nor not, and in turn to know if + * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd + * operations. + * + * Without THP if the mmap_sem is hold for reading, the + * pmd can only transition from null to not null while pmd_read_atomic runs. + * So there's no need of literally reading it atomically. 
+ * + * With THP if the mmap_sem is hold for reading, the pmd can become + * THP or null or point to a pte (and in turn become "stable") at any + * time under pmd_read_atomic, so it's mandatory to read it atomically + * with cmpxchg8b. + */ +#ifndef CONFIG_TRANSPARENT_HUGEPAGE +static inline pmd_t pmd_read_atomic(pmd_t *pmdp) +{ + pmdval_t ret; + u32 *tmp = (u32 *)pmdp; + + ret = (pmdval_t) (*tmp); + if (ret) { + /* + * If the low part is null, we must not read the high part + * or we can end up with a partial pmd. + */ + smp_rmb(); + ret |= ((pmdval_t)*(tmp + 1)) << 32; + } + + return (pmd_t) { ret }; +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline pmd_t pmd_read_atomic(pmd_t *pmdp) +{ + return (pmd_t) { atomic64_read((atomic64_t *)pmdp) }; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) { set_64bit((unsigned long long *)(ptep), native_pte_val(pte)); diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index e2768f188f55..6f2b45a9b6bc 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -445,6 +445,18 @@ static inline int pmd_write(pmd_t pmd) #endif /* __HAVE_ARCH_PMD_WRITE */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#ifndef pmd_read_atomic +static inline pmd_t pmd_read_atomic(pmd_t *pmdp) +{ + /* + * Depend on compiler for an atomic pmd read. NOTE: this is + * only going to work, if the pmdval_t isn't larger than + * an unsigned long. + */ + return *pmdp; +} +#endif + /* * This function is meant to be used by sites walking pagetables with * the mmap_sem hold in read mode to protect against MADV_DONTNEED and @@ -458,11 +470,17 @@ static inline int pmd_write(pmd_t pmd) * undefined so behaving like if the pmd was none is safe (because it * can return none anyway). The compiler level barrier() is critically * important to compute the two checks atomically on the same pmdval. 
+ * + * For 32bit kernels with a 64bit large pmd_t this automatically takes + * care of reading the pmd atomically to avoid SMP race conditions + * against pmd_populate() when the mmap_sem is hold for reading by the + * caller (a special atomic read not done by "gcc" as in the generic + * version above, is also needed when THP is disabled because the page + * fault can populate the pmd from under us). */ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) { - /* depend on compiler for an atomic pmd read */ - pmd_t pmdval = *pmd; + pmd_t pmdval = pmd_read_atomic(pmd); /* * The barrier will stabilize the pmdval in a register or on * the stack so that it will stop changing under the code. -- cgit v1.2.3 From fa83523f45fbb403eba4ebc5704bf98aa4da0163 Mon Sep 17 00:00:00 2001 From: John Dykstra Date: Fri, 25 May 2012 16:12:46 -0500 Subject: x86/mm/pat: Improve scaling of pat_pagerange_is_ram() Function pat_pagerange_is_ram() scales poorly to large address ranges, because it probes the resource tree for each page. On a 2.6 GHz Opteron, this function consumes 34 ms for a 1 GB range. It is called twice during untrack_pfn_vma(), slowing process cleanup and handicapping the OOM killer. This replacement consumes less than 1ms, under the same conditions. Signed-off-by: John Dykstra on behalf of Cray Inc. 
Acked-by: Suresh Siddha Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1337980366.1979.6.camel@redwood [ Small stylistic cleanups and renames ] Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 56 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index f6ff57b7efa5..bea6e573e02b 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -158,31 +158,47 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) return req_type; } +struct pagerange_state { + unsigned long cur_pfn; + int ram; + int not_ram; +}; + +static int +pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg) +{ + struct pagerange_state *state = arg; + + state->not_ram |= initial_pfn > state->cur_pfn; + state->ram |= total_nr_pages > 0; + state->cur_pfn = initial_pfn + total_nr_pages; + + return state->ram && state->not_ram; +} + static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) { - int ram_page = 0, not_rampage = 0; - unsigned long page_nr; + int ret = 0; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + struct pagerange_state state = {start_pfn, 0, 0}; - for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); - ++page_nr) { - /* - * For legacy reasons, physical address range in the legacy ISA - * region is tracked as non-RAM. This will allow users of - * /dev/mem to map portions of legacy ISA region, even when - * some of those portions are listed(or not even listed) with - * different e820 types(RAM/reserved/..) 
- */ - if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) && - page_is_ram(page_nr)) - ram_page = 1; - else - not_rampage = 1; - - if (ram_page == not_rampage) - return -1; + /* + * For legacy reasons, physical address range in the legacy ISA + * region is tracked as non-RAM. This will allow users of + * /dev/mem to map portions of legacy ISA region, even when + * some of those portions are listed(or not even listed) with + * different e820 types(RAM/reserved/..) + */ + if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT) + start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT; + + if (start_pfn < end_pfn) { + ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, + &state, pagerange_is_ram_callback); } - return ram_page; + return (ret > 0) ? -1 : (state.ram ? 1 : 0); } /* -- cgit v1.2.3 From 9f646389aa7727a2fd8f9ae6337b92af9cfbc264 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 May 2012 16:39:09 +0200 Subject: sched/x86: Use cpu_llc_shared_mask(cpu) for coregroup_mask Commit commit 8e7fbcbc2 ("sched: Remove stale power aware scheduling remnants and dysfunctional knobs") made a boo-boo with removing the power aware scheduling muck from the x86 topology bits. We should unconditionally use the llc_shared mask for multi-core. 
Reported-and-tested-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Borislav Petkov Cc: Andreas Herrmann Link: http://lkml.kernel.org/n/tip-lsksc2kfyeveb13avh327p0d@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f56f96da77f5..fd019d78b1f4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -410,15 +410,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) /* maps the cpu to the sched domain representing multi-core */ const struct cpumask *cpu_coregroup_mask(int cpu) { - struct cpuinfo_x86 *c = &cpu_data(cpu); - /* - * For perf, we return last level cache shared map. - * And for power savings, we return cpu_core_map - */ - if (!(cpu_has(c, X86_FEATURE_AMD_DCM))) - return cpu_core_mask(cpu); - else - return cpu_llc_shared_mask(cpu); + return cpu_llc_shared_mask(cpu); } static void impress_friends(void) -- cgit v1.2.3 From 319b6ffc6df892e4ccffff823cc5521a4a5d2dca Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 30 May 2012 12:33:41 +0300 Subject: x86, realmode: Unbreak the ia64 build of drivers/acpi/sleep.c Revert usage of acpi_wakeup_address and move definition to x86 architecture code in order to make compilation work in ia64. [jsakkine: tested compilation in ia64/x86-64 and added proper commit message] Reported-by: Paul Gortmaker Originally-by: H. Peter Anvin Signed-off-by: Jarkko Sakkinen Link: http://lkml.kernel.org/r/1338370421-27735-1-git-send-email-jarkko.sakkinen@intel.com Cc: Tony Luck Cc: Len Brown Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/acpi.h | 7 +++---- drivers/acpi/sleep.c | 8 ++------ 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 724aa441de7d..0c44630d1789 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -29,6 +29,7 @@ #include #include #include +#include #define COMPILER_DEPENDENT_INT64 long long #define COMPILER_DEPENDENT_UINT64 unsigned long long @@ -116,10 +117,8 @@ static inline void acpi_disable_pci(void) /* Low-level suspend routine. */ extern int acpi_suspend_lowlevel(void); -extern const unsigned char acpi_wakeup_code[]; - -/* early initialization routine */ -extern void acpi_reserve_wakeup_memory(void); +/* Physical address to resume after wakeup */ +#define acpi_wakeup_address ((unsigned long)(real_mode_header->wakeup_start)) /* * Check if the CPU can handle C2 and deeper diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index ebaa04593236..74ee4ab577b6 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -25,8 +25,6 @@ #include #include -#include - #include "internal.h" #include "sleep.h" @@ -93,13 +91,11 @@ static struct notifier_block tts_notifier = { static int acpi_sleep_prepare(u32 acpi_state) { #ifdef CONFIG_ACPI_SLEEP - unsigned long wakeup_pa = real_mode_header->wakeup_start; /* do we have a wakeup address for S2 and S3? */ if (acpi_state == ACPI_STATE_S3) { - if (!wakeup_pa) + if (!acpi_wakeup_address) return -EFAULT; - acpi_set_firmware_waking_vector( - (acpi_physical_address)wakeup_pa); + acpi_set_firmware_waking_vector(acpi_wakeup_address); } ACPI_FLUSH_CPU_CACHE(); -- cgit v1.2.3 From 2da06af8106f8f35318bb084baf8448797ef058a Mon Sep 17 00:00:00 2001 From: "zhenzhong.duan" Date: Wed, 30 May 2012 12:52:15 +0800 Subject: x86, mtrr: Fix a type overflow in range_to_mtrr func When boot on sun G5+ with 4T mem, see an overflow in mtrr cleanup as below. 
*BAD*gran_size: 2G chunk_size: 2G num_reg: 10 lose cover RAM: -18014398505283592M This is because 1<<31 sign extended. Use an unsigned long constant to fix it. Useful for mem larger than or equal to 4T. -v2: Use 64bit constant instead of explicit type conversion as suggested by Yinghai. Description updated too. Signed-off-by: Zhenzhong Duan Link: http://lkml.kernel.org/r/4FC5A77F.6060505@oracle.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/cleanup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index ac140c7be396..bdda2e6c673b 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -266,7 +266,7 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, if (align > max_align) align = max_align; - sizek = 1 << align; + sizek = 1UL << align; if (debug_print) { char start_factor = 'K', size_factor = 'K'; unsigned long start_base, size_base; -- cgit v1.2.3 From 82f7af09e6fb58fb725c850d725d5e8780a9bec2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 24 May 2012 17:54:51 +0000 Subject: x86/mce: Cleanup timer mess Use unsigned long for dealing with jiffies not int. Rename the callback to something sensible. Use __this_cpu_read/write for accessing per cpu data. Signed-off-by: Thomas Gleixner Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5f793e6c854b..98003bfc5556 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1243,15 +1243,15 @@ void mce_log_therm_throt_event(__u64 status) * poller finds an MCE, poll 2x faster. When the poller finds no more * errors, poll 2x slower (up to check_interval seconds). 
*/ -static int check_interval = 5 * 60; /* 5 minutes */ +static unsigned long check_interval = 5 * 60; /* 5 minutes */ -static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ +static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); -static void mce_start_timer(unsigned long data) +static void mce_timer_fn(unsigned long data) { - struct timer_list *t = &per_cpu(mce_timer, data); - int *n; + struct timer_list *t = &__get_cpu_var(mce_timer); + unsigned long iv; WARN_ON(smp_processor_id() != data); @@ -1264,13 +1264,14 @@ static void mce_start_timer(unsigned long data) * Alert userspace if needed. If we logged an MCE, reduce the * polling interval, otherwise increase the polling interval. */ - n = &__get_cpu_var(mce_next_interval); + iv = __this_cpu_read(mce_next_interval); if (mce_notify_irq()) - *n = max(*n/2, HZ/100); + iv = max(iv, (unsigned long) HZ/100); else - *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); + iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); + __this_cpu_write(mce_next_interval, iv); - t->expires = jiffies + *n; + t->expires = jiffies + iv; add_timer_on(t, smp_processor_id()); } @@ -1511,17 +1512,17 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) static void __mcheck_cpu_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); - int *n = &__get_cpu_var(mce_next_interval); + unsigned long iv = __this_cpu_read(mce_next_interval); - setup_timer(t, mce_start_timer, smp_processor_id()); + setup_timer(t, mce_timer_fn, smp_processor_id()); if (mce_ignore_ce) return; - *n = check_interval * HZ; - if (!*n) + __this_cpu_write(mce_next_interval, iv); + if (!iv) return; - t->expires = round_jiffies(jiffies + *n); + t->expires = round_jiffies(jiffies + iv); add_timer_on(t, smp_processor_id()); } @@ -2231,7 +2232,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DOWN_FAILED_FROZEN: 
if (!mce_ignore_ce && check_interval) { t->expires = round_jiffies(jiffies + - __get_cpu_var(mce_next_interval)); + per_cpu(mce_next_interval, cpu)); add_timer_on(t, cpu); } smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); -- cgit v1.2.3 From 1ab46fd319bcf1fcd9fb6311727d532b580e4eba Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 30 May 2012 18:23:56 -0400 Subject: x86, amd, xen: Avoid NULL pointer paravirt references Stub out MSR methods that aren't actually needed. This fixes a crash as Xen Dom0 on AMD Trinity systems. A bigger patch should be added to remove the paravirt machinery completely for the methods which apparently have no users! Reported-by: Andre Przywara Link: http://lkml.kernel.org/r/20120530222356.GA28417@andromeda.dapyr.net Signed-off-by: H. Peter Anvin Cc: --- arch/x86/xen/enlighten.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 75f33b2a5933..e74df9548a02 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1116,7 +1116,10 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, + .rdmsr_regs = native_rdmsr_safe_regs, .write_msr = xen_write_msr_safe, + .wrmsr_regs = native_wrmsr_safe_regs, + .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, -- cgit v1.2.3 From bb8ac181a5cf50458a0d83b4460790badc9fdc16 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 May 2012 10:25:23 -0400 Subject: bury __kernel_nlink_t, make internal nlink_t consistent Signed-off-by: Al Viro --- arch/alpha/include/asm/posix_types.h | 3 --- arch/arm/include/asm/posix_types.h | 3 --- arch/avr32/include/asm/posix_types.h | 3 --- arch/blackfin/include/asm/posix_types.h | 3 --- arch/cris/include/asm/posix_types.h | 3 --- arch/frv/include/asm/posix_types.h | 3 --- arch/h8300/include/asm/posix_types.h | 3 --- arch/ia64/include/asm/posix_types.h | 3 --- 
arch/m32r/include/asm/posix_types.h | 3 --- arch/m68k/include/asm/posix_types.h | 3 --- arch/mips/include/asm/posix_types.h | 5 ----- arch/mn10300/include/asm/posix_types.h | 3 --- arch/parisc/include/asm/posix_types.h | 3 --- arch/powerpc/include/asm/posix_types.h | 3 --- arch/s390/include/asm/posix_types.h | 3 --- arch/sh/include/asm/posix_types_32.h | 2 -- arch/sh/include/asm/posix_types_64.h | 2 -- arch/sparc/include/asm/posix_types.h | 5 ----- arch/tile/include/asm/compat.h | 1 - arch/x86/include/asm/posix_types_32.h | 3 --- include/asm-generic/posix_types.h | 4 ---- include/linux/types.h | 2 +- 22 files changed, 1 insertion(+), 65 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/posix_types.h b/arch/alpha/include/asm/posix_types.h index 24779fc95994..5a8a48320efe 100644 --- a/arch/alpha/include/asm/posix_types.h +++ b/arch/alpha/include/asm/posix_types.h @@ -10,9 +10,6 @@ typedef unsigned int __kernel_ino_t; #define __kernel_ino_t __kernel_ino_t -typedef unsigned int __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned long __kernel_sigset_t; /* at least 32 bits */ #include diff --git a/arch/arm/include/asm/posix_types.h b/arch/arm/include/asm/posix_types.h index efdf99045d87..d2de9cbbcd9b 100644 --- a/arch/arm/include/asm/posix_types.h +++ b/arch/arm/include/asm/posix_types.h @@ -22,9 +22,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/arch/avr32/include/asm/posix_types.h b/arch/avr32/include/asm/posix_types.h index 74667bfc88cc..9ba9e749b3f3 100644 --- a/arch/avr32/include/asm/posix_types.h +++ b/arch/avr32/include/asm/posix_types.h @@ -17,9 +17,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define 
__kernel_nlink_t __kernel_nlink_t - typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/arch/blackfin/include/asm/posix_types.h b/arch/blackfin/include/asm/posix_types.h index 41bc1875c4d7..1bd3436db6a7 100644 --- a/arch/blackfin/include/asm/posix_types.h +++ b/arch/blackfin/include/asm/posix_types.h @@ -10,9 +10,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned int __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/arch/cris/include/asm/posix_types.h b/arch/cris/include/asm/posix_types.h index 234891c74e2b..ce4e51793151 100644 --- a/arch/cris/include/asm/posix_types.h +++ b/arch/cris/include/asm/posix_types.h @@ -15,9 +15,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/arch/frv/include/asm/posix_types.h b/arch/frv/include/asm/posix_types.h index 3f34cb45fbb3..fe512af74a5a 100644 --- a/arch/frv/include/asm/posix_types.h +++ b/arch/frv/include/asm/posix_types.h @@ -10,9 +10,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/arch/h8300/include/asm/posix_types.h b/arch/h8300/include/asm/posix_types.h index bc4c34efb1ad..91e62ba4c7b0 100644 --- a/arch/h8300/include/asm/posix_types.h +++ b/arch/h8300/include/asm/posix_types.h @@ -10,9 +10,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef 
unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/arch/ia64/include/asm/posix_types.h b/arch/ia64/include/asm/posix_types.h index 7323ab9467eb..99ee1d6510cf 100644 --- a/arch/ia64/include/asm/posix_types.h +++ b/arch/ia64/include/asm/posix_types.h @@ -1,9 +1,6 @@ #ifndef _ASM_IA64_POSIX_TYPES_H #define _ASM_IA64_POSIX_TYPES_H -typedef unsigned int __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned long __kernel_sigset_t; /* at least 32 bits */ #include diff --git a/arch/m32r/include/asm/posix_types.h b/arch/m32r/include/asm/posix_types.h index 0195850e1f88..236de26a409b 100644 --- a/arch/m32r/include/asm/posix_types.h +++ b/arch/m32r/include/asm/posix_types.h @@ -10,9 +10,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/arch/m68k/include/asm/posix_types.h b/arch/m68k/include/asm/posix_types.h index 6373093be72b..cf4dbf70fdc7 100644 --- a/arch/m68k/include/asm/posix_types.h +++ b/arch/m68k/include/asm/posix_types.h @@ -10,9 +10,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/arch/mips/include/asm/posix_types.h b/arch/mips/include/asm/posix_types.h index e0308dcca135..fa03ec3fbf89 100644 --- a/arch/mips/include/asm/posix_types.h +++ b/arch/mips/include/asm/posix_types.h @@ -17,11 +17,6 @@ * assume GCC is being used. 
*/ -#if (_MIPS_SZLONG == 64) -typedef unsigned int __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t -#endif - typedef long __kernel_daddr_t; #define __kernel_daddr_t __kernel_daddr_t diff --git a/arch/mn10300/include/asm/posix_types.h b/arch/mn10300/include/asm/posix_types.h index ab506181ec31..d31eeea480cf 100644 --- a/arch/mn10300/include/asm/posix_types.h +++ b/arch/mn10300/include/asm/posix_types.h @@ -20,9 +20,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/arch/parisc/include/asm/posix_types.h b/arch/parisc/include/asm/posix_types.h index 5212b0357daf..b9344256f76b 100644 --- a/arch/parisc/include/asm/posix_types.h +++ b/arch/parisc/include/asm/posix_types.h @@ -10,9 +10,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/arch/powerpc/include/asm/posix_types.h b/arch/powerpc/include/asm/posix_types.h index f1393252bbda..2958c5b97b2d 100644 --- a/arch/powerpc/include/asm/posix_types.h +++ b/arch/powerpc/include/asm/posix_types.h @@ -16,9 +16,6 @@ typedef int __kernel_ssize_t; typedef long __kernel_ptrdiff_t; #define __kernel_size_t __kernel_size_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t #endif diff --git a/arch/s390/include/asm/posix_types.h b/arch/s390/include/asm/posix_types.h index edf8527ff08d..7be104c0f192 100644 --- a/arch/s390/include/asm/posix_types.h +++ b/arch/s390/include/asm/posix_types.h @@ -24,7 +24,6 @@ typedef unsigned short __kernel_old_dev_t; typedef unsigned long 
__kernel_ino_t; typedef unsigned short __kernel_mode_t; -typedef unsigned short __kernel_nlink_t; typedef unsigned short __kernel_ipc_pid_t; typedef unsigned short __kernel_uid_t; typedef unsigned short __kernel_gid_t; @@ -35,7 +34,6 @@ typedef int __kernel_ptrdiff_t; typedef unsigned int __kernel_ino_t; typedef unsigned int __kernel_mode_t; -typedef unsigned int __kernel_nlink_t; typedef int __kernel_ipc_pid_t; typedef unsigned int __kernel_uid_t; typedef unsigned int __kernel_gid_t; @@ -47,7 +45,6 @@ typedef unsigned long __kernel_sigset_t; /* at least 32 bits */ #define __kernel_ino_t __kernel_ino_t #define __kernel_mode_t __kernel_mode_t -#define __kernel_nlink_t __kernel_nlink_t #define __kernel_ipc_pid_t __kernel_ipc_pid_t #define __kernel_uid_t __kernel_uid_t #define __kernel_gid_t __kernel_gid_t diff --git a/arch/sh/include/asm/posix_types_32.h b/arch/sh/include/asm/posix_types_32.h index abda58467ece..ba0bdc423b07 100644 --- a/arch/sh/include/asm/posix_types_32.h +++ b/arch/sh/include/asm/posix_types_32.h @@ -3,8 +3,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t typedef unsigned short __kernel_uid_t; diff --git a/arch/sh/include/asm/posix_types_64.h b/arch/sh/include/asm/posix_types_64.h index fcda07b4a616..244f7e950e17 100644 --- a/arch/sh/include/asm/posix_types_64.h +++ b/arch/sh/include/asm/posix_types_64.h @@ -3,8 +3,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t typedef unsigned short __kernel_uid_t; diff --git a/arch/sparc/include/asm/posix_types.h b/arch/sparc/include/asm/posix_types.h index 3070f25ae90a..156220ed99eb 100644 --- 
a/arch/sparc/include/asm/posix_types.h +++ b/arch/sparc/include/asm/posix_types.h @@ -9,8 +9,6 @@ #if defined(__sparc__) && defined(__arch64__) /* sparc 64 bit */ -typedef unsigned int __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t typedef unsigned short __kernel_old_uid_t; typedef unsigned short __kernel_old_gid_t; @@ -38,9 +36,6 @@ typedef unsigned short __kernel_gid_t; typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef long __kernel_daddr_t; #define __kernel_daddr_t __kernel_daddr_t diff --git a/arch/tile/include/asm/compat.h b/arch/tile/include/asm/compat.h index 69adc08d36a5..6e74450ff0a1 100644 --- a/arch/tile/include/asm/compat.h +++ b/arch/tile/include/asm/compat.h @@ -44,7 +44,6 @@ typedef __kernel_uid32_t __compat_gid32_t; typedef __kernel_mode_t compat_mode_t; typedef __kernel_dev_t compat_dev_t; typedef __kernel_loff_t compat_loff_t; -typedef __kernel_nlink_t compat_nlink_t; typedef __kernel_ipc_pid_t compat_ipc_pid_t; typedef __kernel_daddr_t compat_daddr_t; typedef __kernel_fsid_t compat_fsid_t; diff --git a/arch/x86/include/asm/posix_types_32.h b/arch/x86/include/asm/posix_types_32.h index 99f262e04b91..8e525059e7d8 100644 --- a/arch/x86/include/asm/posix_types_32.h +++ b/arch/x86/include/asm/posix_types_32.h @@ -10,9 +10,6 @@ typedef unsigned short __kernel_mode_t; #define __kernel_mode_t __kernel_mode_t -typedef unsigned short __kernel_nlink_t; -#define __kernel_nlink_t __kernel_nlink_t - typedef unsigned short __kernel_ipc_pid_t; #define __kernel_ipc_pid_t __kernel_ipc_pid_t diff --git a/include/asm-generic/posix_types.h b/include/asm-generic/posix_types.h index 91d44bd4dde3..fe74fccf18db 100644 --- a/include/asm-generic/posix_types.h +++ b/include/asm-generic/posix_types.h @@ -23,10 +23,6 @@ typedef __kernel_ulong_t __kernel_ino_t; typedef unsigned int __kernel_mode_t; #endif -#ifndef __kernel_nlink_t -typedef 
__kernel_ulong_t __kernel_nlink_t; -#endif - #ifndef __kernel_pid_t typedef int __kernel_pid_t; #endif diff --git a/include/linux/types.h b/include/linux/types.h index 7f480db60231..9c1bd539ea70 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -25,7 +25,7 @@ typedef __kernel_dev_t dev_t; typedef __kernel_ino_t ino_t; typedef __kernel_mode_t mode_t; typedef unsigned short umode_t; -typedef __kernel_nlink_t nlink_t; +typedef __u32 nlink_t; typedef __kernel_off_t off_t; typedef __kernel_pid_t pid_t; typedef __kernel_daddr_t daddr_t; -- cgit v1.2.3 From d97b46a64674a267bc41c9e16132ee2a98c3347d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 31 May 2012 16:26:44 -0700 Subject: syscalls, x86: add __NR_kcmp syscall While doing checkpoint-restore in user space one needs to determine whether various kernel objects (like mm_struct-s or file_struct-s) are shared between tasks and restore this state. The 2nd step can be solved by using appropriate CLONE_ flags and the unshare syscall, while there is currently no way to solve the 1st one. One of the ways for checking whether two tasks share e.g. mm_struct is to provide some mm_struct ID of a task to its proc file, but showing such info is considered not that good for security reasons. Thus after some debate we ended up concluding that a so-called 'comparison' syscall might be the best candidate. So here it is -- __NR_kcmp. It takes up to 5 arguments - the pids of the two tasks (whose characteristics should be compared), the comparison type and (in case of comparison of files) two file descriptors. Lookups for pids are done in the caller's PID namespace only. At the moment only x86 is supported and tested. [akpm@linux-foundation.org: fix up selftests, warnings] [akpm@linux-foundation.org: include errno.h] [akpm@linux-foundation.org: tweak comment text] Signed-off-by: Cyrill Gorcunov Acked-by: "Eric W.
Biederman" Cc: Pavel Emelyanov Cc: Andrey Vagin Cc: KOSAKI Motohiro Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Glauber Costa Cc: Andi Kleen Cc: Tejun Heo Cc: Matt Helsley Cc: Pekka Enberg Cc: Eric Dumazet Cc: Vasiliy Kulikov Cc: Alexey Dobriyan Cc: Valdis.Kletnieks@vt.edu Cc: Michal Marek Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/syscalls/syscall_32.tbl | 1 + arch/x86/syscalls/syscall_64.tbl | 2 + include/linux/kcmp.h | 17 +++ include/linux/syscalls.h | 2 + kernel/Makefile | 3 + kernel/kcmp.c | 196 +++++++++++++++++++++++++++++++ kernel/sys_ni.c | 3 + tools/testing/selftests/Makefile | 2 +- tools/testing/selftests/kcmp/Makefile | 29 +++++ tools/testing/selftests/kcmp/kcmp_test.c | 94 +++++++++++++++ 10 files changed, 348 insertions(+), 1 deletion(-) create mode 100644 include/linux/kcmp.h create mode 100644 kernel/kcmp.c create mode 100644 tools/testing/selftests/kcmp/Makefile create mode 100644 tools/testing/selftests/kcmp/kcmp_test.c (limited to 'arch/x86') diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index 29f9f0554f7d..7a35a6e71d44 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -355,3 +355,4 @@ 346 i386 setns sys_setns 347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv 348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev +349 i386 kcmp sys_kcmp diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index dd29a9ea27c5..51171aeff0dc 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -318,6 +318,8 @@ 309 common getcpu sys_getcpu 310 64 process_vm_readv sys_process_vm_readv 311 64 process_vm_writev sys_process_vm_writev +312 64 kcmp sys_kcmp + # # x32-specific system call numbers start at 512 to avoid cache impact # for native 64-bit operation. 
diff --git a/include/linux/kcmp.h b/include/linux/kcmp.h new file mode 100644 index 000000000000..2dcd1b3aafc8 --- /dev/null +++ b/include/linux/kcmp.h @@ -0,0 +1,17 @@ +#ifndef _LINUX_KCMP_H +#define _LINUX_KCMP_H + +/* Comparison type */ +enum kcmp_type { + KCMP_FILE, + KCMP_VM, + KCMP_FILES, + KCMP_FS, + KCMP_SIGHAND, + KCMP_IO, + KCMP_SYSVSEM, + + KCMP_TYPES, +}; + +#endif /* _LINUX_KCMP_H */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3de3acb84a95..19439c75c5b2 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -858,4 +858,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid, unsigned long riovcnt, unsigned long flags); +asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type, + unsigned long idx1, unsigned long idx2); #endif diff --git a/kernel/Makefile b/kernel/Makefile index 6c07f30fa9b7..80be6ca0cc75 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -25,6 +25,9 @@ endif obj-y += sched/ obj-y += power/ +ifeq ($(CONFIG_CHECKPOINT_RESTORE),y) +obj-$(CONFIG_X86) += kcmp.o +endif obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o diff --git a/kernel/kcmp.c b/kernel/kcmp.c new file mode 100644 index 000000000000..30b7b225306c --- /dev/null +++ b/kernel/kcmp.c @@ -0,0 +1,196 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * We don't expose the real in-memory order of objects for security reasons. + * But still the comparison results should be suitable for sorting. So we + * obfuscate kernel pointer values and compare the products instead. + * + * The obfuscation is done in two steps. First we xor the kernel pointer with + * a random value, which puts the pointer into a new position in a reordered space.
+ * Secondly we multiply the xor product with a large odd random number to + * permute its bits even more (the odd multiplier guarantees that the product + * is unique even after the high bits are truncated, since any odd number is + * relatively prime to 2^n). + * + * Note also that the obfuscation itself is invisible to userspace and if needed + * it can be changed to an alternate scheme. + */ +static unsigned long cookies[KCMP_TYPES][2] __read_mostly; + +static long kptr_obfuscate(long v, int type) +{ + return (v ^ cookies[type][0]) * cookies[type][1]; +} + +/* + * 0 - equal, i.e. v1 = v2 + * 1 - less than, i.e. v1 < v2 + * 2 - greater than, i.e. v1 > v2 + * 3 - not equal but ordering unavailable (reserved for future) + */ +static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type) +{ + long ret; + + ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type); + + return (ret < 0) | ((ret > 0) << 1); +} + +/* The caller must have pinned the task */ +static struct file * +get_file_raw_ptr(struct task_struct *task, unsigned int idx) +{ + struct file *file = NULL; + + task_lock(task); + rcu_read_lock(); + + if (task->files) + file = fcheck_files(task->files, idx); + + rcu_read_unlock(); + task_unlock(task); + + return file; +} + +static void kcmp_unlock(struct mutex *m1, struct mutex *m2) +{ + if (likely(m2 != m1)) + mutex_unlock(m2); + mutex_unlock(m1); +} + +static int kcmp_lock(struct mutex *m1, struct mutex *m2) +{ + int err; + + if (m2 > m1) + swap(m1, m2); + + err = mutex_lock_killable(m1); + if (!err && likely(m1 != m2)) { + err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING); + if (err) + mutex_unlock(m1); + } + + return err; +} + +SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, + unsigned long, idx1, unsigned long, idx2) +{ + struct task_struct *task1, *task2; + int ret; + + rcu_read_lock(); + + /* + * Tasks are looked up in caller's PID namespace only.
+ */ + task1 = find_task_by_vpid(pid1); + task2 = find_task_by_vpid(pid2); + if (!task1 || !task2) + goto err_no_task; + + get_task_struct(task1); + get_task_struct(task2); + + rcu_read_unlock(); + + /* + * One should have enough rights to inspect task details. + */ + ret = kcmp_lock(&task1->signal->cred_guard_mutex, + &task2->signal->cred_guard_mutex); + if (ret) + goto err; + if (!ptrace_may_access(task1, PTRACE_MODE_READ) || + !ptrace_may_access(task2, PTRACE_MODE_READ)) { + ret = -EPERM; + goto err_unlock; + } + + switch (type) { + case KCMP_FILE: { + struct file *filp1, *filp2; + + filp1 = get_file_raw_ptr(task1, idx1); + filp2 = get_file_raw_ptr(task2, idx2); + + if (filp1 && filp2) + ret = kcmp_ptr(filp1, filp2, KCMP_FILE); + else + ret = -EBADF; + break; + } + case KCMP_VM: + ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM); + break; + case KCMP_FILES: + ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES); + break; + case KCMP_FS: + ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS); + break; + case KCMP_SIGHAND: + ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND); + break; + case KCMP_IO: + ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO); + break; + case KCMP_SYSVSEM: +#ifdef CONFIG_SYSVIPC + ret = kcmp_ptr(task1->sysvsem.undo_list, + task2->sysvsem.undo_list, + KCMP_SYSVSEM); +#else + ret = -EOPNOTSUPP; +#endif + break; + default: + ret = -EINVAL; + break; + } + +err_unlock: + kcmp_unlock(&task1->signal->cred_guard_mutex, + &task2->signal->cred_guard_mutex); +err: + put_task_struct(task1); + put_task_struct(task2); + + return ret; + +err_no_task: + rcu_read_unlock(); + return -ESRCH; +} + +static __init int kcmp_cookies_init(void) +{ + int i; + + get_random_bytes(cookies, sizeof(cookies)); + + for (i = 0; i < KCMP_TYPES; i++) + cookies[i][1] |= (~(~0UL >> 1) | 1); + + return 0; +} +arch_initcall(kcmp_cookies_init); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 47bfa16430d7..dbff751e4086 100644 --- a/kernel/sys_ni.c +++ 
b/kernel/sys_ni.c @@ -203,3 +203,6 @@ cond_syscall(sys_fanotify_mark); cond_syscall(sys_name_to_handle_at); cond_syscall(sys_open_by_handle_at); cond_syscall(compat_sys_open_by_handle_at); + +/* compare kernel pointers */ +cond_syscall(sys_kcmp); diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 14972017a43e..a4162e15c25f 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -1,4 +1,4 @@ -TARGETS = breakpoints mqueue vm +TARGETS = breakpoints kcmp mqueue vm all: for TARGET in $(TARGETS); do \ diff --git a/tools/testing/selftests/kcmp/Makefile b/tools/testing/selftests/kcmp/Makefile new file mode 100644 index 000000000000..dc79b86ea65c --- /dev/null +++ b/tools/testing/selftests/kcmp/Makefile @@ -0,0 +1,29 @@ +uname_M := $(shell uname -m 2>/dev/null || echo not) +ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/) +ifeq ($(ARCH),i386) + ARCH := X86 + CFLAGS := -DCONFIG_X86_32 -D__i386__ +endif +ifeq ($(ARCH),x86_64) + ARCH := X86 + CFLAGS := -DCONFIG_X86_64 -D__x86_64__ +endif + +CFLAGS += -I../../../../arch/x86/include/generated/ +CFLAGS += -I../../../../include/ +CFLAGS += -I../../../../usr/include/ +CFLAGS += -I../../../../arch/x86/include/ + +all: +ifeq ($(ARCH),X86) + gcc $(CFLAGS) kcmp_test.c -o run_test +else + echo "Not an x86 target, can't build kcmp selftest" +endif + +run-tests: all + ./kcmp_test + +clean: + rm -fr ./run_test + rm -fr ./test-file diff --git a/tools/testing/selftests/kcmp/kcmp_test.c b/tools/testing/selftests/kcmp/kcmp_test.c new file mode 100644 index 000000000000..358cc6bfa35d --- /dev/null +++ b/tools/testing/selftests/kcmp/kcmp_test.c @@ -0,0 +1,94 @@ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +static long sys_kcmp(int pid1, int pid2, int type, int fd1, int fd2) +{ + return syscall(__NR_kcmp, pid1, pid2, type, fd1, fd2); +} + +int main(int 
argc, char **argv) +{ + const char kpath[] = "kcmp-test-file"; + int pid1, pid2; + int fd1, fd2; + int status; + + fd1 = open(kpath, O_RDWR | O_CREAT | O_TRUNC, 0644); + pid1 = getpid(); + + if (fd1 < 0) { + perror("Can't create file"); + exit(1); + } + + pid2 = fork(); + if (pid2 < 0) { + perror("fork failed"); + exit(1); + } + + if (!pid2) { + int pid2 = getpid(); + int ret; + + fd2 = open(kpath, O_RDWR, 0644); + if (fd2 < 0) { + perror("Can't open file"); + exit(1); + } + + /* An example of output and arguments */ + printf("pid1: %6d pid2: %6d FD: %2ld FILES: %2ld VM: %2ld " + "FS: %2ld SIGHAND: %2ld IO: %2ld SYSVSEM: %2ld " + "INV: %2ld\n", + pid1, pid2, + sys_kcmp(pid1, pid2, KCMP_FILE, fd1, fd2), + sys_kcmp(pid1, pid2, KCMP_FILES, 0, 0), + sys_kcmp(pid1, pid2, KCMP_VM, 0, 0), + sys_kcmp(pid1, pid2, KCMP_FS, 0, 0), + sys_kcmp(pid1, pid2, KCMP_SIGHAND, 0, 0), + sys_kcmp(pid1, pid2, KCMP_IO, 0, 0), + sys_kcmp(pid1, pid2, KCMP_SYSVSEM, 0, 0), + + /* This one should fail */ + sys_kcmp(pid1, pid2, KCMP_TYPES + 1, 0, 0)); + + /* This one should return same fd */ + ret = sys_kcmp(pid1, pid2, KCMP_FILE, fd1, fd1); + if (ret) { + printf("FAIL: 0 expected but %d returned\n", ret); + ret = -1; + } else + printf("PASS: 0 returned as expected\n"); + + /* Compare with self */ + ret = sys_kcmp(pid1, pid1, KCMP_VM, 0, 0); + if (ret) { + printf("FAIL: 0 expected but %li returned\n", ret); + ret = -1; + } else + printf("PASS: 0 returned as expected\n"); + + exit(ret); + } + + waitpid(pid2, &status, P_ALL); + + return 0; +} -- cgit v1.2.3 From a192cd0413b71c2a3e4e48dd365af704be72b748 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 30 May 2012 13:26:37 -0400 Subject: ftrace: Synchronize variable setting with breakpoints When the function tracer starts modifying the code via breakpoints it sets a variable (modifying_ftrace_code) to inform the breakpoint handler to call the ftrace int3 code. 
But there's no synchronization between setting this code and the handler, thus it is possible for the handler to be called on another CPU before it sees the variable. This will cause a kernel crash as the int3 handler will not know what to do with it. I originally added smp_mb()'s to force the visibility of the variable but H. Peter Anvin suggested that I just make it atomic. [ Added comments as suggested by Peter Zijlstra ] Suggested-by: H. Peter Anvin Signed-off-by: Steven Rostedt --- arch/x86/include/asm/ftrace.h | 2 +- arch/x86/kernel/ftrace.c | 38 +++++++++++++++++++++++++++++++++++--- arch/x86/kernel/traps.c | 8 ++++++-- 3 files changed, 42 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 18d9005d9e4f..b0767bc08740 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -34,7 +34,7 @@ #ifndef __ASSEMBLY__ extern void mcount(void); -extern int modifying_ftrace_code; +extern atomic_t modifying_ftrace_code; static inline unsigned long ftrace_call_adjust(unsigned long addr) { diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 32ff36596ab1..2407a6d81cb7 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -168,7 +168,38 @@ int ftrace_update_ftrace_func(ftrace_func_t func) return ret; } -int modifying_ftrace_code __read_mostly; +/* + * The modifying_ftrace_code is used to tell the breakpoint + * handler to call ftrace_int3_handler(). If it fails to + * call this handler for a breakpoint added by ftrace, then + * the kernel may crash. + * + * As atomic_writes on x86 do not need a barrier, we do not + * need to add smp_mb()s for this to work. It is also considered + * that we can not read the modifying_ftrace_code before + * executing the breakpoint. That would be quite remarkable if + * it could do that. 
Here's the flow that is required: + * + * CPU-0 CPU-1 + * + * atomic_inc(mfc); + * write int3s + * // implicit (r)mb + * if (atomic_read(mfc)) + * call ftrace_int3_handler() + * + * Then when we are finished: + * + * atomic_dec(mfc); + * + * If we hit a breakpoint that was not set by ftrace, it does not + * matter if ftrace_int3_handler() is called or not. It will + * simply be ignored. But it is crucial that a ftrace nop/caller + * breakpoint is handled. No other user should ever place a + * breakpoint on an ftrace nop/caller location. It must only + * be done by this code. + */ +atomic_t modifying_ftrace_code __read_mostly; /* * A breakpoint was added to the code address we are about to @@ -491,11 +522,12 @@ void ftrace_replace_code(int enable) void arch_ftrace_update_code(int command) { - modifying_ftrace_code++; + /* See comment above by declaration of modifying_ftrace_code */ + atomic_inc(&modifying_ftrace_code); ftrace_modify_all_code(command); - modifying_ftrace_code--; + atomic_dec(&modifying_ftrace_code); } int __init ftrace_dyn_arch_init(void *data) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ff08457a025d..05b31d92f69c 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -303,8 +303,12 @@ gp_in_kernel: dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code) { #ifdef CONFIG_DYNAMIC_FTRACE - /* ftrace must be first, everything else may cause a recursive crash */ - if (unlikely(modifying_ftrace_code) && ftrace_int3_handler(regs)) + /* + * ftrace must be first, everything else may cause a recursive crash. 
+ * See note by declaration of modifying_ftrace_code in ftrace.c + */ + if (unlikely(atomic_read(&modifying_ftrace_code)) && + ftrace_int3_handler(regs)) return; #endif #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP -- cgit v1.2.3 From 8a4d0a687a599f39b7df3fe15f2d51d2157caf44 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 30 May 2012 13:36:38 -0400 Subject: ftrace: Use breakpoint method to update ftrace caller On boot up and module load, it is fine to modify the code directly, without the use of breakpoints. This is because boot up modification is done before SMP is initialized, thus the modification is serial, and module load is done before the module executes. But after that we must use a SMP safe method to modify running code. Otherwise, if we are running the function tracer and update its function (by starting off the stack tracer, or perf tracing) the change of the function called by the ftrace trampoline is done directly. If this is being executed on another CPU, that CPU may take a GPF and crash the kernel. The breakpoint method is used to change the nops at all the functions, but the change of the ftrace callback handler itself was still using a direct modification. If tracing was enabled and the function callback was changed then another CPU could fault if it was currently calling the original callback. This modification must use the breakpoint method too. Note, the direct method is still used for boot up and module load. 
Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 88 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 72 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 2407a6d81cb7..c3a7cb4bf6e6 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -100,7 +100,7 @@ static const unsigned char *ftrace_nop_replace(void) } static int -ftrace_modify_code(unsigned long ip, unsigned const char *old_code, +ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code, unsigned const char *new_code) { unsigned char replaced[MCOUNT_INSN_SIZE]; @@ -141,7 +141,20 @@ int ftrace_make_nop(struct module *mod, old = ftrace_call_replace(ip, addr); new = ftrace_nop_replace(); - return ftrace_modify_code(rec->ip, old, new); + /* + * On boot up, and when modules are loaded, the MCOUNT_ADDR + * is converted to a nop, and will never become MCOUNT_ADDR + * again. This code is either running before SMP (on boot up) + * or before the code will ever be executed (module load). + * We do not want to use the breakpoint version in this case, + * just modify the code directly. 
+ */ + if (addr == MCOUNT_ADDR) + return ftrace_modify_code_direct(rec->ip, old, new); + + /* Normal cases use add_brk_on_nop */ + WARN_ONCE(1, "invalid use of ftrace_make_nop"); + return -EINVAL; } int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) @@ -152,20 +165,8 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) old = ftrace_nop_replace(); new = ftrace_call_replace(ip, addr); - return ftrace_modify_code(rec->ip, old, new); -} - -int ftrace_update_ftrace_func(ftrace_func_t func) -{ - unsigned long ip = (unsigned long)(&ftrace_call); - unsigned char old[MCOUNT_INSN_SIZE], *new; - int ret; - - memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); - new = ftrace_call_replace(ip, (unsigned long)func); - ret = ftrace_modify_code(ip, old, new); - - return ret; + /* Should only be called when module is loaded */ + return ftrace_modify_code_direct(rec->ip, old, new); } /* @@ -201,6 +202,29 @@ int ftrace_update_ftrace_func(ftrace_func_t func) */ atomic_t modifying_ftrace_code __read_mostly; +static int +ftrace_modify_code(unsigned long ip, unsigned const char *old_code, + unsigned const char *new_code); + +int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip = (unsigned long)(&ftrace_call); + unsigned char old[MCOUNT_INSN_SIZE], *new; + int ret; + + memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); + new = ftrace_call_replace(ip, (unsigned long)func); + + /* See comment above by declaration of modifying_ftrace_code */ + atomic_inc(&modifying_ftrace_code); + + ret = ftrace_modify_code(ip, old, new); + + atomic_dec(&modifying_ftrace_code); + + return ret; +} + /* * A breakpoint was added to the code address we are about to * modify, and this is the handle that will just skip over it. 
@@ -520,6 +544,38 @@ void ftrace_replace_code(int enable) } } +static int +ftrace_modify_code(unsigned long ip, unsigned const char *old_code, + unsigned const char *new_code) +{ + int ret; + + ret = add_break(ip, old_code); + if (ret) + goto out; + + run_sync(); + + ret = add_update_code(ip, new_code); + if (ret) + goto fail_update; + + run_sync(); + + ret = ftrace_write(ip, new_code, 1); + if (ret) { + ret = -EPERM; + goto out; + } + run_sync(); + out: + return ret; + + fail_update: + probe_kernel_write((void *)ip, &old_code[0], 1); + goto out; +} + void arch_ftrace_update_code(int command) { /* See comment above by declaration of modifying_ftrace_code */ -- cgit v1.2.3 From c0525a6972d3f1fb83058ef503e183475d6e4e26 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 30 May 2012 11:43:19 -0400 Subject: x86: Reset the debug_stack update counter When an NMI goes off and it sees that it preempted the debug stack, to keep the debug stack safe, it changes the IDT to point to one that does not modify the stack on breakpoint (to allow breakpoints in NMIs). But the variable that gets set to know to undo it on exit never gets cleared on exit. Thus every NMI will reset it on exit the first time it is done even if it does not need to be reset. [ Added H. 
Peter Anvin's suggestion to use this_cpu_read/write ] Cc: # v3.3 Signed-off-by: Steven Rostedt --- arch/x86/kernel/nmi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 90875279ef3d..a0b2f84457be 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -444,14 +444,16 @@ static inline void nmi_nesting_preprocess(struct pt_regs *regs) */ if (unlikely(is_debug_stack(regs->sp))) { debug_stack_set_zero(); - __get_cpu_var(update_debug_stack) = 1; + this_cpu_write(update_debug_stack, 1); } } static inline void nmi_nesting_postprocess(void) { - if (unlikely(__get_cpu_var(update_debug_stack))) + if (unlikely(this_cpu_read(update_debug_stack))) { debug_stack_reset(); + this_cpu_write(update_debug_stack, 0); + } } #endif -- cgit v1.2.3 From f8988175fd70874d1fb3712b1c5d3bfc6d455202 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 30 May 2012 11:47:00 -0400 Subject: x86: Allow nesting of the debug stack IDT setting When the NMI handler runs, it checks if it preempted a debug handler and if that handler is using the debug stack. If it is, it changes the IDT table not to update the stack, otherwise it will reset the debug stack and corrupt the debug handler it preempted. Now that ftrace uses breakpoints to change functions from nops to callers, many more places may hit a breakpoint. Unfortunately this includes some of the calls that lockdep performs. Which causes issues with the debug stack. It too needs to change the debug stack before tracing (if called from the debug handler). Allow the debug_stack_set_zero() and debug_stack_reset() to be nested so that the debug handlers can take advantage of them too. [ Used this_cpu_*() over __get_cpu_var() as suggested by H. 
Peter Anvin ] Signed-off-by: Steven Rostedt --- arch/x86/kernel/cpu/common.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 82f29e70d058..6b9333b429ba 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1101,14 +1101,20 @@ int is_debug_stack(unsigned long addr) addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ)); } +static DEFINE_PER_CPU(u32, debug_stack_use_ctr); + void debug_stack_set_zero(void) { + this_cpu_inc(debug_stack_use_ctr); load_idt((const struct desc_ptr *)&nmi_idt_descr); } void debug_stack_reset(void) { - load_idt((const struct desc_ptr *)&idt_descr); + if (WARN_ON(!this_cpu_read(debug_stack_use_ctr))) + return; + if (this_cpu_dec_return(debug_stack_use_ctr) == 0) + load_idt((const struct desc_ptr *)&idt_descr); } #else /* CONFIG_X86_64 */ -- cgit v1.2.3 From 5963e317b1e9d2a4511503916d8fd664bb8fa8fb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 30 May 2012 11:54:53 -0400 Subject: ftrace/x86: Do not change stacks in DEBUG when calling lockdep When both DYNAMIC_FTRACE and LOCKDEP are set, the TRACE_IRQS_ON/OFF will call into the lockdep code. The lockdep code can call lots of functions that may be traced by ftrace. When ftrace is updating its code and hits a breakpoint, the breakpoint handler will call into lockdep. If lockdep happens to call a function that also has a breakpoint attached, it will jump back into the breakpoint handler resetting the stack to the debug stack and corrupt the contents currently on that stack. The 'do_sym' call that calls do_int3() is protected by modifying the IST table to point to a different location if another breakpoint is hit. 
But the TRACE_IRQS_OFF/ON are outside that protection, and if a breakpoint is hit from those, the stack will get corrupted, and the kernel will crash: [ 1013.243754] BUG: unable to handle kernel NULL pointer dereference at 0000000000000002 [ 1013.272665] IP: [] 0xffff880145cbffff [ 1013.285186] PGD 1401b2067 PUD 14324c067 PMD 0 [ 1013.298832] Oops: 0010 [#1] PREEMPT SMP [ 1013.310600] CPU 2 [ 1013.317904] Modules linked in: ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables crc32c_intel ghash_clmulni_intel microcode usb_debug serio_raw pcspkr iTCO_wdt i2c_i801 iTCO_vendor_support e1000e nfsd nfs_acl auth_rpcgss lockd sunrpc i915 video i2c_algo_bit drm_kms_helper drm i2c_core [last unloaded: scsi_wait_scan] [ 1013.401848] [ 1013.407399] Pid: 112, comm: kworker/2:1 Not tainted 3.4.0+ #30 [ 1013.437943] RIP: 8eb8:[] [] 0xffff880146309fff [ 1013.459871] RSP: ffffffff8165e919:ffff88014780f408 EFLAGS: 00010046 [ 1013.477909] RAX: 0000000000000001 RBX: ffffffff81104020 RCX: 0000000000000000 [ 1013.499458] RDX: ffff880148008ea8 RSI: ffffffff8131ef40 RDI: ffffffff82203b20 [ 1013.521612] RBP: ffffffff81005751 R08: 0000000000000000 R09: 0000000000000000 [ 1013.543121] R10: ffffffff82cdc318 R11: 0000000000000000 R12: ffff880145cc0000 [ 1013.564614] R13: ffff880148008eb8 R14: 0000000000000002 R15: ffff88014780cb40 [ 1013.586108] FS: 0000000000000000(0000) GS:ffff880148000000(0000) knlGS:0000000000000000 [ 1013.609458] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 1013.627420] CR2: 0000000000000002 CR3: 0000000141f10000 CR4: 00000000001407e0 [ 1013.649051] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1013.670724] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 1013.692376] Process kworker/2:1 (pid: 112, threadinfo ffff88013fe0e000, task ffff88014020a6a0) [ 1013.717028] Stack: [ 1013.724131] ffff88014780f570 ffff880145cc0000 0000400000004000 0000000000000000 [ 1013.745918] cccccccccccccccc 
ffff88014780cca8 ffffffff811072bb ffffffff81651627 [ 1013.767870] ffffffff8118f8a7 ffffffff811072bb ffffffff81f2b6c5 ffffffff81f11bdb [ 1013.790021] Call Trace: [ 1013.800701] Code: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a d7 64 81 ff ff ff ff 01 00 00 00 00 00 00 00 65 d9 64 81 ff [ 1013.861443] RIP [] 0xffff880146309fff [ 1013.884466] RSP [ 1013.901507] CR2: 0000000000000002 The solution was to reuse the NMI functions that change the IDT table to make the debug stack keep its current stack (in kernel mode) when hitting a breakpoint: call debug_stack_set_zero TRACE_IRQS_ON call debug_stack_reset If the TRACE_IRQS_ON happens to hit a breakpoint then it will keep the current stack and not crash the box. Reported-by: Dave Jones Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_64.S | 44 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 320852d02026..7d65133b51be 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -190,6 +190,44 @@ ENDPROC(native_usergs_sysret64) #endif .endm +/* + * When dynamic function tracer is enabled it will add a breakpoint + * to all locations that it is about to modify, sync CPUs, update + * all the code, sync CPUs, then remove the breakpoints. In this time + * if lockdep is enabled, it might jump back into the debug handler + * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF). + * + * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to + * make sure the stack pointer does not get reset back to the top + * of the debug stack, and instead just reuses the current stack. 
+ */ +#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS) + +.macro TRACE_IRQS_OFF_DEBUG + call debug_stack_set_zero + TRACE_IRQS_OFF + call debug_stack_reset +.endm + +.macro TRACE_IRQS_ON_DEBUG + call debug_stack_set_zero + TRACE_IRQS_ON + call debug_stack_reset +.endm + +.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET + bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ + jnc 1f + TRACE_IRQS_ON_DEBUG +1: +.endm + +#else +# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF +# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON +# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ +#endif + /* * C code is not supposed to know about undefined top of stack. Every time * a C function with an pt_regs argument is called from the SYSCALL based @@ -1098,7 +1136,7 @@ ENTRY(\sym) subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid - TRACE_IRQS_OFF + TRACE_IRQS_OFF_DEBUG movq %rsp,%rdi /* pt_regs pointer */ xorl %esi,%esi /* no error code */ subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) @@ -1393,7 +1431,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip) ENTRY(paranoid_exit) DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF + TRACE_IRQS_OFF_DEBUG testl %ebx,%ebx /* swapgs needed? */ jnz paranoid_restore testl $3,CS(%rsp) @@ -1404,7 +1442,7 @@ paranoid_swapgs: RESTORE_ALL 8 jmp irq_return paranoid_restore: - TRACE_IRQS_IRETQ 0 + TRACE_IRQS_IRETQ_DEBUG 0 RESTORE_ALL 8 jmp irq_return paranoid_userspace: -- cgit v1.2.3 From 30dc0d0fe5d08396dbdaa2d70972149131340960 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Thu, 15 Mar 2012 19:13:25 +0000 Subject: x86, efi: Only close open files in error path The loop at the 'close_handles' label in handle_ramdisks() should be using 'i', which represents the number of initrd files that were successfully opened, not 'nr_initrds' which is the number of initrd= arguments passed on the command line. 
Currently, if we execute the loop to close all file handles and we failed to open any initrds we'll try to call the close function on a garbage pointer, causing the machine to hang. Cc: Matthew Garrett Signed-off-by: Matt Fleming Link: http://lkml.kernel.org/r/1331907517-3985-2-git-send-email-matt@console-pimps.org Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/eboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 2c14e76bb4c7..52a4e667b258 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -674,7 +674,7 @@ free_initrd_total: low_free(initrd_total, initrd_addr); close_handles: - for (k = j; k < nr_initrds; k++) + for (k = j; k < i; k++) efi_call_phys1(fh->close, initrds[k].handle); free_initrds: efi_call_phys1(sys_table->boottime->free_pool, initrds); -- cgit v1.2.3 From 9fa7dedad3d30345c843bd82db02c4d6169e5f61 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Mon, 20 Feb 2012 13:20:59 +0000 Subject: x86, efi; Add EFI boot stub console support We need a way of printing useful messages to the user, for example when we fail to open an initrd file, instead of just hanging the machine without giving the user any indication of what went wrong. So sprinkle some error messages throughout the EFI boot stub code to make it easier for users to diagnose/report problems. Reported-by: Keshav P R Cc: Matthew Garrett Signed-off-by: Matt Fleming Link: http://lkml.kernel.org/r/1331907517-3985-3-git-send-email-matt@console-pimps.org Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/compressed/eboot.c | 85 ++++++++++++++++++++++++++++++++-------- arch/x86/boot/compressed/eboot.h | 6 +++ 2 files changed, 75 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 52a4e667b258..4e85f5f85837 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -16,6 +16,26 @@ static efi_system_table_t *sys_table; +static void efi_printk(char *str) +{ + char *s8; + + for (s8 = str; *s8; s8++) { + struct efi_simple_text_output_protocol *out; + efi_char16_t ch[2] = { 0 }; + + ch[0] = *s8; + out = (struct efi_simple_text_output_protocol *)sys_table->con_out; + + if (*s8 == '\n') { + efi_char16_t nl[2] = { '\r', 0 }; + efi_call_phys2(out->output_string, out, nl); + } + + efi_call_phys2(out->output_string, out, ch); + } +} + static efi_status_t __get_map(efi_memory_desc_t **map, unsigned long *map_size, unsigned long *desc_size) { @@ -531,8 +551,10 @@ static efi_status_t handle_ramdisks(efi_loaded_image_t *image, EFI_LOADER_DATA, nr_initrds * sizeof(*initrds), &initrds); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc mem for initrds\n"); goto fail; + } str = (char *)(unsigned long)hdr->cmd_line_ptr; for (i = 0; i < nr_initrds; i++) { @@ -575,32 +597,42 @@ static efi_status_t handle_ramdisks(efi_loaded_image_t *image, status = efi_call_phys3(boottime->handle_protocol, image->device_handle, &fs_proto, &io); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to handle fs_proto\n"); goto free_initrds; + } status = efi_call_phys2(io->open_volume, io, &fh); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to open volume\n"); goto free_initrds; + } } status = efi_call_phys5(fh->open, fh, &h, filename_16, EFI_FILE_MODE_READ, (u64)0); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to open 
initrd file\n"); goto close_handles; + } initrd->handle = h; info_sz = 0; status = efi_call_phys4(h->get_info, h, &info_guid, &info_sz, NULL); - if (status != EFI_BUFFER_TOO_SMALL) + if (status != EFI_BUFFER_TOO_SMALL) { + efi_printk("Failed to get initrd info size\n"); goto close_handles; + } grow: status = efi_call_phys3(sys_table->boottime->allocate_pool, EFI_LOADER_DATA, info_sz, &info); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc mem for initrd info\n"); goto close_handles; + } status = efi_call_phys4(h->get_info, h, &info_guid, &info_sz, info); @@ -612,8 +644,10 @@ grow: file_sz = info->file_size; efi_call_phys1(sys_table->boottime->free_pool, info); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to get initrd info\n"); goto close_handles; + } initrd->size = file_sz; initrd_total += file_sz; @@ -629,11 +663,14 @@ grow: */ status = high_alloc(initrd_total, 0x1000, &initrd_addr, hdr->initrd_addr_max); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc highmem for initrds\n"); goto close_handles; + } /* We've run out of free low memory. 
*/ if (initrd_addr > hdr->initrd_addr_max) { + efi_printk("We've run out of free low memory\n"); status = EFI_INVALID_PARAMETER; goto free_initrd_total; } @@ -652,8 +689,10 @@ grow: status = efi_call_phys3(fh->read, initrds[j].handle, &chunksize, addr); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to read initrd\n"); goto free_initrd_total; + } addr += chunksize; size -= chunksize; } @@ -732,8 +771,10 @@ static efi_status_t make_boot_params(struct boot_params *boot_params, options_size++; /* NUL termination */ status = low_alloc(options_size, 1, &cmdline); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc mem for cmdline\n"); goto fail; + } s1 = (u8 *)(unsigned long)cmdline; s2 = (u16 *)options; @@ -895,12 +936,16 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table) status = efi_call_phys3(sys_table->boottime->handle_protocol, handle, &proto, (void *)&image); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n"); goto fail; + } status = low_alloc(0x4000, 1, (unsigned long *)&boot_params); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc lowmem for boot params\n"); goto fail; + } memset(boot_params, 0x0, 0x4000); @@ -933,8 +978,10 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table) if (status != EFI_SUCCESS) { status = low_alloc(hdr->init_size, hdr->kernel_alignment, &start); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc mem for kernel\n"); goto fail; + } } hdr->code32_start = (__u32)start; @@ -945,19 +992,25 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table) status = efi_call_phys3(sys_table->boottime->allocate_pool, EFI_LOADER_DATA, sizeof(*gdt), (void **)&gdt); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc mem for gdt 
structure\n"); goto fail; + } gdt->size = 0x800; status = low_alloc(gdt->size, 8, (unsigned long *)&gdt->address); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc mem for gdt\n"); goto fail; + } status = efi_call_phys3(sys_table->boottime->allocate_pool, EFI_LOADER_DATA, sizeof(*idt), (void **)&idt); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc mem for idt structure\n"); goto fail; + } idt->size = 0; idt->address = 0; diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index 39251663e65b..3b6e15627c55 100644 --- a/arch/x86/boot/compressed/eboot.h +++ b/arch/x86/boot/compressed/eboot.h @@ -58,4 +58,10 @@ struct efi_uga_draw_protocol { void *blt; }; +struct efi_simple_text_output_protocol { + void *reset; + void *output_string; + void *test_string; +}; + #endif /* BOOT_COMPRESSED_EBOOT_H */ -- cgit v1.2.3 From 0c7596621e313bfcfbacb288e768c7150f5de9e0 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 16 Mar 2012 12:03:13 +0000 Subject: x86, efi: Add EFI boot stub documentation Since we can't expect every user to read the EFI boot stub code it seems prudent to have a couple of paragraphs explaining what it is and how it works. The "initrd=" option in particular is tricky because it only understands absolute EFI-style paths (backslashes as directory separators), and until now this hasn't been documented anywhere. This has tripped up a couple of users. Cc: Matthew Garrett Cc: Randy Dunlap Signed-off-by: Matt Fleming Link: http://lkml.kernel.org/r/1331907517-3985-4-git-send-email-matt@console-pimps.org Signed-off-by: H. 
Peter Anvin --- Documentation/x86/efi-stub.txt | 65 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/Kconfig | 2 ++ 2 files changed, 67 insertions(+) create mode 100644 Documentation/x86/efi-stub.txt (limited to 'arch/x86') diff --git a/Documentation/x86/efi-stub.txt b/Documentation/x86/efi-stub.txt new file mode 100644 index 000000000000..44e6bb6ead10 --- /dev/null +++ b/Documentation/x86/efi-stub.txt @@ -0,0 +1,65 @@ + The EFI Boot Stub + --------------------------- + +On the x86 platform, a bzImage can masquerade as a PE/COFF image, +thereby convincing EFI firmware loaders to load it as an EFI +executable. The code that modifies the bzImage header, along with the +EFI-specific entry point that the firmware loader jumps to are +collectively known as the "EFI boot stub", and live in +arch/x86/boot/header.S and arch/x86/boot/compressed/eboot.c, +respectively. + +By using the EFI boot stub it's possible to boot a Linux kernel +without the use of a conventional EFI boot loader, such as grub or +elilo. Since the EFI boot stub performs the jobs of a boot loader, in +a certain sense it *IS* the boot loader. + +The EFI boot stub is enabled with the CONFIG_EFI_STUB kernel option. + + +**** How to install bzImage.efi + +The bzImage located in arch/x86/boot/bzImage must be copied to the EFI +System Partition (ESP) and renamed with the extension ".efi". Without +the extension the EFI firmware loader will refuse to execute it. It's +not possible to execute bzImage.efi from the usual Linux file systems +because EFI firmware doesn't have support for them. + + +**** Passing kernel parameters from the EFI shell + +Arguments to the kernel can be passed after bzImage.efi, e.g. + + fs0:> bzImage.efi console=ttyS0 root=/dev/sda4 + + +**** The "initrd=" option + +Like most boot loaders, the EFI stub allows the user to specify +multiple initrd files using the "initrd=" option.
This is the only EFI +stub-specific command line parameter, everything else is passed to the +kernel when it boots. + +The path to the initrd file must be an absolute path from the +beginning of the ESP, relative path names do not work. Also, the path +is an EFI-style path and directory elements must be separated with +backslashes (\). For example, given the following directory layout, + +fs0:> + Kernels\ + bzImage.efi + initrd-large.img + + Ramdisks\ + initrd-small.img + initrd-medium.img + +to boot with the initrd-large.img file if the current working +directory is fs0:\Kernels, the following command must be used, + + fs0:\Kernels> bzImage.efi initrd=\Kernels\initrd-large.img + +Notice how bzImage.efi can be specified with a relative path. That's +because the image we're executing is interpreted by the EFI shell, +which understands relative paths, whereas the rest of the command line +is passed to bzImage.efi. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d700811785ea..c70684f859e1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1506,6 +1506,8 @@ config EFI_STUB This kernel feature allows a bzImage to be loaded directly by EFI firmware without the use of a bootloader. + See Documentation/x86/efi-stub.txt for more information. 
+ config SECCOMP def_bool y prompt "Enable seccomp to safely compute untrusted bytecode" -- cgit v1.2.3 From 4ebefe3ec729003443daf153ed6fad1739271283 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 26 Apr 2012 22:29:20 -0400 Subject: new helpers: {clear,test,test_and_clear}_restore_sigmask() helpers parallel to set_restore_sigmask(), used in the next commits Signed-off-by: Al Viro --- arch/ia64/include/asm/thread_info.h | 16 ++++++++++++++++ arch/microblaze/include/asm/thread_info.h | 16 ++++++++++++++++ arch/powerpc/include/asm/thread_info.h | 16 ++++++++++++++++ arch/sh/include/asm/thread_info.h | 17 +++++++++++++++++ arch/sparc/include/asm/thread_info_64.h | 16 ++++++++++++++++ arch/tile/include/asm/thread_info.h | 16 ++++++++++++++++ arch/x86/include/asm/thread_info.h | 16 ++++++++++++++++ include/linux/thread_info.h | 12 ++++++++++++ 8 files changed, 125 insertions(+) (limited to 'arch/x86') diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index 310d9734f02d..8d600363fa57 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -143,6 +143,22 @@ static inline void set_restore_sigmask(void) ti->status |= TS_RESTORE_SIGMASK; set_bit(TIF_SIGPENDING, &ti->flags); } +static inline void clear_restore_sigmask(void) +{ + current_thread_info()->status &= ~TS_RESTORE_SIGMASK; +} +static inline bool test_restore_sigmask(void) +{ + return current_thread_info()->status & TS_RESTORE_SIGMASK; +} +static inline bool test_and_clear_restore_sigmask(void) +{ + struct thread_info *ti = current_thread_info(); + if (!(ti->status & TS_RESTORE_SIGMASK)) + return false; + ti->status &= ~TS_RESTORE_SIGMASK; + return true; +} #endif /* !__ASSEMBLY__ */ #endif /* _ASM_IA64_THREAD_INFO_H */ diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h index 1a8ab6a5c03f..12e39206b3ef 100644 --- a/arch/microblaze/include/asm/thread_info.h +++ 
b/arch/microblaze/include/asm/thread_info.h @@ -168,6 +168,22 @@ static inline void set_restore_sigmask(void) ti->status |= TS_RESTORE_SIGMASK; set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags); } +static inline void clear_restore_sigmask(void) +{ + current_thread_info()->status &= ~TS_RESTORE_SIGMASK; +} +static inline bool test_restore_sigmask(void) +{ + return current_thread_info()->status & TS_RESTORE_SIGMASK; +} +static inline bool test_and_clear_restore_sigmask(void) +{ + struct thread_info *ti = current_thread_info(); + if (!(ti->status & TS_RESTORE_SIGMASK)) + return false; + ti->status &= ~TS_RESTORE_SIGMASK; + return true; +} #endif #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index a556ccc16b58..85d50a93a92f 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -142,6 +142,22 @@ static inline void set_restore_sigmask(void) ti->local_flags |= _TLF_RESTORE_SIGMASK; set_bit(TIF_SIGPENDING, &ti->flags); } +static inline void clear_restore_sigmask(void) +{ + current_thread_info()->local_flags &= ~_TLF_RESTORE_SIGMASK; +} +static inline bool test_restore_sigmask(void) +{ + return current_thread_info()->local_flags & _TLF_RESTORE_SIGMASK; +} +static inline bool test_and_clear_restore_sigmask(void) +{ + struct thread_info *ti = current_thread_info(); + if (!(ti->local_flags & _TLF_RESTORE_SIGMASK)) + return false; + ti->local_flags &= ~_TLF_RESTORE_SIGMASK; + return true; +} static inline bool test_thread_local_flags(unsigned int flags) { diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h index 0c04ffc4f12c..a109157c6b8f 100644 --- a/arch/sh/include/asm/thread_info.h +++ b/arch/sh/include/asm/thread_info.h @@ -189,6 +189,23 @@ static inline unsigned int get_thread_fault_code(void) struct thread_info *ti = current_thread_info(); return ti->flags >> TI_FLAG_FAULT_CODE_SHIFT; } + +static inline void 
clear_restore_sigmask(void) +{ + current_thread_info()->status &= ~TS_RESTORE_SIGMASK; +} +static inline bool test_restore_sigmask(void) +{ + return current_thread_info()->status & TS_RESTORE_SIGMASK; +} +static inline bool test_and_clear_restore_sigmask(void) +{ + struct thread_info *ti = current_thread_info(); + if (!(ti->status & TS_RESTORE_SIGMASK)) + return false; + ti->status &= ~TS_RESTORE_SIGMASK; + return true; +} #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index 7f0981b09451..cb9b7a9f5fc1 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -240,6 +240,22 @@ static inline void set_restore_sigmask(void) ti->status |= TS_RESTORE_SIGMASK; set_bit(TIF_SIGPENDING, &ti->flags); } +static inline void clear_restore_sigmask(void) +{ + current_thread_info()->status &= ~TS_RESTORE_SIGMASK; +} +static inline bool test_restore_sigmask(void) +{ + return current_thread_info()->status & TS_RESTORE_SIGMASK; +} +static inline bool test_and_clear_restore_sigmask(void) +{ + struct thread_info *ti = current_thread_info(); + if (!(ti->status & TS_RESTORE_SIGMASK)) + return false; + ti->status &= ~TS_RESTORE_SIGMASK; + return true; +} #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h index 656c486e64fa..5aef371921e4 100644 --- a/arch/tile/include/asm/thread_info.h +++ b/arch/tile/include/asm/thread_info.h @@ -168,6 +168,22 @@ static inline void set_restore_sigmask(void) ti->status |= TS_RESTORE_SIGMASK; set_bit(TIF_SIGPENDING, &ti->flags); } +static inline void clear_restore_sigmask(void) +{ + current_thread_info()->status &= ~TS_RESTORE_SIGMASK; +} +static inline bool test_restore_sigmask(void) +{ + return current_thread_info()->status & TS_RESTORE_SIGMASK; +} +static inline bool test_and_clear_restore_sigmask(void) +{ + struct thread_info 
*ti = current_thread_info(); + if (!(ti->status & TS_RESTORE_SIGMASK)) + return false; + ti->status &= ~TS_RESTORE_SIGMASK; + return true; +} #endif /* !__ASSEMBLY__ */ #endif /* _ASM_TILE_THREAD_INFO_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 5c25de07cba8..8f3f1ff69fa9 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -250,6 +250,22 @@ static inline void set_restore_sigmask(void) ti->status |= TS_RESTORE_SIGMASK; set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags); } +static inline void clear_restore_sigmask(void) +{ + current_thread_info()->status &= ~TS_RESTORE_SIGMASK; +} +static inline bool test_restore_sigmask(void) +{ + return current_thread_info()->status & TS_RESTORE_SIGMASK; +} +static inline bool test_and_clear_restore_sigmask(void) +{ + struct thread_info *ti = current_thread_info(); + if (!(ti->status & TS_RESTORE_SIGMASK)) + return false; + ti->status &= ~TS_RESTORE_SIGMASK; + return true; +} static inline bool is_ia32_task(void) { diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index eee729428683..ed279701ac79 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -127,6 +127,18 @@ static inline void set_restore_sigmask(void) set_thread_flag(TIF_RESTORE_SIGMASK); set_thread_flag(TIF_SIGPENDING); } +static inline void clear_restore_sigmask(void) +{ + clear_thread_flag(TIF_RESTORE_SIGMASK); +} +static inline bool test_restore_sigmask(void) +{ + return test_thread_flag(TIF_RESTORE_SIGMASK); +} +static inline bool test_and_clear_restore_sigmask(void) +{ + return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK); +} #endif /* TIF_RESTORE_SIGMASK && !HAVE_SET_RESTORE_SIGMASK */ #ifndef HAVE_SET_RESTORE_SIGMASK -- cgit v1.2.3 From 51a7b448d4134e3e8eec633435e3e8faee14a828 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 21 May 2012 23:33:55 -0400 Subject: new helper: restore_saved_sigmask() first fruits of 
..._restore_sigmask() helpers: now we can take boilerplate "signal didn't have a handler, clear RESTORE_SIGMASK and restore the blocked mask from ->saved_mask" into a common helper. Open-coded instances switched... Signed-off-by: Al Viro --- arch/alpha/kernel/signal.c | 4 +--- arch/arm/kernel/signal.c | 6 +----- arch/avr32/kernel/signal.c | 5 +---- arch/blackfin/kernel/signal.c | 5 +---- arch/c6x/kernel/signal.c | 5 +---- arch/cris/arch-v10/kernel/signal.c | 5 +---- arch/cris/arch-v32/kernel/signal.c | 5 +---- arch/frv/kernel/signal.c | 6 +----- arch/h8300/kernel/signal.c | 3 +-- arch/hexagon/kernel/signal.c | 5 +---- arch/ia64/kernel/signal.c | 5 +---- arch/m32r/kernel/signal.c | 5 +---- arch/m68k/kernel/signal.c | 5 +---- arch/microblaze/kernel/signal.c | 5 +---- arch/mips/kernel/signal.c | 5 +---- arch/mn10300/kernel/signal.c | 5 +---- arch/openrisc/kernel/signal.c | 6 +----- arch/parisc/kernel/signal.c | 7 +------ arch/powerpc/kernel/signal.c | 6 +----- arch/s390/kernel/signal.c | 5 +---- arch/score/kernel/signal.c | 5 +---- arch/sh/kernel/signal_32.c | 5 +---- arch/sh/kernel/signal_64.c | 7 +------ arch/sparc/kernel/signal32.c | 5 +---- arch/sparc/kernel/signal_32.c | 5 +---- arch/sparc/kernel/signal_64.c | 5 +---- arch/tile/kernel/signal.c | 5 +---- arch/um/kernel/signal.c | 6 ++---- arch/unicore32/kernel/signal.c | 3 +-- arch/x86/kernel/signal.c | 5 +---- arch/xtensa/kernel/signal.c | 3 +-- include/linux/sched.h | 6 ++++++ 32 files changed, 38 insertions(+), 125 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index f6db3032ddf0..cadf4571ca31 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -572,9 +572,7 @@ do_signal(struct pt_regs * regs, struct switch_stack * sw, } /* If there's no signal to deliver, we just restore the saved mask. 
*/ - if (test_and_clear_thread_flag(TIF_RESTORE_SIGMASK)) - set_current_blocked(&current->saved_sigmask); - + restore_saved_sigmask(); if (single_stepping) ptrace_set_bpt(current); /* re-set breakpoint */ } diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 63f327dd5198..3d1daac8ea04 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -663,11 +663,7 @@ static void do_signal(struct pt_regs *regs, int syscall) set_thread_flag(TIF_SYSCALL_RESTARTSYS); } - /* If there's no signal to deliver, we just put the saved sigmask - * back. - */ - if (test_and_clear_thread_flag(TIF_RESTORE_SIGMASK)) - set_current_blocked(&current->saved_sigmask); + restore_saved_sigmask(); } asmlinkage void diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c index e7595ef74f51..8b12c3046137 100644 --- a/arch/avr32/kernel/signal.c +++ b/arch/avr32/kernel/signal.c @@ -297,10 +297,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset, int syscall) if (signr == 0) { /* No signal to deliver -- put the saved sigmask back */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) { - clear_thread_flag(TIF_RESTORE_SIGMASK); - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } + restore_saved_sigmask(); return 0; } diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c index fc9ecce8b6ce..9d692a1277b3 100644 --- a/arch/blackfin/kernel/signal.c +++ b/arch/blackfin/kernel/signal.c @@ -319,10 +319,7 @@ asmlinkage void do_signal(struct pt_regs *regs) /* if there's no signal to deliver, we just put the saved sigmask * back */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) { - clear_thread_flag(TIF_RESTORE_SIGMASK); - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } + restore_saved_sigmask(); } /* diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c index 9493f0bbf0a6..bfbcc958bbb4 100644 --- a/arch/c6x/kernel/signal.c +++ b/arch/c6x/kernel/signal.c @@ -343,10 +343,7 @@ static void do_signal(struct pt_regs *regs, int syscall) /* if
there's no signal to deliver, we just put the saved sigmask * back */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) { - clear_thread_flag(TIF_RESTORE_SIGMASK); - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } + restore_saved_sigmask(); } /* diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c index e16f8f297f61..06885e94e455 100644 --- a/arch/cris/arch-v10/kernel/signal.c +++ b/arch/cris/arch-v10/kernel/signal.c @@ -525,8 +525,5 @@ void do_signal(int canrestart, struct pt_regs *regs) /* if there's no signal to deliver, we just put the saved sigmask * back */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) { - clear_thread_flag(TIF_RESTORE_SIGMASK); - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } + restore_saved_sigmask(); } diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c index b338d8fc0c12..fe12cdca0bac 100644 --- a/arch/cris/arch-v32/kernel/signal.c +++ b/arch/cris/arch-v32/kernel/signal.c @@ -560,10 +560,7 @@ do_signal(int canrestart, struct pt_regs *regs) /* if there's no signal to deliver, we just put the saved sigmask * back */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) { - clear_thread_flag(TIF_RESTORE_SIGMASK); - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } + restore_saved_sigmask(); } asmlinkage void diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c index 595bf1e5a5dc..16351cc8c36c 100644 --- a/arch/frv/kernel/signal.c +++ b/arch/frv/kernel/signal.c @@ -536,11 +536,7 @@ no_signal: /* if there's no signal to deliver, we just put the saved sigmask * back */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) { - clear_thread_flag(TIF_RESTORE_SIGMASK); - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } - + restore_saved_sigmask(); } /* end do_signal() */ /*****************************************************************************/ diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index e58992ad789e..63623dabab32 100644 ---
a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -501,8 +501,7 @@ statis void do_signal(struct pt_regs *regs) } /* If there's no signal to deliver, we just restore the saved mask. */ - if (test_and_clear_thread_flag(TIF_RESTORE_SIGMASK)) - set_current_blocked(&current->saved_sigmask); + restore_saved_sigmask(); } asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags) diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c index 21a3018cb9bf..acd6272913b3 100644 --- a/arch/hexagon/kernel/signal.c +++ b/arch/hexagon/kernel/signal.c @@ -259,10 +259,7 @@ no_signal: no_restart: /* If there's no signal to deliver, put the saved sigmask back */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) { - clear_thread_flag(TIF_RESTORE_SIGMASK); - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } + restore_saved_sigmask(); } void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags) diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index 7523501d3bc0..39d8f3afff49 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -538,8 +538,5 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall) /* if there's no signal to deliver, we just put the saved sigmask * back */ - if (current_thread_info()->status & TS_RESTORE_SIGMASK) { - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } + restore_saved_sigmask(); } diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c index 64804f1f5141..2ad7c4587669 100644 --- a/arch/m32r/kernel/signal.c +++ b/arch/m32r/kernel/signal.c @@ -360,10 +360,7 @@ static void do_signal(struct pt_regs *regs) prev_insn(regs); } } - if (test_thread_flag(TIF_RESTORE_SIGMASK)) { - clear_thread_flag(TIF_RESTORE_SIGMASK); - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } + restore_saved_sigmask(); } /* diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index
973eec60cad4..685cbe84f33f 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -1182,10 +1182,7 @@ static void do_signal(struct pt_regs *regs) handle_restart(regs, NULL, 0); /* If there's no signal to deliver, we just restore the saved mask. */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) { - clear_thread_flag(TIF_RESTORE_SIGMASK); - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } + restore_saved_sigmask(); } void do_notify_resume(struct pt_regs *regs) diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c index 5d796e32786e..8e644dfaba4f 100644 --- a/arch/microblaze/kernel/signal.c +++ b/arch/microblaze/kernel/signal.c @@ -381,10 +381,7 @@ static void do_signal(struct pt_regs *regs, int in_syscall) * If there's no signal to deliver, we just put the saved sigmask * back. */ - if (current_thread_info()->status & TS_RESTORE_SIGMASK) { - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } + restore_saved_sigmask(); } void do_notify_resume(struct pt_regs *regs, int in_syscall) diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index 8a6e6d116ab0..aad2d2da5eec 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -614,10 +614,7 @@ static void do_signal(struct pt_regs *regs) * If there's no signal to deliver, we just put the saved sigmask * back */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) { - clear_thread_flag(TIF_RESTORE_SIGMASK); - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } + restore_saved_sigmask(); } /* diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c index b8b6aa1a6837..b7994c38eacc 100644 --- a/arch/mn10300/kernel/signal.c +++ b/arch/mn10300/kernel/signal.c @@ -525,10 +525,7 @@ static void do_signal(struct pt_regs *regs) /* if there's no signal to deliver, we just put the saved sigmask * back */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) { -
clear_thread_flag(TIF_RESTORE_SIGMASK); - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } + restore_saved_sigmask(); } /* diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c index 9ae611522953..266c6fd2eb5c 100644 --- a/arch/openrisc/kernel/signal.c +++ b/arch/openrisc/kernel/signal.c @@ -339,11 +339,7 @@ void do_signal(struct pt_regs *regs) if (signr <= 0) { /* no signal to deliver so we just put the saved sigmask * back */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) { - clear_thread_flag(TIF_RESTORE_SIGMASK); - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } - + restore_saved_sigmask(); } else { /* signr > 0 */ sigset_t *oldset; diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index e7a7cd3e1120..277cacadf653 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -621,12 +621,7 @@ do_signal(struct pt_regs *regs, long in_syscall) DBG(1,"do_signal: Exit (not delivered), regs->gr[28] = %ld\n", regs->gr[28]); - if (test_thread_flag(TIF_RESTORE_SIGMASK)) { - clear_thread_flag(TIF_RESTORE_SIGMASK); - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } - - return; + restore_saved_sigmask(); } void do_notify_resume(struct pt_regs *regs, long in_syscall) diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index bfc3ec1382fb..0f4cc67f4268 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -132,12 +132,8 @@ static int do_signal(struct pt_regs *regs) check_syscall_restart(regs, &ka, signr > 0); if (signr <= 0) { - struct thread_info *ti = current_thread_info(); /* No signal to deliver -- put the saved sigmask back */ - if (ti->local_flags & _TLF_RESTORE_SIGMASK) { - ti->local_flags &= ~_TLF_RESTORE_SIGMASK; - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } + restore_saved_sigmask(); regs->trap = 0; return 0; /* no signals delivered */ } diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 42a6e8b47f06..37799089c38e
100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -484,10 +484,7 @@ void do_signal(struct pt_regs *regs) /* * If there's no signal to deliver, we just put the saved sigmask back. */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) { - clear_thread_flag(TIF_RESTORE_SIGMASK); - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } + restore_saved_sigmask(); } void do_notify_resume(struct pt_regs *regs) diff --git a/arch/score/kernel/signal.c b/arch/score/kernel/signal.c index 302838d3acf6..9e751559375b 100644 --- a/arch/score/kernel/signal.c +++ b/arch/score/kernel/signal.c @@ -337,10 +337,7 @@ static void do_signal(struct pt_regs *regs) * If there's no signal to deliver, we just put the saved sigmask * back */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) { - clear_thread_flag(TIF_RESTORE_SIGMASK); - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } + restore_saved_sigmask(); } /* diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c index 9d7bfd66f189..92f4173ad29a 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -610,10 +610,7 @@ static void do_signal(struct pt_regs *regs, unsigned int save_r0) * If there's no signal to deliver, we just put the saved sigmask * back.
*/ - if (current_thread_info()->status & TS_RESTORE_SIGMASK) { - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } + restore_saved_sigmask(); } asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned int save_r0, diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c index aa6428430842..6e191ef0aa62 100644 --- a/arch/sh/kernel/signal_64.c +++ b/arch/sh/kernel/signal_64.c @@ -143,12 +143,7 @@ static void do_signal(struct pt_regs *regs) } /* No signal to deliver -- put the saved sigmask back */ - if (current_thread_info()->status & TS_RESTORE_SIGMASK) { - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } - - return; + restore_saved_sigmask(); } /* diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index bb1513e45f1a..88e0d8122d2c 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -872,10 +872,7 @@ void do_signal32(sigset_t *oldset, struct pt_regs * regs) /* If there's no signal to deliver, we just put the saved sigmask * back */ - if (current_thread_info()->status & TS_RESTORE_SIGMASK) { - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - set_current_blocked(&current->saved_sigmask); - } + restore_saved_sigmask(); } struct sigstack32 { diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index 6b42e8622d12..9dd97d2e171e 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -576,10 +576,7 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0) /* if there's no signal to deliver, we just put the saved sigmask * back */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) { - clear_thread_flag(TIF_RESTORE_SIGMASK); - set_current_blocked(&current->saved_sigmask); - } + restore_saved_sigmask(); } void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, diff --git a/arch/sparc/kernel/signal_64.c
b/arch/sparc/kernel/signal_64.c index c82cf1cc3965..55b820ee0ac9 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -594,10 +594,7 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0) /* If there's no signal to deliver, we just put the saved sigmask * back */ - if (current_thread_info()->status & TS_RESTORE_SIGMASK) { - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - set_current_blocked(&current->saved_sigmask); - } + restore_saved_sigmask(); } void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, unsigned long thread_info_flags) diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c index f79d4b88c747..62b3493ea77d 100644 --- a/arch/tile/kernel/signal.c +++ b/arch/tile/kernel/signal.c @@ -350,10 +350,7 @@ void do_signal(struct pt_regs *regs) } /* If there's no signal to deliver, just put the saved sigmask back. */ - if (current_thread_info()->status & TS_RESTORE_SIGMASK) { - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } + restore_saved_sigmask(); done: /* Avoid double syscall restart if there are nested signals.
*/ diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c index 292e706016c5..6acf13c1740b 100644 --- a/arch/um/kernel/signal.c +++ b/arch/um/kernel/signal.c @@ -130,10 +130,8 @@ static int kern_do_signal(struct pt_regs *regs) * if there's no signal to deliver, we just put the saved sigmask * back */ - if (!handled_sig && test_thread_flag(TIF_RESTORE_SIGMASK)) { - clear_thread_flag(TIF_RESTORE_SIGMASK); - sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL); - } + if (!handled_sig) + restore_saved_sigmask(); return handled_sig; } diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c index 28782ad47b93..65a5ed3b6f2a 100644 --- a/arch/unicore32/kernel/signal.c +++ b/arch/unicore32/kernel/signal.c @@ -451,8 +451,7 @@ static void do_signal(struct pt_regs *regs, int syscall) /* If there's no signal to deliver, we just put the saved * sigmask back. */ - if (test_and_clear_thread_flag(TIF_RESTORE_SIGMASK)) - set_current_blocked(&current->saved_sigmask); + restore_saved_sigmask(); } asmlinkage void do_notify_resume(struct pt_regs *regs, diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 2e937a5ad531..25a4a81a51aa 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -796,10 +796,7 @@ static void do_signal(struct pt_regs *regs) * If there's no signal to deliver, we just put the saved sigmask * back. */ - if (current_thread_info()->status & TS_RESTORE_SIGMASK) { - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - set_current_blocked(&current->saved_sigmask); - } + restore_saved_sigmask(); } /* diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index ea7e17778a75..8c4e751e3b83 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -532,8 +532,7 @@ no_signal: } /* If there's no signal to deliver, we just restore the saved mask.
*/ - if (test_and_clear_thread_flag(TIF_RESTORE_SIGMASK)) - set_current_blocked(&current->saved_sigmask); + restore_saved_sigmask(); if (current->ptrace & PT_SINGLESTEP) task_pt_regs(current)->icountlevel = 1; diff --git a/include/linux/sched.h b/include/linux/sched.h index 660c8ae93471..f1b46b88f6f5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2207,6 +2207,12 @@ extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long); +static inline void restore_saved_sigmask(void) +{ + if (test_and_clear_restore_sigmask()) + set_current_blocked(&current->saved_sigmask); +} + static inline int kill_cad_pid(int sig, int priv) { return kill_pid(cad_pid, sig, priv); -- cgit v1.2.3 From b7f9a11a6cf1ea9ee6be3eb2b90d91327a09ad14 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 2 May 2012 09:59:21 -0400 Subject: new helper: sigmask_to_save() replace boilerplate "should we use ->saved_sigmask or ->blocked?" with calls of obvious inlined helper...
Signed-off-by: Al Viro --- arch/alpha/kernel/signal.c | 5 +---- arch/arm/kernel/signal.c | 12 +++--------- arch/avr32/kernel/signal.c | 20 +++++++------------- arch/blackfin/kernel/signal.c | 12 +++--------- arch/c6x/kernel/signal.c | 14 +++----------- arch/cris/arch-v10/kernel/signal.c | 11 +++-------- arch/cris/arch-v32/kernel/signal.c | 11 +++-------- arch/frv/kernel/signal.c | 10 +++------- arch/h8300/kernel/signal.c | 11 +++-------- arch/hexagon/kernel/signal.c | 13 +++---------- arch/ia64/kernel/signal.c | 12 +++--------- arch/m32r/kernel/signal.c | 12 +++--------- arch/m68k/kernel/signal.c | 11 +++-------- arch/microblaze/kernel/signal.c | 9 ++------- arch/mips/kernel/signal.c | 11 +++-------- arch/mn10300/kernel/signal.c | 11 +++-------- arch/openrisc/kernel/signal.c | 12 +++--------- arch/parisc/kernel/signal.c | 21 +++++---------------- arch/powerpc/kernel/signal.c | 7 +------ arch/s390/kernel/signal.c | 7 +------ arch/score/kernel/signal.c | 12 +++--------- arch/sh/kernel/signal_32.c | 11 +++-------- arch/sh/kernel/signal_64.c | 13 ++++--------- arch/sparc/kernel/signal_32.c | 11 +++-------- arch/sparc/kernel/signal_64.c | 7 +------ arch/tile/kernel/signal.c | 11 +++-------- arch/um/kernel/signal.c | 11 +++-------- arch/unicore32/kernel/signal.c | 13 +++---------- arch/x86/kernel/signal.c | 5 +---- arch/xtensa/kernel/signal.c | 8 +------- include/linux/sched.h | 8 ++++++++ 31 files changed, 92 insertions(+), 250 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index cadf4571ca31..f1e7d2aa2586 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -468,12 +468,9 @@ static inline void handle_signal(int sig, struct k_sigaction *ka, siginfo_t *info, struct pt_regs * regs, struct switch_stack *sw) { - sigset_t *oldset = &current->blocked; + sigset_t *oldset = sigmask_to_save(); int ret; - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - oldset = &current->saved_sigmask; - if (ka->sa.sa_flags &
SA_SIGINFO) ret = setup_rt_frame(sig, ka, info, oldset, regs, sw); else diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 3d1daac8ea04..2e66c93973c3 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -530,11 +530,11 @@ setup_rt_frame(int usig, struct k_sigaction *ka, siginfo_t *info, */ static int handle_signal(unsigned long sig, struct k_sigaction *ka, - siginfo_t *info, sigset_t *oldset, - struct pt_regs * regs) + siginfo_t *info, struct pt_regs *regs) { struct thread_info *thread = current_thread_info(); struct task_struct *tsk = current; + sigset_t *oldset = sigmask_to_save(); int usig = sig; int ret; @@ -617,8 +617,6 @@ static void do_signal(struct pt_regs *regs, int syscall) */ signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - sigset_t *oldset; - /* * Depending on the signal settings we may need to revert the * decision to restart the system call. But skip this if a @@ -635,11 +633,7 @@ static void do_signal(struct pt_regs *regs, int syscall) clear_thread_flag(TIF_SYSCALL_RESTARTSYS); } - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - if (handle_signal(signr, &ka, &info, oldset, regs) == 0) { + if (handle_signal(signr, &ka, &info, regs) == 0) { /* * A signal was successfully delivered; the saved * sigmask will have been stored in the signal frame, diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c index 8b12c3046137..0e2c0527c9fe 100644 --- a/arch/avr32/kernel/signal.c +++ b/arch/avr32/kernel/signal.c @@ -224,14 +224,14 @@ static inline void setup_syscall_restart(struct pt_regs *regs) static inline void handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *oldset, struct pt_regs *regs, int syscall) + struct pt_regs *regs, int syscall) { int ret; /* * Set up the stack frame */ - ret = setup_rt_frame(sig, ka, info, oldset, regs); + ret = setup_rt_frame(sig, ka, info, sigmask_to_save(),
regs); /* * Check that the resulting registers are sane @@ -255,7 +255,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, * doesn't want to handle. Thus you cannot kill init even with a * SIGKILL even by mistake. */ -int do_signal(struct pt_regs *regs, sigset_t *oldset, int syscall) +static void do_signal(struct pt_regs *regs, int syscall) { siginfo_t info; int signr; @@ -267,12 +267,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset, int syscall) * without doing anything if so. */ if (!user_mode(regs)) - return 0; - - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - oldset = &current->saved_sigmask; - else if (!oldset) - oldset = &current->blocked; + return; signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (syscall) { @@ -298,11 +293,10 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset, int syscall) if (signr == 0) { /* No signal to deliver -- put the saved sigmask back */ restore_saved_sigmask(); - return 0; + return; } - handle_signal(signr, &ka, &info, oldset, regs, syscall); - return 1; + handle_signal(signr, &ka, &info, regs, syscall); } asmlinkage void do_notify_resume(struct pt_regs *regs, struct thread_info *ti) @@ -313,7 +307,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, struct thread_info *ti) syscall = 1; if (ti->flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK)) - do_signal(regs, &current->blocked, syscall); + do_signal(regs, syscall); if (ti->flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c index 9d692a1277b3..7f4205ddfa4d 100644 --- a/arch/blackfin/kernel/signal.c +++ b/arch/blackfin/kernel/signal.c @@ -249,7 +249,7 @@ handle_restart(struct pt_regs *regs, struct k_sigaction *ka, int has_handler) */ static int handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka, - sigset_t *oldset, struct pt_regs *regs) + struct pt_regs *regs) { int ret; @@ -259,7 +259,7 @@ handle_signal(int sig, siginfo_t *info, struct
k_sigaction *ka, handle_restart(regs, ka, 1); /* set up the stack frame */ - ret = setup_rt_frame(sig, ka, info, oldset, regs); + ret = setup_rt_frame(sig, ka, info, sigmask_to_save(), regs); if (ret == 0) block_sigmask(ka, sig); @@ -281,22 +281,16 @@ asmlinkage void do_signal(struct pt_regs *regs) siginfo_t info; int signr; struct k_sigaction ka; - sigset_t *oldset; current->thread.esp0 = (unsigned long)regs; if (try_to_freeze()) goto no_signal; - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Whee! Actually deliver the signal. */ - if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { + if (handle_signal(signr, &info, &ka, regs) == 0) { /* a signal was successfully delivered; the saved * sigmask will have been stored in the signal frame, * and will be restored by sigreturn, so we can simply diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c index bfbcc958bbb4..38bb501eb117 100644 --- a/arch/c6x/kernel/signal.c +++ b/arch/c6x/kernel/signal.c @@ -250,8 +250,7 @@ do_restart: */ static int handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka, - sigset_t *oldset, struct pt_regs *regs, - int syscall) + struct pt_regs *regs, int syscall) { int ret; @@ -278,7 +277,7 @@ static int handle_signal(int sig, } /* Set up the stack frame */ - ret = setup_rt_frame(sig, ka, info, oldset, regs); + ret = setup_rt_frame(sig, ka, info, sigmask_to_save(), regs); if (ret == 0) block_sigmask(ka, sig); @@ -292,7 +291,6 @@ static void do_signal(struct pt_regs *regs, int syscall) { struct k_sigaction ka; siginfo_t info; - sigset_t *oldset; int signr; /* we want the common case to go fast, which is why we may in certain @@ -300,15 +298,9 @@ static void do_signal(struct pt_regs *regs, int syscall) if (!user_mode(regs)) return; - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; -
signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - if (handle_signal(signr, &info, &ka, oldset, - regs, syscall) == 0) { + if (handle_signal(signr, &info, &ka, regs, syscall) == 0) { /* a signal was successfully delivered; the saved * sigmask will have been stored in the signal frame, * and will be restored by sigreturn, so we can simply diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c index 06885e94e455..09a4cf4eb08a 100644 --- a/arch/cris/arch-v10/kernel/signal.c +++ b/arch/cris/arch-v10/kernel/signal.c @@ -417,8 +417,9 @@ give_sigsegv: static inline int handle_signal(int canrestart, unsigned long sig, siginfo_t *info, struct k_sigaction *ka, - sigset_t *oldset, struct pt_regs *regs) + struct pt_regs *regs) { + sigset_t *oldset = sigmask_to_save(); int ret; /* Are we from a system call? */ @@ -478,7 +479,6 @@ void do_signal(int canrestart, struct pt_regs *regs) siginfo_t info; int signr; struct k_sigaction ka; - sigset_t *oldset; /* * We want the common case to go fast, which @@ -489,16 +489,11 @@ void do_signal(int canrestart, struct pt_regs *regs) if (!user_mode(regs)) return; - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Whee! Actually deliver the signal.
*/ if (handle_signal(canrestart, signr, &info, &ka, - oldset, regs)) { + regs)) { /* a signal was successfully delivered; the saved * sigmask will have been stored in the signal frame, * and will be restored by sigreturn, so we can simply diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c index fe12cdca0bac..d52276ddae4b 100644 --- a/arch/cris/arch-v32/kernel/signal.c +++ b/arch/cris/arch-v32/kernel/signal.c @@ -437,8 +437,9 @@ give_sigsegv: static inline int handle_signal(int canrestart, unsigned long sig, siginfo_t *info, struct k_sigaction *ka, - sigset_t *oldset, struct pt_regs * regs) + struct pt_regs * regs) { + sigset_t *oldset = sigmask_to_save(); int ret; /* Check if this got called from a system call. */ @@ -511,7 +512,6 @@ do_signal(int canrestart, struct pt_regs *regs) int signr; siginfo_t info; struct k_sigaction ka; - sigset_t *oldset; /* * The common case should go fast, which is why this point is @@ -521,17 +521,12 @@ do_signal(int canrestart, struct pt_regs *regs) if (!user_mode(regs)) return; - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Whee! Actually deliver the signal. */ if (handle_signal(canrestart, signr, &info, &ka, - oldset, regs)) { + regs)) { /* a signal was successfully delivered; the saved * sigmask will have been stored in the signal frame, * and will be restored by sigreturn, so we can simply diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c index 16351cc8c36c..22efe8d25038 100644 --- a/arch/frv/kernel/signal.c +++ b/arch/frv/kernel/signal.c @@ -427,8 +427,9 @@ give_sigsegv: * OK, we're invoking a handler */ static int handle_signal(unsigned long sig, siginfo_t *info, - struct k_sigaction *ka, sigset_t *oldset) + struct k_sigaction *ka) { + sigset_t *oldset = sigmask_to_save(); int ret; /* Are we from a system call?
*/ @@ -492,14 +493,9 @@ static void do_signal(void) if (try_to_freeze()) goto no_signal; - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - signr = get_signal_to_deliver(&info, &ka, __frame, NULL); if (signr > 0) { - if (handle_signal(signr, &info, &ka, oldset) == 0) { + if (handle_signal(signr, &info, &ka) == 0) { /* a signal was successfully delivered; the saved * sigmask will have been stored in the signal frame, * and will be restored by sigreturn, so we can simply diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index 63623dabab32..d4d2f72672ad 100644 --- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -412,8 +412,9 @@ give_sigsegv: */ static void handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, - sigset_t *oldset, struct pt_regs * regs) + struct pt_regs * regs) { + sigset_t *oldset = sigmask_to_save(); int ret; /* are we from a system call? */ if (regs->orig_er0 >= 0) { @@ -457,7 +458,6 @@ statis void do_signal(struct pt_regs *regs) siginfo_t info; int signr; struct k_sigaction ka; - sigset_t *oldset; /* * We want the common case to go fast, which @@ -473,15 +473,10 @@ statis void do_signal(struct pt_regs *regs) current->thread.esp0 = (unsigned long) regs; - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Whee! Actually deliver the signal.
*/ - handle_signal(signr, &info, &ka, oldset, regs); + handle_signal(signr, &info, &ka, regs); return; } no_signal: diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c index acd6272913b3..f73fcee09bac 100644 --- a/arch/hexagon/kernel/signal.c +++ b/arch/hexagon/kernel/signal.c @@ -150,7 +150,7 @@ sigsegv: * Setup invocation of signal handler */ static int handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka, - sigset_t *oldset, struct pt_regs *regs) + struct pt_regs *regs) { int rc; @@ -186,7 +186,7 @@ static int handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka, * Set up the stack frame; not doing the SA_SIGINFO thing. We * only set up the rt_frame flavor. */ - rc = setup_rt_frame(sig, ka, info, oldset, regs); + rc = setup_rt_frame(sig, ka, info, sigmask_to_save(), regs); /* If there was an error on setup, no signal was delivered. */ if (rc) @@ -215,14 +215,7 @@ static void do_signal(struct pt_regs *regs) signo = get_signal_to_deliver(&info, &sigact, regs, NULL); if (signo > 0) { - sigset_t *oldset; - - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - - if (handle_signal(signo, &info, &sigact, oldset, regs) == 0) { + if (handle_signal(signo, &info, &sigact, regs) == 0) { /* * Successful delivery case.
The saved sigmask is * stored in the signal frame, and will be restored diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index 39d8f3afff49..9fee6d6a3f21 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -415,10 +415,10 @@ setup_frame (int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, } static long -handle_signal (unsigned long sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *oldset, +handle_signal (unsigned long sig, struct k_sigaction *ka, siginfo_t *info, struct sigscratch *scr) { - if (!setup_frame(sig, ka, info, oldset, scr)) + if (!setup_frame(sig, ka, info, sigmask_to_save(), scr)) return 0; block_sigmask(ka, sig); @@ -440,7 +440,6 @@ void ia64_do_signal (struct sigscratch *scr, long in_syscall) { struct k_sigaction ka; - sigset_t *oldset; siginfo_t info; long restart = in_syscall; long errno = scr->pt.r8; @@ -453,11 +452,6 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall) if (!user_mode(&scr->pt)) return; - if (current_thread_info()->status & TS_RESTORE_SIGMASK) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - /* * This only loops in the rare cases of handle_signal() failing, in which case we * need to push through a forced SIGSEGV. @@ -507,7 +501,7 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall) * Whee! Actually deliver the signal. If the delivery failed, we need to * continue to iterate in this loop so we can deliver the SIGSEGV...
*/ - if (handle_signal(signr, &ka, &info, oldset, scr)) { + if (handle_signal(signr, &ka, &info, scr)) { /* * A signal was successfully delivered; the saved * sigmask will have been stored in the signal frame, diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c index 2ad7c4587669..e0d6d1079f33 100644 --- a/arch/m32r/kernel/signal.c +++ b/arch/m32r/kernel/signal.c @@ -269,7 +269,7 @@ static int prev_insn(struct pt_regs *regs) static int handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *oldset, struct pt_regs *regs) + struct pt_regs *regs) { /* Are we from a system call? */ if (regs->syscall_nr >= 0) { @@ -294,7 +294,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, } /* Set up the stack frame */ - if (setup_rt_frame(sig, ka, info, oldset, regs)) + if (setup_rt_frame(sig, ka, info, sigmask_to_save(), regs)) return -EFAULT; block_sigmask(ka, sig); @@ -311,7 +311,6 @@ static void do_signal(struct pt_regs *regs) siginfo_t info; int signr; struct k_sigaction ka; - sigset_t *oldset; /* * We want the common case to go fast, which @@ -325,11 +324,6 @@ static void do_signal(struct pt_regs *regs) if (try_to_freeze()) goto no_signal; - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Re-enable any watchpoints before delivering the @@ -339,7 +333,7 @@ static void do_signal(struct pt_regs *regs) */ /* Whee! Actually deliver the signal.
*/ - if (handle_signal(signr, &ka, &info, oldset, regs) == 0) + if (handle_signal(signr, &ka, &info, regs) == 0) clear_thread_flag(TIF_RESTORE_SIGMASK); return; diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 685cbe84f33f..c83eb5a8ed8b 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -1123,8 +1123,9 @@ handle_restart(struct pt_regs *regs, struct k_sigaction *ka, int has_handler) */ static void handle_signal(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *oldset, struct pt_regs *regs) + struct pt_regs *regs) { + sigset_t *oldset = sigmask_to_save(); int err; /* are we from a system call? */ if (regs->orig_d0 >= 0) @@ -1160,19 +1161,13 @@ static void do_signal(struct pt_regs *regs) siginfo_t info; struct k_sigaction ka; int signr; - sigset_t *oldset; current->thread.esp0 = (unsigned long) regs; - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Whee! Actually deliver the signal.
*/ - handle_signal(signr, &ka, &info, oldset, regs); + handle_signal(signr, &ka, &info, regs); return; } diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c index 8e644dfaba4f..fd2de5718a4e 100644 --- a/arch/microblaze/kernel/signal.c +++ b/arch/microblaze/kernel/signal.c @@ -312,8 +312,9 @@ do_restart: static int handle_signal(unsigned long sig, struct k_sigaction *ka, - siginfo_t *info, sigset_t *oldset, struct pt_regs *regs) + siginfo_t *info, struct pt_regs *regs) { + sigset_t *oldset = sigmask_to_save(); int ret; /* Set up the stack frame */ @@ -344,18 +345,12 @@ static void do_signal(struct pt_regs *regs, int in_syscall) siginfo_t info; int signr; struct k_sigaction ka; - sigset_t *oldset; #ifdef DEBUG_SIG printk(KERN_INFO "do signal: %p %d\n", regs, in_syscall); printk(KERN_INFO "do signal2: %lx %lx %ld [%lx]\n", regs->pc, regs->r1, regs->r12, current_thread_info()->flags); #endif - if (current_thread_info()->status & TS_RESTORE_SIGMASK) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Whee! Actually deliver the signal.
*/ diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index aad2d2da5eec..18355060f241 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -515,8 +515,9 @@ struct mips_abi mips_abi = { }; static int handle_signal(unsigned long sig, siginfo_t *info, - struct k_sigaction *ka, sigset_t *oldset, struct pt_regs *regs) + struct k_sigaction *ka, struct pt_regs *regs) { + sigset_t *oldset = sigmask_to_save(); int ret; struct mips_abi *abi = current->thread.abi; void *vdso = current->mm->context.vdso; @@ -560,7 +561,6 @@ static int handle_signal(unsigned long sig, siginfo_t *info, static void do_signal(struct pt_regs *regs) { struct k_sigaction ka; - sigset_t *oldset; siginfo_t info; int signr; @@ -572,15 +572,10 @@ static void do_signal(struct pt_regs *regs) if (!user_mode(regs)) return; - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Whee! Actually deliver the signal. */ - if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { + if (handle_signal(signr, &info, &ka, regs) == 0) { /* * A signal was successfully delivered; the saved * sigmask will have been stored in the signal frame, diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c index b7994c38eacc..26a1d98c62a1 100644 --- a/arch/mn10300/kernel/signal.c +++ b/arch/mn10300/kernel/signal.c @@ -430,8 +430,9 @@ static inline void stepback(struct pt_regs *regs) */ static int handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka, - sigset_t *oldset, struct pt_regs *regs) + struct pt_regs *regs) { + sigset_t *oldset = sigmask_to_save(); int ret; /* Are we from a system call?
*/ @@ -475,7 +476,6 @@ static void do_signal(struct pt_regs *regs) { struct k_sigaction ka; siginfo_t info; - sigset_t *oldset; int signr; /* we want the common case to go fast, which is why we may in certain @@ -483,14 +483,9 @@ static void do_signal(struct pt_regs *regs) if (!user_mode(regs)) return; - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { + if (handle_signal(signr, &info, &ka, regs) == 0) { /* a signal was successfully delivered; the saved * sigmask will have been stored in the signal frame, * and will be restored by sigreturn, so we can simply diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c index 266c6fd2eb5c..721c584ff44a 100644 --- a/arch/openrisc/kernel/signal.c +++ b/arch/openrisc/kernel/signal.c @@ -254,11 +254,11 @@ give_sigsegv: static inline int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, - sigset_t *oldset, struct pt_regs *regs) + struct pt_regs *regs) { int ret; - ret = setup_rt_frame(sig, ka, info, oldset, regs); + ret = setup_rt_frame(sig, ka, info, sigmask_to_save(), regs); if (ret) return ret; @@ -341,15 +341,9 @@ void do_signal(struct pt_regs *regs) * back */ restore_saved_sigmask(); } else { /* signr > 0 */ - sigset_t *oldset; - - if (current_thread_info()->flags & _TIF_RESTORE_SIGMASK) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; /* Whee! Actually deliver the signal.
*/ - if (!handle_signal(signr, &info, &ka, oldset, regs)) { + if (!handle_signal(signr, &info, &ka, regs)) { /* a signal was successfully delivered; the saved * sigmask will have been stored in the signal frame, * and will be restored by sigreturn, so we can simply diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index 277cacadf653..441b25992846 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -443,8 +443,9 @@ give_sigsegv: static long handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, - sigset_t *oldset, struct pt_regs *regs, int in_syscall) + struct pt_regs *regs, int in_syscall) { + sigset_t *oldset = sigmask_to_save(); DBG(1,"handle_signal: sig=%ld, ka=%p, info=%p, oldset=%p, regs=%p\n", sig, ka, info, oldset, regs); @@ -568,28 +569,17 @@ do_signal(struct pt_regs *regs, long in_syscall) siginfo_t info; struct k_sigaction ka; int signr; - sigset_t *oldset; - DBG(1,"\ndo_signal: oldset=0x%p, regs=0x%p, sr7 %#lx, in_syscall=%d\n", - oldset, regs, regs->sr[7], in_syscall); + DBG(1,"\ndo_signal: regs=0x%p, sr7 %#lx, in_syscall=%d\n", + regs, regs->sr[7], in_syscall); /* Everyone else checks to see if they are in kernel mode at this point and exits if that's the case. I'm not sure why we would be called in that case, but for some reason we are. */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - - DBG(1,"do_signal: oldset %08lx / %08lx\n", - oldset->sig[0], oldset->sig[1]); - - /* May need to force signal if handle_signal failed to deliver */ while (1) { - signr = get_signal_to_deliver(&info, &ka, regs, NULL); DBG(3,"do_signal: signr = %d, regs->gr[28] = %ld\n", signr, regs->gr[28]); @@ -603,8 +593,7 @@ do_signal(struct pt_regs *regs, long in_syscall) /* Whee! Actually deliver the signal. If the delivery failed, we need to continue to iterate in this loop so we can deliver the SIGSEGV...
*/ - if (handle_signal(signr, &info, &ka, oldset, - regs, in_syscall)) { + if (handle_signal(signr, &info, &ka, regs, in_syscall)) { DBG(1,KERN_DEBUG "do_signal: Exit (success), regs->gr[28] = %ld\n", regs->gr[28]); if (test_thread_flag(TIF_RESTORE_SIGMASK)) diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index 0f4cc67f4268..8e9ddab7ade6 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -114,18 +114,13 @@ static void check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka, static int do_signal(struct pt_regs *regs) { - sigset_t *oldset; + sigset_t *oldset = sigmask_to_save(); siginfo_t info; int signr; struct k_sigaction ka; int ret; int is32 = is_32bit_task(); - if (current_thread_info()->local_flags & _TLF_RESTORE_SIGMASK) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - signr = get_signal_to_deliver(&info, &ka, regs, NULL); /* Is there any syscall restart business here ? */ diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 37799089c38e..c880c48a09f3 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -398,12 +398,7 @@ void do_signal(struct pt_regs *regs) siginfo_t info; int signr; struct k_sigaction ka; - sigset_t *oldset; - - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; + sigset_t *oldset = sigmask_to_save(); /* * Get signal to deliver.
When running under ptrace, at this point diff --git a/arch/score/kernel/signal.c b/arch/score/kernel/signal.c index 9e751559375b..b24dfaf2462f 100644 --- a/arch/score/kernel/signal.c +++ b/arch/score/kernel/signal.c @@ -242,7 +242,7 @@ give_sigsegv: } static int handle_signal(unsigned long sig, siginfo_t *info, - struct k_sigaction *ka, sigset_t *oldset, struct pt_regs *regs) + struct k_sigaction *ka, struct pt_regs *regs) { int ret; @@ -269,7 +269,7 @@ static int handle_signal(unsigned long sig, siginfo_t *info, /* * Set up the stack frame */ - ret = setup_rt_frame(ka, regs, sig, oldset, info); + ret = setup_rt_frame(ka, regs, sig, sigmask_to_save(), info); if (ret == 0) block_sigmask(ka, sig); @@ -280,7 +280,6 @@ static int handle_signal(unsigned long sig, siginfo_t *info, static void do_signal(struct pt_regs *regs) { struct k_sigaction ka; - sigset_t *oldset; siginfo_t info; int signr; @@ -292,15 +291,10 @@ static void do_signal(struct pt_regs *regs) if (!user_mode(regs)) return; - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Actually deliver the signal.
*/ - if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { + if (handle_signal(signr, &info, &ka, regs) == 0) { /* * A signal was successfully delivered; the saved * sigmask will have been stored in the signal frame, diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c index 92f4173ad29a..bfb3d599f032 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -524,8 +524,9 @@ handle_syscall_restart(unsigned long save_r0, struct pt_regs *regs, */ static int handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *oldset, struct pt_regs *regs, unsigned int save_r0) + struct pt_regs *regs, unsigned int save_r0) { + sigset_t *oldset = sigmask_to_save(); int ret; /* Set up the stack frame */ @@ -554,7 +555,6 @@ static void do_signal(struct pt_regs *regs, unsigned int save_r0) siginfo_t info; int signr; struct k_sigaction ka; - sigset_t *oldset; /* * We want the common case to go fast, which @@ -565,17 +565,12 @@ static void do_signal(struct pt_regs *regs, unsigned int save_r0) if (!user_mode(regs)) return; - if (current_thread_info()->status & TS_RESTORE_SIGMASK) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { handle_syscall_restart(save_r0, regs, &ka.sa); /* Whee! Actually deliver the signal.
*/ - if (handle_signal(signr, &ka, &info, oldset, + if (handle_signal(signr, &ka, &info, regs, save_r0) == 0) { /* * A signal was successfully delivered; the saved diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c index 6e191ef0aa62..cc22d2b2e3f2 100644 --- a/arch/sh/kernel/signal_64.c +++ b/arch/sh/kernel/signal_64.c @@ -45,7 +45,7 @@ static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, - sigset_t *oldset, struct pt_regs * regs); + struct pt_regs * regs); static inline void handle_syscall_restart(struct pt_regs *regs, struct sigaction *sa) @@ -88,7 +88,6 @@ static void do_signal(struct pt_regs *regs) siginfo_t info; int signr; struct k_sigaction ka; - sigset_t *oldset; /* * We want the common case to go fast, which @@ -99,17 +98,12 @@ static void do_signal(struct pt_regs *regs) if (!user_mode(regs)) return; - if (current_thread_info()->status & TS_RESTORE_SIGMASK) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - signr = get_signal_to_deliver(&info, &ka, regs, 0); if (signr > 0) { handle_syscall_restart(regs, &ka.sa); /* Whee! Actually deliver the signal.
*/ - if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { + if (handle_signal(signr, &info, &ka, regs) == 0) { /* * If a signal was successfully delivered, the * saved sigmask is in its frame, and we can @@ -656,8 +650,9 @@ give_sigsegv: */ static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, - sigset_t *oldset, struct pt_regs * regs) + struct pt_regs * regs) { + sigset_t *oldset = sigmask_to_save(); int ret; /* Set up the stack frame */ diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index 9dd97d2e171e..5d74410c787b 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -451,8 +451,9 @@ sigsegv: static inline int handle_signal(unsigned long signr, struct k_sigaction *ka, - siginfo_t *info, sigset_t *oldset, struct pt_regs *regs) + siginfo_t *info, struct pt_regs *regs) { + sigset_t *oldset = sigmask_to_save(); int err; if (ka->sa.sa_flags & SA_SIGINFO) @@ -498,7 +499,6 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0) { struct k_sigaction ka; int restart_syscall; - sigset_t *oldset; siginfo_t info; int signr; @@ -523,11 +523,6 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0) if (pt_regs_is_syscall(regs) && (regs->psr & PSR_C)) regs->u_regs[UREG_G6] = orig_i0; - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - signr = get_signal_to_deliver(&info, &ka, regs, NULL); /* If the debugger messes with the program counter, it clears @@ -544,7 +539,7 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0) if (signr > 0) { if (restart_syscall) syscall_restart(orig_i0, regs, &ka.sa); - if (handle_signal(signr, &ka, &info, oldset, regs) == 0) { + if (handle_signal(signr, &ka, &info, regs) == 0) { /* a signal was successfully delivered; the saved * sigmask will have been stored in the signal frame, * and will be restored by sigreturn, so we can simply diff --git
a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index 55b820ee0ac9..088a733f83f9 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -512,7 +512,7 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0) { struct k_sigaction ka; int restart_syscall; - sigset_t *oldset; + sigset_t *oldset = sigmask_to_save(); siginfo_t info; int signr; @@ -538,11 +538,6 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0) (regs->tstate & (TSTATE_XCARRY | TSTATE_ICARRY))) regs->u_regs[UREG_G6] = orig_i0; - if (current_thread_info()->status & TS_RESTORE_SIGMASK) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - #ifdef CONFIG_COMPAT if (test_thread_flag(TIF_32BIT)) { extern void do_signal32(sigset_t *, struct pt_regs *); diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c index 62b3493ea77d..588c28b2db58 100644 --- a/arch/tile/kernel/signal.c +++ b/arch/tile/kernel/signal.c @@ -243,9 +243,10 @@ give_sigsegv: */ static int handle_signal(unsigned long sig, siginfo_t *info, - struct k_sigaction *ka, sigset_t *oldset, + struct k_sigaction *ka, struct pt_regs *regs) { + sigset_t *oldset = sigmask_to_save(); int ret; /* Are we from a system call? */ @@ -299,7 +300,6 @@ void do_signal(struct pt_regs *regs) siginfo_t info; int signr; struct k_sigaction ka; - sigset_t *oldset; /* * i386 will check if we're coming from kernel mode and bail out @@ -308,15 +308,10 @@ void do_signal(struct pt_regs *regs) * helpful, we can reinstate the check on "!user_mode(regs)". */ - if (current_thread_info()->status & TS_RESTORE_SIGMASK) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Whee! Actually deliver the signal.
*/ - if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { + if (handle_signal(signr, &info, &ka, regs) == 0) { /* * A signal was successfully delivered; the saved * sigmask will have been stored in the signal frame, diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c index 6acf13c1740b..909e9b8d6612 100644 --- a/arch/um/kernel/signal.c +++ b/arch/um/kernel/signal.c @@ -23,9 +23,9 @@ EXPORT_SYMBOL(unblock_signals); * OK, we're invoking a handler */ static int handle_signal(struct pt_regs *regs, unsigned long signr, - struct k_sigaction *ka, siginfo_t *info, - sigset_t *oldset) + struct k_sigaction *ka, siginfo_t *info) { + sigset_t *oldset = sigmask_to_save(); unsigned long sp; int err; @@ -77,14 +77,9 @@ static int kern_do_signal(struct pt_regs *regs) int sig, handled_sig = 0; while ((sig = get_signal_to_deliver(&info, &ka_copy, regs, NULL)) > 0) { - sigset_t *oldset; - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; handled_sig = 1; /* Whee! Actually deliver the signal.
*/ - if (!handle_signal(regs, sig, &ka_copy, &info, oldset)) { + if (!handle_signal(regs, sig, &ka_copy, &info)) { /* * a signal was successfully delivered; the saved * sigmask will have been stored in the signal frame, diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c index 65a5ed3b6f2a..bf23194dc74d 100644 --- a/arch/unicore32/kernel/signal.c +++ b/arch/unicore32/kernel/signal.c @@ -313,12 +313,11 @@ static inline void setup_syscall_restart(struct pt_regs *regs) * OK, we're invoking a handler */ static int handle_signal(unsigned long sig, struct k_sigaction *ka, - siginfo_t *info, sigset_t *oldset, - struct pt_regs *regs, int syscall) + siginfo_t *info, struct pt_regs *regs, int syscall) { struct thread_info *thread = current_thread_info(); struct task_struct *tsk = current; - sigset_t blocked; + sigset_t *oldset = sigmask_to_save(); int usig = sig; int ret; @@ -404,13 +403,7 @@ static void do_signal(struct pt_regs *regs, int syscall) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - sigset_t *oldset; - - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - if (handle_signal(signr, &ka, &info, oldset, regs, syscall) + if (handle_signal(signr, &ka, &info, regs, syscall) == 0) { /* * A signal was successfully delivered; the saved diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 25a4a81a51aa..56f3062c5111 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -647,12 +647,9 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, struct pt_regs *regs) { int usig = signr_convert(sig); - sigset_t *set = &current->blocked; + sigset_t *set = sigmask_to_save(); int ret; - if (current_thread_info()->status & TS_RESTORE_SIGMASK) - set = &current->saved_sigmask; - /* Set up the stack frame */ if (is_ia32) { if (ka->sa.sa_flags & SA_SIGINFO) diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index
8c4e751e3b83..e4b06e2d4eb9 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -452,16 +452,10 @@ static void do_signal(struct pt_regs *regs) siginfo_t info; int signr; struct k_sigaction ka; - sigset_t oldset; if (try_to_freeze()) goto no_signal; - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - oldset = &current->saved_sigmask; - else - oldset = &current->blocked; - task_pt_regs(current)->icountlevel = 0; signr = get_signal_to_deliver(&info, &ka, regs, NULL); @@ -501,7 +495,7 @@ static void do_signal(struct pt_regs *regs) /* Whee! Actually deliver the signal. */ /* Set up the stack frame */ - ret = setup_frame(signr, &ka, &info, oldset, regs); + ret = setup_frame(signr, &ka, &info, sigmask_to_save(), regs); if (ret) return; diff --git a/include/linux/sched.h b/include/linux/sched.h index f1b46b88f6f5..ded3fb63fb06 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2213,6 +2213,14 @@ static inline void restore_saved_sigmask(void) set_current_blocked(&current->saved_sigmask); } +static inline sigset_t *sigmask_to_save(void) +{ + sigset_t *res = &current->blocked; + if (unlikely(test_restore_sigmask())) + res = &current->saved_sigmask; + return res; +} + static inline int kill_cad_pid(int sig, int priv) { return kill_pid(cad_pid, sig, priv); -- cgit v1.2.3 From a610d6e672d6d3723e8da257ad4a8a288a8f2f89 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 21 May 2012 23:42:15 -0400 Subject: pull clearing RESTORE_SIGMASK into block_sigmask() Signed-off-by: Al Viro --- arch/alpha/kernel/signal.c | 5 ----- arch/arm/kernel/signal.c | 18 +++--------------- arch/avr32/kernel/signal.c | 11 ++++------- arch/blackfin/kernel/signal.c | 24 +++++++----------------- arch/c6x/kernel/signal.c | 23 ++++++----------------- arch/cris/arch-v10/kernel/signal.c | 14 ++------------ arch/cris/arch-v32/kernel/signal.c | 15 ++------------- arch/frv/kernel/signal.c | 24 +++++++----------------- arch/h8300/kernel/signal.c | 4 +--- arch/hexagon/kernel/signal.c | 26
++++++-------------------- arch/ia64/kernel/signal.c | 10 +--------- arch/m32r/kernel/signal.c | 8 +++----- arch/m68k/kernel/signal.c | 2 -- arch/microblaze/kernel/signal.c | 17 +++-------------- arch/mips/kernel/signal.c | 18 +++--------------- arch/mn10300/kernel/signal.c | 18 +++++------------- arch/openrisc/kernel/signal.c | 2 -- arch/parisc/kernel/signal.c | 10 ++++------ arch/powerpc/kernel/signal.c | 7 ------- arch/s390/kernel/compat_signal.c | 10 +++++++--- arch/s390/kernel/entry.h | 2 +- arch/s390/kernel/signal.c | 32 +++++++++++--------------------- arch/score/kernel/signal.c | 24 +++++------------------- arch/sh/kernel/signal_32.c | 26 +++++++------------------- arch/sh/kernel/signal_64.c | 24 ++++++++---------------- arch/sparc/kernel/signal32.c | 15 +++------------ arch/sparc/kernel/signal_32.c | 16 +++------------- arch/sparc/kernel/signal_64.c | 15 +++------------ arch/tile/kernel/signal.c | 25 +++++-------------------- arch/um/kernel/signal.c | 16 ++-------------- arch/unicore32/kernel/signal.c | 18 +++--------------- arch/x86/kernel/signal.c | 31 +++++++++---------------------- arch/xtensa/kernel/signal.c | 1 - kernel/signal.c | 6 ++++++ 34 files changed, 130 insertions(+), 387 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index f1e7d2aa2586..bb45a8813393 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -481,11 +481,6 @@ handle_signal(int sig, struct k_sigaction *ka, siginfo_t *info, return; } block_sigmask(ka, sig); - /* A signal was successfully delivered, and the - saved sigmask was stored on the signal frame, - and will be restored by sigreturn. So we can - simply clear the restore sigmask flag. 
*/ - clear_thread_flag(TIF_RESTORE_SIGMASK); } static inline void diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 2e66c93973c3..7f9abd75fc2e 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -528,7 +528,7 @@ setup_rt_frame(int usig, struct k_sigaction *ka, siginfo_t *info, /* * OK, we're invoking a handler */ -static int +static void handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, struct pt_regs *regs) { @@ -559,17 +559,14 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, if (ret != 0) { force_sigsegv(sig, tsk); - return ret; + return; } /* * Block the signal if we were successful. */ block_sigmask(ka, sig); - tracehook_signal_handler(sig, info, ka, regs, 0); - - return 0; } /* @@ -633,16 +630,7 @@ static void do_signal(struct pt_regs *regs, int syscall) clear_thread_flag(TIF_SYSCALL_RESTARTSYS); } - if (handle_signal(signr, &ka, &info, regs) == 0) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag. - */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - } + handle_signal(signr, &ka, &info, regs); return; } diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c index 0e2c0527c9fe..dc7875a0ad79 100644 --- a/arch/avr32/kernel/signal.c +++ b/arch/avr32/kernel/signal.c @@ -238,16 +238,13 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, */ ret |= !valid_user_regs(regs); - if (ret != 0) { - force_sigsegv(sig, current); - return; - } - /* * Block the signal if we were successful. 
*/ - block_sigmask(ka, sig); - clear_thread_flag(TIF_RESTORE_SIGMASK); + if (ret != 0) + force_sigsegv(sig, current); + else + block_sigmask(ka, sig); } /* diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c index 7f4205ddfa4d..b25cbfef8192 100644 --- a/arch/blackfin/kernel/signal.c +++ b/arch/blackfin/kernel/signal.c @@ -247,7 +247,7 @@ handle_restart(struct pt_regs *regs, struct k_sigaction *ka, int has_handler) /* * OK, we're invoking a handler */ -static int +static void handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { @@ -260,11 +260,12 @@ handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka, /* set up the stack frame */ ret = setup_rt_frame(sig, ka, info, sigmask_to_save(), regs); + if (ret) + return; - if (ret == 0) - block_sigmask(ka, sig); - - return ret; + block_sigmask(ka, sig); + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); } /* @@ -290,18 +291,7 @@ asmlinkage void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Whee! Actually deliver the signal. 
*/ - if (handle_signal(signr, &info, &ka, regs) == 0) { - /* a signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - - tracehook_signal_handler(signr, &info, &ka, regs, - test_thread_flag(TIF_SINGLESTEP)); - } - + handle_signal(signr, &info, &ka, regs); return; } diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c index 38bb501eb117..f39346f1f2d6 100644 --- a/arch/c6x/kernel/signal.c +++ b/arch/c6x/kernel/signal.c @@ -248,7 +248,7 @@ do_restart: /* * handle the actual delivery of a signal to userspace */ -static int handle_signal(int sig, +static void handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs, int syscall) { @@ -277,11 +277,10 @@ static int handle_signal(int sig, } /* Set up the stack frame */ - ret = setup_rt_frame(sig, ka, info, sigmask_to_save(), regs); - if (ret == 0) - block_sigmask(ka, sig); - - return ret; + if (setup_rt_frame(sig, ka, info, sigmask_to_save(), regs) < 0) + return; + block_sigmask(ka, sig); + tracehook_signal_handler(sig, info, ka, regs, 0); } /* @@ -300,17 +299,7 @@ static void do_signal(struct pt_regs *regs, int syscall) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - if (handle_signal(signr, &info, &ka, regs, syscall) == 0) { - /* a signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - - tracehook_signal_handler(signr, &info, &ka, regs, 0); - } - + handle_signal(signr, &info, &ka, regs, syscall); return; } diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c index 
09a4cf4eb08a..46c8ca605e4d 100644 --- a/arch/cris/arch-v10/kernel/signal.c +++ b/arch/cris/arch-v10/kernel/signal.c @@ -415,7 +415,7 @@ give_sigsegv: * OK, we're invoking a handler */ -static inline int handle_signal(int canrestart, unsigned long sig, +static inline void handle_signal(int canrestart, unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { @@ -458,8 +458,6 @@ static inline int handle_signal(int canrestart, unsigned long sig, if (ret == 0) block_sigmask(ka, sig); - - return ret; } /* @@ -492,15 +490,7 @@ void do_signal(int canrestart, struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Whee! Actually deliver the signal. */ - if (handle_signal(canrestart, signr, &info, &ka, - regs)) { - /* a signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - } + handle_signal(canrestart, signr, &info, &ka, regs); return; } diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c index d52276ddae4b..e0431328b7cd 100644 --- a/arch/cris/arch-v32/kernel/signal.c +++ b/arch/cris/arch-v32/kernel/signal.c @@ -434,7 +434,7 @@ give_sigsegv: } /* Invoke a signal handler to, well, handle the signal. */ -static inline int +static inline void handle_signal(int canrestart, unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs * regs) @@ -491,8 +491,6 @@ handle_signal(int canrestart, unsigned long sig, if (ret == 0) block_sigmask(ka, sig); - - return ret; } /* @@ -525,16 +523,7 @@ do_signal(int canrestart, struct pt_regs *regs) if (signr > 0) { /* Whee! Actually deliver the signal. 
*/ - if (handle_signal(canrestart, signr, &info, &ka, - regs)) { - /* a signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - } - + handle_signal(canrestart, signr, &info, &ka, regs); return; } diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c index 22efe8d25038..8dd0492bfb7b 100644 --- a/arch/frv/kernel/signal.c +++ b/arch/frv/kernel/signal.c @@ -426,7 +426,7 @@ give_sigsegv: /* * OK, we're invoking a handler */ -static int handle_signal(unsigned long sig, siginfo_t *info, +static void handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka) { sigset_t *oldset = sigmask_to_save(); @@ -461,11 +461,12 @@ static int handle_signal(unsigned long sig, siginfo_t *info, else ret = setup_frame(sig, ka, oldset); - if (ret == 0) - block_sigmask(ka, sig); - - return ret; + if (ret) + return; + block_sigmask(ka, sig); + tracehook_signal_handler(sig, info, ka, __frame, + test_thread_flag(TIF_SINGLESTEP)); } /* end handle_signal() */ /*****************************************************************************/ @@ -495,18 +496,7 @@ static void do_signal(void) signr = get_signal_to_deliver(&info, &ka, __frame, NULL); if (signr > 0) { - if (handle_signal(signr, &info, &ka) == 0) { - /* a signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - - tracehook_signal_handler(signr, &info, &ka, __frame, - test_thread_flag(TIF_SINGLESTEP)); - } - + handle_signal(signr, &info, &ka); return; } diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index d4d2f72672ad..eac26c9ffc44 100644 
--- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -442,10 +442,8 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, else ret = setup_frame(sig, ka, oldset, regs); - if (!ret) { + if (!ret) block_sigmask(ka, sig); - clear_thread_flag(TIF_RESTORE_SIGMASK); - } } /* diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c index f73fcee09bac..5f7d7c8a1328 100644 --- a/arch/hexagon/kernel/signal.c +++ b/arch/hexagon/kernel/signal.c @@ -149,11 +149,9 @@ sigsegv: /* * Setup invocation of signal handler */ -static int handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka, +static void handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { - int rc; - /* * If we're handling a signal that aborted a system call, * set up the error return value before adding the signal @@ -186,15 +184,13 @@ static int handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka, * Set up the stack frame; not doing the SA_SIGINFO thing. We * only set up the rt_frame flavor. */ - rc = setup_rt_frame(sig, ka, info, sigmask_to_save(), regs); - /* If there was an error on setup, no signal was delivered. */ - if (rc) - return rc; + if (setup_rt_frame(sig, ka, info, sigmask_to_save(), regs) < 0) + return; block_sigmask(ka, sig); - - return 0; + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); } /* @@ -215,17 +211,7 @@ static void do_signal(struct pt_regs *regs) signo = get_signal_to_deliver(&info, &sigact, regs, NULL); if (signo > 0) { - if (handle_signal(signo, &info, &sigact, regs) == 0) { - /* - * Successful delivery case. The saved sigmask is - * stored in the signal frame, and will be restored - * by sigreturn. We can clear the TIF flag. 
- */ - clear_thread_flag(TIF_RESTORE_SIGMASK); - - tracehook_signal_handler(signo, &info, &sigact, regs, - test_thread_flag(TIF_SINGLESTEP)); - } + handle_signal(signo, &info, &sigact, regs); return; } diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index 9fee6d6a3f21..dc6fe6573465 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -501,16 +501,8 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall) * Whee! Actually deliver the signal. If the delivery failed, we need to * continue to iterate in this loop so we can deliver the SIGSEGV... */ - if (handle_signal(signr, &ka, &info, scr)) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TS_RESTORE_SIGMASK flag. - */ - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; + if (handle_signal(signr, &ka, &info, scr)) return; - } } /* Did we come from a system call? */ diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c index e0d6d1079f33..970f46dbf24f 100644 --- a/arch/m32r/kernel/signal.c +++ b/arch/m32r/kernel/signal.c @@ -267,7 +267,7 @@ static int prev_insn(struct pt_regs *regs) * OK, we're invoking a handler */ -static int +static void handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, struct pt_regs *regs) { @@ -295,10 +295,9 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, /* Set up the stack frame */ if (setup_rt_frame(sig, ka, info, sigmask_to_save(), regs)) - return -EFAULT; + return; block_sigmask(ka, sig); - return 0; } /* @@ -333,8 +332,7 @@ static void do_signal(struct pt_regs *regs) */ /* Whee! Actually deliver the signal. 
*/ - if (handle_signal(signr, &ka, &info, regs) == 0) - clear_thread_flag(TIF_RESTORE_SIGMASK); + handle_signal(signr, &ka, &info, regs); return; } diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index c83eb5a8ed8b..6dbee8a167a5 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -1147,8 +1147,6 @@ handle_signal(int sig, struct k_sigaction *ka, siginfo_t *info, regs->sr &= ~0x8000; send_sig(SIGTRAP, current, 1); } - - clear_thread_flag(TIF_RESTORE_SIGMASK); } /* diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c index fd2de5718a4e..03641199666e 100644 --- a/arch/microblaze/kernel/signal.c +++ b/arch/microblaze/kernel/signal.c @@ -310,7 +310,7 @@ do_restart: * OK, we're invoking a handler */ -static int +static void handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, struct pt_regs *regs) { @@ -324,11 +324,9 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, ret = setup_rt_frame(sig, ka, NULL, oldset, regs); if (ret) - return ret; + return; block_sigmask(ka, sig); - - return 0; } /* @@ -356,16 +354,7 @@ static void do_signal(struct pt_regs *regs, int in_syscall) /* Whee! Actually deliver the signal. */ if (in_syscall) handle_restart(regs, &ka, 1); - if (!handle_signal(signr, &ka, &info, oldset, regs)) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TS_RESTORE_SIGMASK flag. 
- */ - current_thread_info()->status &= - ~TS_RESTORE_SIGMASK; - } + handle_signal(signr, &ka, &info, regs); return; } diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index 18355060f241..53c6e90082f0 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -514,7 +514,7 @@ struct mips_abi mips_abi = { .restart = __NR_restart_syscall }; -static int handle_signal(unsigned long sig, siginfo_t *info, +static void handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { sigset_t *oldset = sigmask_to_save(); @@ -551,11 +551,9 @@ static int handle_signal(unsigned long sig, siginfo_t *info, ka, regs, sig, oldset); if (ret) - return ret; + return; block_sigmask(ka, sig); - - return ret; } static void do_signal(struct pt_regs *regs) @@ -575,17 +573,7 @@ static void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Whee! Actually deliver the signal. */ - if (handle_signal(signr, &info, &ka, regs) == 0) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag. 
- */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - } - + handle_signal(signr, &info, &ka, regs); return; } diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c index 26a1d98c62a1..1715478f4e94 100644 --- a/arch/mn10300/kernel/signal.c +++ b/arch/mn10300/kernel/signal.c @@ -462,11 +462,12 @@ static int handle_signal(int sig, ret = setup_rt_frame(sig, ka, info, oldset, regs); else ret = setup_frame(sig, ka, oldset, regs); + if (ret) + return; - if (ret == 0) - block_sigmask(ka, sig); - - return ret; + block_sigmask(ka, sig); + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); } /* @@ -486,15 +487,6 @@ static void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { if (handle_signal(signr, &info, &ka, regs) == 0) { - /* a signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - - tracehook_signal_handler(signr, &info, &ka, regs, - test_thread_flag(TIF_SINGLESTEP)); } return; diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c index 6c41778410e6..aa1105c1618f 100644 --- a/arch/openrisc/kernel/signal.c +++ b/arch/openrisc/kernel/signal.c @@ -263,8 +263,6 @@ handle_signal(unsigned long sig, return; block_sigmask(ka, sig); - clear_thread_flag(TIF_RESTORE_SIGMASK); - tracehook_signal_handler(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); } diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index 441b25992846..d6ddc572eba1 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -459,6 +459,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, test_thread_flag(TIF_SINGLESTEP) || 
test_thread_flag(TIF_BLOCKSTEP)); + DBG(1,KERN_DEBUG "do_signal: Exit (success), regs->gr[28] = %ld\n", + regs->gr[28]); + return 1; } @@ -593,13 +596,8 @@ do_signal(struct pt_regs *regs, long in_syscall) /* Whee! Actually deliver the signal. If the delivery failed, we need to continue to iterate in this loop so we can deliver the SIGSEGV... */ - if (handle_signal(signr, &info, &ka, regs, in_syscall)) { - DBG(1,KERN_DEBUG "do_signal: Exit (success), regs->gr[28] = %ld\n", - regs->gr[28]); - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); + if (handle_signal(signr, &info, &ka, regs, in_syscall)) return; - } } /* end of while(1) looping forever if we can't force a signal */ diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index 8e9ddab7ade6..d926d2e4611a 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -159,13 +159,6 @@ static int do_signal(struct pt_regs *regs) regs->trap = 0; if (ret) { block_sigmask(&ka, signr); - - /* - * A signal was successfully delivered; the saved sigmask is in - * its frame, and we can clear the TLF_RESTORE_SIGMASK flag. - */ - current_thread_info()->local_flags &= ~_TLF_RESTORE_SIGMASK; - /* * Let tracing know that we've done the handler setup. 
*/ diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 377c096ca4a7..233db1d68eee 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -572,7 +572,7 @@ give_sigsegv: * OK, we're invoking a handler */ -int handle_signal32(unsigned long sig, struct k_sigaction *ka, +void handle_signal32(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *oldset, struct pt_regs *regs) { int ret; @@ -583,8 +583,12 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka, else ret = setup_frame32(sig, ka, oldset, regs); if (ret) - return ret; + return; block_sigmask(ka, sig); - return 0; + /* + * Let tracing know that we've done the handler setup. + */ + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLE_STEP)); } diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index 6cdddac93a2e..f66a229ab0b3 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -31,7 +31,7 @@ void do_per_trap(struct pt_regs *regs); void syscall_trace(struct pt_regs *regs, int entryexit); void kernel_stack_overflow(struct pt_regs * regs); void do_signal(struct pt_regs *regs); -int handle_signal32(unsigned long sig, struct k_sigaction *ka, +void handle_signal32(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *oldset, struct pt_regs *regs); void do_notify_resume(struct pt_regs *regs); diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index c880c48a09f3..7f9a862a161a 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -367,7 +367,7 @@ give_sigsegv: return -EFAULT; } -static int handle_signal(unsigned long sig, struct k_sigaction *ka, +static void handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *oldset, struct pt_regs *regs) { @@ -379,9 +379,13 @@ static int handle_signal(unsigned long sig, struct k_sigaction *ka, else ret = setup_frame(sig, ka, oldset, regs); if (ret) - 
return ret; + return; block_sigmask(ka, sig); - return 0; + /* + * Let tracing know that we've done the handler setup. + */ + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLE_STEP)); } /* @@ -436,24 +440,10 @@ void do_signal(struct pt_regs *regs) /* No longer in a system call */ clear_thread_flag(TIF_SYSCALL); - if ((is_compat_task() ? - handle_signal32(signr, &ka, &info, oldset, regs) : - handle_signal(signr, &ka, &info, oldset, regs)) == 0) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag. - */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - - /* - * Let tracing know that we've done the handler setup. - */ - tracehook_signal_handler(signr, &info, &ka, regs, - test_thread_flag(TIF_SINGLE_STEP)); - } + if (is_compat_task()) + handle_signal32(signr, &ka, &info, oldset, regs); + else + handle_signal(signr, &ka, &info, oldset, regs); return; } diff --git a/arch/score/kernel/signal.c b/arch/score/kernel/signal.c index b24dfaf2462f..13e0eed0e301 100644 --- a/arch/score/kernel/signal.c +++ b/arch/score/kernel/signal.c @@ -241,11 +241,9 @@ give_sigsegv: return -EFAULT; } -static int handle_signal(unsigned long sig, siginfo_t *info, +static void handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { - int ret; - if (regs->is_syscall) { switch (regs->regs[4]) { case ERESTART_RESTARTBLOCK: @@ -269,12 +267,10 @@ static int handle_signal(unsigned long sig, siginfo_t *info, /* * Set up the stack frame */ - ret = setup_rt_frame(ka, regs, sig, sigmask_to_save(), info); - - if (ret == 0) - block_sigmask(ka, sig); + if (setup_rt_frame(ka, regs, sig, sigmask_to_save(), info) < 0) + return; - return ret; + block_sigmask(ka, sig); } static void do_signal(struct pt_regs *regs) @@ -294,17 +290,7 @@ static void 
do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Actually deliver the signal. */ - if (handle_signal(signr, &info, &ka, regs) == 0) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag. - */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - } - + handle_signal(signr, &info, &ka, regs); return; } diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c index bfb3d599f032..2675a97f374f 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -522,7 +522,7 @@ handle_syscall_restart(unsigned long save_r0, struct pt_regs *regs, /* * OK, we're invoking a handler */ -static int +static void handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, struct pt_regs *regs, unsigned int save_r0) { @@ -535,10 +535,11 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, else ret = setup_frame(sig, ka, oldset, regs); - if (ret == 0) - block_sigmask(ka, sig); - - return ret; + if (ret) + return; + block_sigmask(ka, sig); + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); } /* @@ -570,20 +571,7 @@ static void do_signal(struct pt_regs *regs, unsigned int save_r0) handle_syscall_restart(save_r0, regs, &ka.sa); /* Whee! Actually deliver the signal. 
*/ - if (handle_signal(signr, &ka, &info, - regs, save_r0) == 0) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TS_RESTORE_SIGMASK flag - */ - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - - tracehook_signal_handler(signr, &info, &ka, regs, - test_thread_flag(TIF_SINGLESTEP)); - } - + handle_signal(signr, &ka, &info, regs, save_r0); return; } diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c index aeeab070aaa9..7075c63bfc6f 100644 --- a/arch/sh/kernel/signal_64.c +++ b/arch/sh/kernel/signal_64.c @@ -43,7 +43,7 @@ #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) -static int +static void handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs * regs); @@ -103,17 +103,7 @@ static void do_signal(struct pt_regs *regs) handle_syscall_restart(regs, &ka.sa); /* Whee! Actually deliver the signal. */ - if (handle_signal(signr, &info, &ka, regs) == 0) { - /* - * If a signal was successfully delivered, the - * saved sigmask is in its frame, and we can - * clear the TS_RESTORE_SIGMASK flag. 
- */ - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - - tracehook_signal_handler(signr, &info, &ka, regs, - test_thread_flag(TIF_SINGLESTEP)); - } + handle_signal(signr, &info, &ka, regs); return; } @@ -648,7 +638,7 @@ give_sigsegv: /* * OK, we're invoking a handler */ -static int +static void handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs * regs) { @@ -661,10 +651,12 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, else ret = setup_frame(sig, ka, oldset, regs); - if (ret == 0) - block_sigmask(ka, sig); + if (ret) + return; - return ret; + block_sigmask(ka, sig); + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); } asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags) diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index 88e0d8122d2c..8c93c00922a7 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -775,7 +775,7 @@ sigsegv: return -EFAULT; } -static inline int handle_signal32(unsigned long signr, struct k_sigaction *ka, +static inline void handle_signal32(unsigned long signr, struct k_sigaction *ka, siginfo_t *info, sigset_t *oldset, struct pt_regs *regs) { @@ -787,12 +787,10 @@ static inline int handle_signal32(unsigned long signr, struct k_sigaction *ka, err = setup_frame32(ka, regs, signr, oldset); if (err) - return err; + return; block_sigmask(ka, signr); tracehook_signal_handler(signr, info, ka, regs, 0); - - return 0; } static inline void syscall_restart32(unsigned long orig_i0, struct pt_regs *regs, @@ -841,14 +839,7 @@ void do_signal32(sigset_t *oldset, struct pt_regs * regs) if (signr > 0) { if (restart_syscall) syscall_restart32(orig_i0, regs, &ka.sa); - if (handle_signal32(signr, &ka, &info, oldset, regs) == 0) { - /* A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, 
so we can simply - * clear the TS_RESTORE_SIGMASK flag. - */ - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - } + handle_signal32(signr, &ka, &info, oldset, regs); return; } if (restart_syscall && diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index 5d74410c787b..ee81b90c532f 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -449,7 +449,7 @@ sigsegv: return -EFAULT; } -static inline int +static inline void handle_signal(unsigned long signr, struct k_sigaction *ka, siginfo_t *info, struct pt_regs *regs) { @@ -462,12 +462,10 @@ handle_signal(unsigned long signr, struct k_sigaction *ka, err = setup_frame(ka, regs, signr, oldset); if (err) - return err; + return; block_sigmask(ka, signr); tracehook_signal_handler(signr, info, ka, regs, 0); - - return 0; } static inline void syscall_restart(unsigned long orig_i0, struct pt_regs *regs, @@ -539,15 +537,7 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0) if (signr > 0) { if (restart_syscall) syscall_restart(orig_i0, regs, &ka.sa); - if (handle_signal(signr, &ka, &info, regs) == 0) { - /* a signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag. 
- */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - } + handle_signal(signr, &ka, &info, regs); return; } if (restart_syscall && diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index 088a733f83f9..febbc4b697ba 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -466,7 +466,7 @@ sigsegv: return -EFAULT; } -static inline int handle_signal(unsigned long signr, struct k_sigaction *ka, +static inline void handle_signal(unsigned long signr, struct k_sigaction *ka, siginfo_t *info, sigset_t *oldset, struct pt_regs *regs) { @@ -475,12 +475,10 @@ static inline int handle_signal(unsigned long signr, struct k_sigaction *ka, err = setup_rt_frame(ka, regs, signr, oldset, (ka->sa.sa_flags & SA_SIGINFO) ? info : NULL); if (err) - return err; + return; block_sigmask(ka, signr); tracehook_signal_handler(signr, info, ka, regs, 0); - - return 0; } static inline void syscall_restart(unsigned long orig_i0, struct pt_regs *regs, @@ -558,14 +556,7 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0) if (signr > 0) { if (restart_syscall) syscall_restart(orig_i0, regs, &ka.sa); - if (handle_signal(signr, &ka, &info, oldset, regs) == 0) { - /* A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TS_RESTORE_SIGMASK flag. 
- */ - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - } + handle_signal(signr, &ka, &info, oldset, regs); return; } if (restart_syscall && diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c index 588c28b2db58..9b71bfd4913d 100644 --- a/arch/tile/kernel/signal.c +++ b/arch/tile/kernel/signal.c @@ -242,7 +242,7 @@ give_sigsegv: * OK, we're invoking a handler */ -static int handle_signal(unsigned long sig, siginfo_t *info, +static void handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { @@ -279,15 +279,9 @@ static int handle_signal(unsigned long sig, siginfo_t *info, else #endif ret = setup_rt_frame(sig, ka, info, oldset, regs); - if (ret == 0) { - /* This code is only called from system calls or from - * the work_pending path in the return-to-user code, and - * either way we can re-enable interrupts unconditionally. - */ - block_sigmask(ka, sig); - } - - return ret; + if (ret) + return; + block_sigmask(ka, sig); } /* @@ -311,16 +305,7 @@ void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Whee! Actually deliver the signal. */ - if (handle_signal(signr, &info, &ka, regs) == 0) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TS_RESTORE_SIGMASK flag. 
- */ - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - } - + handle_signal(signr, &info, &ka, regs); goto done; } diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c index 909e9b8d6612..549a51c8e54f 100644 --- a/arch/um/kernel/signal.c +++ b/arch/um/kernel/signal.c @@ -22,7 +22,7 @@ EXPORT_SYMBOL(unblock_signals); /* * OK, we're invoking a handler */ -static int handle_signal(struct pt_regs *regs, unsigned long signr, +static void handle_signal(struct pt_regs *regs, unsigned long signr, struct k_sigaction *ka, siginfo_t *info) { sigset_t *oldset = sigmask_to_save(); @@ -66,8 +66,6 @@ static int handle_signal(struct pt_regs *regs, unsigned long signr, force_sigsegv(signr, current); else block_sigmask(ka, signr); - - return err; } static int kern_do_signal(struct pt_regs *regs) @@ -79,17 +77,7 @@ static int kern_do_signal(struct pt_regs *regs) while ((sig = get_signal_to_deliver(&info, &ka_copy, regs, NULL)) > 0) { handled_sig = 1; /* Whee! Actually deliver the signal. */ - if (!handle_signal(regs, sig, &ka_copy, &info)) { - /* - * a signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag - */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - break; - } + handle_signal(regs, sig, &ka_copy, &info); } /* Did we come from a system call? 
*/ diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c index bf23194dc74d..dc41b11f8a57 100644 --- a/arch/unicore32/kernel/signal.c +++ b/arch/unicore32/kernel/signal.c @@ -312,7 +312,7 @@ static inline void setup_syscall_restart(struct pt_regs *regs) /* * OK, we're invoking a handler */ -static int handle_signal(unsigned long sig, struct k_sigaction *ka, +static void handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, struct pt_regs *regs, int syscall) { struct thread_info *thread = current_thread_info(); @@ -363,15 +363,13 @@ static int handle_signal(unsigned long sig, struct k_sigaction *ka, if (ret != 0) { force_sigsegv(sig, tsk); - return ret; + return; } /* * Block the signal if we were successful. */ block_sigmask(ka, sig); - - return 0; } /* @@ -403,17 +401,7 @@ static void do_signal(struct pt_regs *regs, int syscall) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - if (handle_signal(signr, &ka, &info, regs, syscall) - == 0) { - /* - * A signal was successfully delivered; the saved - * sigmask will have been stored in the signal frame, - * and will be restored by sigreturn, so we can simply - * clear the TIF_RESTORE_SIGMASK flag. 
- */ - if (test_thread_flag(TIF_RESTORE_SIGMASK)) - clear_thread_flag(TIF_RESTORE_SIGMASK); - } + handle_signal(signr, &ka, &info, regs, syscall); return; } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 56f3062c5111..700c49dcd84e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -648,38 +648,27 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, { int usig = signr_convert(sig); sigset_t *set = sigmask_to_save(); - int ret; /* Set up the stack frame */ if (is_ia32) { if (ka->sa.sa_flags & SA_SIGINFO) - ret = ia32_setup_rt_frame(usig, ka, info, set, regs); + return ia32_setup_rt_frame(usig, ka, info, set, regs); else - ret = ia32_setup_frame(usig, ka, set, regs); + return ia32_setup_frame(usig, ka, set, regs); #ifdef CONFIG_X86_X32_ABI } else if (is_x32) { - ret = x32_setup_rt_frame(usig, ka, info, + return x32_setup_rt_frame(usig, ka, info, (compat_sigset_t *)set, regs); #endif } else { - ret = __setup_rt_frame(sig, ka, info, set, regs); + return __setup_rt_frame(sig, ka, info, set, regs); } - - if (ret) { - force_sigsegv(sig, current); - return -EFAULT; - } - - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; - return ret; } -static int +static void handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { - int ret; - /* Are we from a system call? */ if (syscall_get_nr(current, regs) >= 0) { /* If so, check system call restarting.. */ @@ -710,10 +699,10 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, likely(test_and_clear_thread_flag(TIF_FORCED_TF))) regs->flags &= ~X86_EFLAGS_TF; - ret = setup_rt_frame(sig, ka, info, regs); - - if (ret) - return ret; + if (setup_rt_frame(sig, ka, info, regs) < 0) { + force_sigsegv(sig, current); + return; + } /* * Clear the direction flag as per the ABI for function entry. 
@@ -732,8 +721,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, tracehook_signal_handler(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); - - return 0; } #ifdef CONFIG_X86_32 diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index e4b06e2d4eb9..3e83913a3c7c 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -499,7 +499,6 @@ static void do_signal(struct pt_regs *regs) if (ret) return; - clear_thread_flag(TIF_RESTORE_SIGMASK); block_sigmask(&ka, signr); if (current->ptrace & PT_SINGLESTEP) task_pt_regs(current)->icountlevel = 1; diff --git a/kernel/signal.c b/kernel/signal.c index 95a9d9d8122b..b9be7e0fe41a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2382,6 +2382,12 @@ void block_sigmask(struct k_sigaction *ka, int signr) { sigset_t blocked; + /* A signal was successfully delivered, and the + saved sigmask was stored on the signal frame, + and will be restored by sigreturn. So we can + simply clear the restore sigmask flag. 
*/ + clear_restore_sigmask(); + sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); if (!(ka->sa.sa_flags & SA_NODEFER)) sigaddset(&blocked, signr); -- cgit v1.2.3 From edd63a2763bdae0daa4f0a4d4c5d61d1154352a5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 27 Apr 2012 13:42:45 -0400 Subject: set_restore_sigmask() is never called without SIGPENDING (and never should be) Signed-off-by: Al Viro --- arch/ia64/include/asm/thread_info.h | 2 +- arch/microblaze/include/asm/thread_info.h | 2 +- arch/powerpc/include/asm/thread_info.h | 2 +- arch/sh/include/asm/thread_info.h | 2 +- arch/sparc/include/asm/thread_info_64.h | 2 +- arch/tile/include/asm/thread_info.h | 2 +- arch/x86/include/asm/thread_info.h | 2 +- include/linux/thread_info.h | 3 ++- 8 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index 8d600363fa57..f7ee85378311 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -141,7 +141,7 @@ static inline void set_restore_sigmask(void) { struct thread_info *ti = current_thread_info(); ti->status |= TS_RESTORE_SIGMASK; - set_bit(TIF_SIGPENDING, &ti->flags); + WARN_ON(!test_bit(TIF_SIGPENDING, &ti->flags)); } static inline void clear_restore_sigmask(void) { diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h index 12e39206b3ef..6c610234ffab 100644 --- a/arch/microblaze/include/asm/thread_info.h +++ b/arch/microblaze/include/asm/thread_info.h @@ -166,7 +166,7 @@ static inline void set_restore_sigmask(void) { struct thread_info *ti = current_thread_info(); ti->status |= TS_RESTORE_SIGMASK; - set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags); + WARN_ON(!test_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags)); } static inline void clear_restore_sigmask(void) { diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index
85d50a93a92f..68831e9cf82f 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -140,7 +140,7 @@ static inline void set_restore_sigmask(void) { struct thread_info *ti = current_thread_info(); ti->local_flags |= _TLF_RESTORE_SIGMASK; - set_bit(TIF_SIGPENDING, &ti->flags); + WARN_ON(!test_bit(TIF_SIGPENDING, &ti->flags)); } static inline void clear_restore_sigmask(void) { diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h index a109157c6b8f..bc13b57cdc83 100644 --- a/arch/sh/include/asm/thread_info.h +++ b/arch/sh/include/asm/thread_info.h @@ -169,7 +169,7 @@ static inline void set_restore_sigmask(void) { struct thread_info *ti = current_thread_info(); ti->status |= TS_RESTORE_SIGMASK; - set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags); + WARN_ON(!test_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags)); } #define TI_FLAG_FAULT_CODE_SHIFT 24 diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index cb9b7a9f5fc1..cfa8c38fb9c8 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -238,7 +238,7 @@ static inline void set_restore_sigmask(void) { struct thread_info *ti = current_thread_info(); ti->status |= TS_RESTORE_SIGMASK; - set_bit(TIF_SIGPENDING, &ti->flags); + WARN_ON(!test_bit(TIF_SIGPENDING, &ti->flags)); } static inline void clear_restore_sigmask(void) { diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h index 5aef371921e4..7e1fef36bde6 100644 --- a/arch/tile/include/asm/thread_info.h +++ b/arch/tile/include/asm/thread_info.h @@ -166,7 +166,7 @@ static inline void set_restore_sigmask(void) { struct thread_info *ti = current_thread_info(); ti->status |= TS_RESTORE_SIGMASK; - set_bit(TIF_SIGPENDING, &ti->flags); + WARN_ON(!test_bit(TIF_SIGPENDING, &ti->flags)); } static inline void clear_restore_sigmask(void) { diff --git a/arch/x86/include/asm/thread_info.h 
b/arch/x86/include/asm/thread_info.h index 8f3f1ff69fa9..89f794f007ec 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -248,7 +248,7 @@ static inline void set_restore_sigmask(void) { struct thread_info *ti = current_thread_info(); ti->status |= TS_RESTORE_SIGMASK; - set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags); + WARN_ON(!test_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags)); } static inline void clear_restore_sigmask(void) { diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index ed279701ac79..ccc1899bd62e 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -8,6 +8,7 @@ #define _LINUX_THREAD_INFO_H #include <linux/types.h> +#include <linux/bug.h> struct timespec; struct compat_timespec; @@ -125,7 +126,7 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag) static inline void set_restore_sigmask(void) { set_thread_flag(TIF_RESTORE_SIGMASK); - set_thread_flag(TIF_SIGPENDING); + WARN_ON(!test_thread_flag(TIF_SIGPENDING)); } static inline void clear_restore_sigmask(void) { -- cgit v1.2.3 From 77097ae503b170120ab66dd1d547f8577193f91f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 27 Apr 2012 13:58:59 -0400 Subject: most of set_current_blocked() callers want SIGKILL/SIGSTOP removed from set Only 3 out of 63 do not. Renamed the current variant to __set_current_blocked(), added set_current_blocked() that will exclude unblockable signals, switched open-coded instances to it.
Signed-off-by: Al Viro --- arch/alpha/kernel/signal.c | 2 -- arch/arm/kernel/signal.c | 6 +----- arch/avr32/kernel/signal.c | 3 --- arch/blackfin/kernel/signal.c | 3 --- arch/c6x/kernel/signal.c | 3 --- arch/cris/arch-v10/kernel/signal.c | 4 ---- arch/cris/arch-v32/kernel/signal.c | 5 ----- arch/frv/kernel/signal.c | 4 ---- arch/h8300/kernel/signal.c | 4 ---- arch/hexagon/kernel/signal.c | 3 --- arch/ia64/kernel/signal.c | 2 -- arch/m32r/kernel/signal.c | 3 --- arch/m68k/kernel/signal.c | 4 ---- arch/microblaze/kernel/signal.c | 3 --- arch/mips/kernel/signal-common.h | 2 -- arch/mips/kernel/signal.c | 2 -- arch/mips/kernel/signal32.c | 2 -- arch/mips/kernel/signal_n32.c | 1 - arch/mn10300/kernel/signal.c | 4 ---- arch/openrisc/kernel/signal.c | 3 --- arch/parisc/kernel/signal.c | 4 ---- arch/parisc/kernel/signal32.c | 2 -- arch/powerpc/kernel/signal.c | 1 - arch/powerpc/kernel/signal.h | 2 -- arch/s390/kernel/compat_signal.c | 4 ---- arch/s390/kernel/signal.c | 5 ----- arch/score/kernel/signal.c | 3 --- arch/sh/kernel/signal_32.c | 4 ---- arch/sh/kernel/signal_64.c | 4 ---- arch/sparc/kernel/signal32.c | 4 ---- arch/sparc/kernel/signal_32.c | 4 ---- arch/sparc/kernel/signal_64.c | 4 ---- arch/tile/kernel/compat_signal.c | 3 --- arch/tile/kernel/signal.c | 3 --- arch/um/include/shared/frame_kern.h | 3 --- arch/um/kernel/signal.c | 4 ---- arch/unicore32/kernel/signal.c | 6 +----- arch/x86/ia32/ia32_signal.c | 2 -- arch/x86/include/asm/sighandling.h | 2 -- arch/x86/kernel/signal.c | 3 --- arch/x86/um/signal.c | 2 -- arch/xtensa/kernel/signal.c | 3 --- include/linux/sched.h | 2 +- include/linux/signal.h | 3 ++- kernel/signal.c | 18 ++++++++++++------ 45 files changed, 17 insertions(+), 141 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index bb45a8813393..48c4df2389ac 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -226,7 +226,6 @@ do_sigreturn(struct sigcontext __user *sc, struct 
pt_regs *regs, if (__get_user(set.sig[0], &sc->sc_mask)) goto give_sigsegv; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(sc, regs, sw)) @@ -261,7 +260,6 @@ do_rt_sigreturn(struct rt_sigframe __user *frame, struct pt_regs *regs, if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto give_sigsegv; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(&frame->uc.uc_mcontext, regs, sw)) diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 7f9abd75fc2e..c126eba8411d 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -22,8 +22,6 @@ #include "signal.h" -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* * For ARM syscalls, we encode the syscall number into the instruction. */ @@ -210,10 +208,8 @@ static int restore_sigframe(struct pt_regs *regs, struct sigframe __user *sf) int err; err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set)); - if (err == 0) { - sigdelsetmask(&set, ~_BLOCKABLE); + if (err == 0) set_current_blocked(&set); - } __get_user_error(regs->ARM_r0, &sf->uc.uc_mcontext.arm_r0, err); __get_user_error(regs->ARM_r1, &sf->uc.uc_mcontext.arm_r1, err); diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c index 3ac1a60f9eb6..e883fa5eb845 100644 --- a/arch/avr32/kernel/signal.c +++ b/arch/avr32/kernel/signal.c @@ -22,8 +22,6 @@ #include #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - asmlinkage int sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) { @@ -89,7 +87,6 @@ asmlinkage int sys_rt_sigreturn(struct pt_regs *regs) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c index b20d435d084a..463612643821 100644 --- 
a/arch/blackfin/kernel/signal.c +++ b/arch/blackfin/kernel/signal.c @@ -19,8 +19,6 @@ #include #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* Location of the trace bit in SYSCFG. */ #define TRACE_BITS 0x0001 @@ -98,7 +96,6 @@ asmlinkage int do_rt_sigreturn(unsigned long __unused) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (rt_restore_sigcontext(regs, &frame->uc.uc_mcontext, &r0)) diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c index d599a7fb5d24..eb1b3086ae00 100644 --- a/arch/c6x/kernel/signal.c +++ b/arch/c6x/kernel/signal.c @@ -20,8 +20,6 @@ #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* * Do a signal return, undo the signal stack. */ @@ -87,7 +85,6 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c index 46c8ca605e4d..cf6380cb9a57 100644 --- a/arch/cris/arch-v10/kernel/signal.c +++ b/arch/cris/arch-v10/kernel/signal.c @@ -31,8 +31,6 @@ #define DEBUG_SIG 0 -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* a syscall in Linux/CRIS is a break 13 instruction which is 2 bytes */ /* manipulate regs so that upon return, it will be re-executed */ @@ -176,7 +174,6 @@ asmlinkage int sys_sigreturn(long r10, long r11, long r12, long r13, long mof, sizeof(frame->extramask)))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->sc)) @@ -212,7 +209,6 @@ asmlinkage int sys_rt_sigreturn(long r10, long r11, long r12, long r13, if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); 
set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c index e0431328b7cd..07b81ee09f65 100644 --- a/arch/cris/arch-v32/kernel/signal.c +++ b/arch/cris/arch-v32/kernel/signal.c @@ -24,9 +24,6 @@ extern unsigned long cris_signal_return_page; -/* Flag to check if a signal is blockable. */ -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* * A syscall in CRIS is really a "break 13" instruction, which is 2 * bytes. The registers is manipulated so upon return the instruction @@ -167,7 +164,6 @@ sys_sigreturn(long r10, long r11, long r12, long r13, long mof, long srp, sizeof(frame->extramask)))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->sc)) @@ -208,7 +204,6 @@ sys_rt_sigreturn(long r10, long r11, long r12, long r13, long mof, long srp, if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c index 9ec3d2e27b4c..511285fa2461 100644 --- a/arch/frv/kernel/signal.c +++ b/arch/frv/kernel/signal.c @@ -28,8 +28,6 @@ #define DEBUG_SIG 0 -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - struct fdpic_func_descriptor { unsigned long text; unsigned long GOT; @@ -149,7 +147,6 @@ asmlinkage int sys_sigreturn(void) __copy_from_user(&set.sig[1], &frame->extramask, sizeof(frame->extramask))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(&frame->sc, &gr8)) @@ -172,7 +169,6 @@ asmlinkage int sys_rt_sigreturn(void) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(&frame->uc.uc_mcontext, &gr8)) diff --git 
a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index 8fbfc39574f5..aa6f09666915 100644 --- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -47,8 +47,6 @@ #include #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* * Atomically swap in the new signal mask, and wait for a signal. */ @@ -186,7 +184,6 @@ asmlinkage int do_sigreturn(unsigned long __unused,...) sizeof(frame->extramask)))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->sc, &er0)) @@ -211,7 +208,6 @@ asmlinkage int do_rt_sigreturn(unsigned long __unused,...) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &er0)) diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c index c9caf7401191..439f11a3a8ef 100644 --- a/arch/hexagon/kernel/signal.c +++ b/arch/hexagon/kernel/signal.c @@ -31,8 +31,6 @@ #include #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - struct rt_sigframe { unsigned long tramp[2]; struct siginfo info; @@ -273,7 +271,6 @@ asmlinkage int sys_rt_sigreturn(void) if (__copy_from_user(&blocked, &frame->uc.uc_sigmask, sizeof(blocked))) goto badframe; - sigdelsetmask(&blocked, ~_BLOCKABLE); set_current_blocked(&blocked); if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index dc6fe6573465..c4041c76c07d 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -30,7 +30,6 @@ #define DEBUG_SIG 0 #define STACK_ALIGN 16 /* minimal alignment for stack pointer */ -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) #if _NSIG_WORDS > 1 # define PUT_SIGSET(k,u) __copy_to_user((u)->sig, (k)->sig, sizeof(sigset_t)) @@ -200,7 +199,6 @@ ia64_rt_sigreturn (struct sigscratch *scr) if (GET_SIGSET(&set, &sc->sc_mask)) goto 
give_sigsegv; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(sc, scr)) diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c index 7cbfa639fbfa..07f9032576c0 100644 --- a/arch/m32r/kernel/signal.c +++ b/arch/m32r/kernel/signal.c @@ -28,8 +28,6 @@ #define DEBUG_SIG 0 -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - asmlinkage int sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, unsigned long r2, unsigned long r3, unsigned long r4, @@ -111,7 +109,6 @@ sys_rt_sigreturn(unsigned long r0, unsigned long r1, if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &result)) diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 6dbee8a167a5..c00caad215a6 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -51,8 +51,6 @@ #include #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - #ifdef CONFIG_MMU /* @@ -795,7 +793,6 @@ asmlinkage int do_sigreturn(unsigned long __unused) sizeof(frame->extramask)))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->sc, frame + 1)) @@ -820,7 +817,6 @@ asmlinkage int do_rt_sigreturn(unsigned long __unused) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (rt_restore_ucontext(regs, sw, &frame->uc)) diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c index 03641199666e..c662e68671a2 100644 --- a/arch/microblaze/kernel/signal.c +++ b/arch/microblaze/kernel/signal.c @@ -41,8 +41,6 @@ #include #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - asmlinkage long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) @@ -106,7 +104,6 @@ 
asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &rval)) diff --git a/arch/mips/kernel/signal-common.h b/arch/mips/kernel/signal-common.h index 10263b405981..9c60d09e62a7 100644 --- a/arch/mips/kernel/signal-common.h +++ b/arch/mips/kernel/signal-common.h @@ -19,8 +19,6 @@ # define DEBUGP(fmt, args...) #endif -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* * Determine which stack to use.. */ diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index 896165757e6f..02e0cba24f82 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -339,7 +339,6 @@ asmlinkage void sys_sigreturn(nabi_no_regargs struct pt_regs regs) if (__copy_from_user(&blocked, &frame->sf_mask, sizeof(blocked))) goto badframe; - sigdelsetmask(&blocked, ~_BLOCKABLE); set_current_blocked(&blocked); sig = restore_sigcontext(&regs, &frame->sf_sc); @@ -375,7 +374,6 @@ asmlinkage void sys_rt_sigreturn(nabi_no_regargs struct pt_regs regs) if (__copy_from_user(&set, &frame->rs_uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); sig = restore_sigcontext(&regs, &frame->rs_uc.uc_mcontext); diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index b4fe2eacbd5d..da1b56a39ac7 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -465,7 +465,6 @@ asmlinkage void sys32_sigreturn(nabi_no_regargs struct pt_regs regs) if (__copy_conv_sigset_from_user(&blocked, &frame->sf_mask)) goto badframe; - sigdelsetmask(&blocked, ~_BLOCKABLE); set_current_blocked(&blocked); sig = restore_sigcontext32(&regs, &frame->sf_sc); @@ -503,7 +502,6 @@ asmlinkage void sys32_rt_sigreturn(nabi_no_regargs struct pt_regs regs) if (__copy_conv_sigset_from_user(&set, &frame->rs_uc.uc_sigmask)) goto badframe; -
sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); sig = restore_sigcontext32(&regs, &frame->rs_uc.uc_mcontext); diff --git a/arch/mips/kernel/signal_n32.c b/arch/mips/kernel/signal_n32.c index 63ffac9af7c5..3574c145511b 100644 --- a/arch/mips/kernel/signal_n32.c +++ b/arch/mips/kernel/signal_n32.c @@ -109,7 +109,6 @@ asmlinkage void sysn32_rt_sigreturn(nabi_no_regargs struct pt_regs regs) if (__copy_conv_sigset_from_user(&set, &frame->rs_uc.uc_sigmask)) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); sig = restore_sigcontext(&regs, &frame->rs_uc.uc_mcontext); diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c index d57013e06ea0..4f6d20763061 100644 --- a/arch/mn10300/kernel/signal.c +++ b/arch/mn10300/kernel/signal.c @@ -31,8 +31,6 @@ #define DEBUG_SIG 0 -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* * atomically swap in the new signal mask, and wait for a signal. */ @@ -163,7 +161,6 @@ asmlinkage long sys_sigreturn(void) sizeof(frame->extramask))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(current_frame(), &frame->sc, &d0)) @@ -191,7 +188,6 @@ asmlinkage long sys_rt_sigreturn(void) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(current_frame(), &frame->uc.uc_mcontext, &d0)) diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c index aa1105c1618f..53972b7260b7 100644 --- a/arch/openrisc/kernel/signal.c +++ b/arch/openrisc/kernel/signal.c @@ -33,8 +33,6 @@ #define DEBUG_SIG 0 -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - asmlinkage long _sys_sigaltstack(const stack_t *uss, stack_t *uoss, struct pt_regs *regs) { @@ -101,7 +99,6 @@ asmlinkage long _sys_rt_sigreturn(struct pt_regs *regs) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; -
sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index 7f3c8f2c962d..25161eaf720d 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -48,9 +48,6 @@ #define DBG(LEVEL, ...) #endif - -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* gcc will complain if a pointer is cast to an integer of different * size. If you really need to do this (and we do for an ELF32 user * application in an ELF64 kernel) then you have to do a cast to an @@ -131,7 +128,6 @@ sys_rt_sigreturn(struct pt_regs *regs, int in_syscall) goto give_sigsegv; } - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); /* Good thing we saved the old gr[30], eh? */ diff --git a/arch/parisc/kernel/signal32.c b/arch/parisc/kernel/signal32.c index e14132430762..fd49aeda9eb8 100644 --- a/arch/parisc/kernel/signal32.c +++ b/arch/parisc/kernel/signal32.c @@ -47,8 +47,6 @@ #define DBG(LEVEL, ...) 
#endif -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - inline void sigset_32to64(sigset_t *s64, compat_sigset_t *s32) { diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index d926d2e4611a..3a3413c049c3 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -57,7 +57,6 @@ void __user * get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, */ void restore_sigmask(sigset_t *set) { - sigdelsetmask(set, ~_BLOCKABLE); set_current_blocked(set); } diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h index 8dde973aaaf5..11439ea18ed4 100644 --- a/arch/powerpc/kernel/signal.h +++ b/arch/powerpc/kernel/signal.h @@ -10,8 +10,6 @@ #ifndef _POWERPC_ARCH_SIGNAL_H #define _POWERPC_ARCH_SIGNAL_H -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - extern void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags); extern void __user * get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 233db1d68eee..923baa96c0b0 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -32,8 +32,6 @@ #include "compat_ptrace.h" #include "entry.h" -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - typedef struct { __u8 callee_used_stack[__SIGNAL_FRAMESIZE32]; @@ -364,7 +362,6 @@ asmlinkage long sys32_sigreturn(void) goto badframe; if (__copy_from_user(&set.sig, &frame->sc.oldmask, _SIGMASK_COPY_SIZE32)) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigregs32(regs, &frame->sregs)) goto badframe; @@ -390,7 +387,6 @@ asmlinkage long sys32_rt_sigreturn(void) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigregs32(regs, &frame->uc.uc_mcontext)) goto badframe; diff --git 
a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 7f9a862a161a..8332a6943384 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -33,9 +33,6 @@ #include #include "entry.h" -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - - typedef struct { __u8 callee_used_stack[__SIGNAL_FRAMESIZE]; @@ -169,7 +166,6 @@ SYSCALL_DEFINE0(sigreturn) goto badframe; if (__copy_from_user(&set.sig, &frame->sc.oldmask, _SIGMASK_COPY_SIZE)) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigregs(regs, &frame->sregs)) goto badframe; @@ -189,7 +185,6 @@ SYSCALL_DEFINE0(rt_sigreturn) goto badframe; if (__copy_from_user(&set.sig, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigregs(regs, &frame->uc.uc_mcontext)) goto badframe; diff --git a/arch/score/kernel/signal.c b/arch/score/kernel/signal.c index 13e0eed0e301..f1b3fef0907b 100644 --- a/arch/score/kernel/signal.c +++ b/arch/score/kernel/signal.c @@ -34,8 +34,6 @@ #include #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - struct rt_sigframe { u32 rs_ass[4]; /* argument save space */ u32 rs_code[2]; /* signal trampoline */ @@ -162,7 +160,6 @@ score_rt_sigreturn(struct pt_regs *regs) if (__copy_from_user(&set, &frame->rs_uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); sig = restore_sigcontext(regs, &frame->rs_uc.uc_mcontext); diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c index 2675a97f374f..e4a531414e19 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -32,8 +32,6 @@ #include #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - struct fdpic_func_descriptor { unsigned long text; unsigned long GOT; @@ -226,7 +224,6 @@ asmlinkage int sys_sigreturn(unsigned long r4, unsigned long r5, sizeof(frame->extramask)))) goto badframe; - 
sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->sc, &r0)) @@ -256,7 +253,6 @@ asmlinkage int sys_rt_sigreturn(unsigned long r4, unsigned long r5, if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &r0)) diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c index 7075c63bfc6f..75960ef6c1d1 100644 --- a/arch/sh/kernel/signal_64.c +++ b/arch/sh/kernel/signal_64.c @@ -41,8 +41,6 @@ #define DEBUG_SIG 0 -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - static void handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs * regs); @@ -330,7 +328,6 @@ asmlinkage int sys_sigreturn(unsigned long r2, unsigned long r3, sizeof(frame->extramask)))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->sc, &ret)) @@ -363,7 +360,6 @@ asmlinkage int sys_rt_sigreturn(unsigned long r2, unsigned long r3, if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ret)) diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index 8c93c00922a7..ba3dbfcdb28e 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -32,8 +32,6 @@ #include "sigutil.h" -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* This magic should be in g_upper[0] for all upper parts * to be valid. 
*/ @@ -274,7 +272,6 @@ void do_sigreturn32(struct pt_regs *regs) case 2: set.sig[1] = seta[2] + (((long)seta[3]) << 32); case 1: set.sig[0] = seta[0] + (((long)seta[1]) << 32); } - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); return; @@ -376,7 +373,6 @@ asmlinkage void do_rt_sigreturn32(struct pt_regs *regs) case 2: set.sig[1] = seta.sig[2] + (((long)seta.sig[3]) << 32); case 1: set.sig[0] = seta.sig[0] + (((long)seta.sig[1]) << 32); } - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); return; segv: diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index f6722427203d..1bfa854be602 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -29,8 +29,6 @@ #include "sigutil.h" -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - extern void fpsave(unsigned long *fpregs, unsigned long *fsr, void *fpqueue, unsigned long *fpqdepth); extern void fpload(unsigned long *fpregs, unsigned long *fsr); @@ -130,7 +128,6 @@ asmlinkage void do_sigreturn(struct pt_regs *regs) if (err) goto segv_and_exit; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); return; @@ -197,7 +194,6 @@ asmlinkage void do_rt_sigreturn(struct pt_regs *regs) goto segv; } - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); return; segv: diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index febbc4b697ba..23b60caa6c43 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -38,8 +38,6 @@ #include "systbls.h" #include "sigutil.h" -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* {set, get}context() needed for 64-bit SparcLinux userland. 
*/ asmlinkage void sparc64_set_context(struct pt_regs *regs) { @@ -71,7 +69,6 @@ asmlinkage void sparc64_set_context(struct pt_regs *regs) if (__copy_from_user(&set, &ucp->uc_sigmask, sizeof(sigset_t))) goto do_sigsegv; } - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); } if (test_thread_flag(TIF_32BIT)) { @@ -315,7 +312,6 @@ void do_rt_sigreturn(struct pt_regs *regs) /* Prevent syscall restart. */ pt_regs_clear_syscall(regs); - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); return; segv: diff --git a/arch/tile/kernel/compat_signal.c b/arch/tile/kernel/compat_signal.c index cdef6e5ec022..474571b84085 100644 --- a/arch/tile/kernel/compat_signal.c +++ b/arch/tile/kernel/compat_signal.c @@ -118,8 +118,6 @@ struct compat_rt_sigframe { struct compat_ucontext uc; }; -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - long compat_sys_rt_sigaction(int sig, struct compat_sigaction __user *act, struct compat_sigaction __user *oact, size_t sigsetsize) @@ -302,7 +300,6 @@ long compat_sys_rt_sigreturn(struct pt_regs *regs) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c index 9b71bfd4913d..e068aa0c6dfc 100644 --- a/arch/tile/kernel/signal.c +++ b/arch/tile/kernel/signal.c @@ -37,8 +37,6 @@ #define DEBUG_SIG 0 -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - SYSCALL_DEFINE3(sigaltstack, const stack_t __user *, uss, stack_t __user *, uoss, struct pt_regs *, regs) { @@ -96,7 +94,6 @@ SYSCALL_DEFINE1(rt_sigreturn, struct pt_regs *, regs) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) diff --git a/arch/um/include/shared/frame_kern.h b/arch/um/include/shared/frame_kern.h 
index 76078490c258..e584e40ee832 100644 --- a/arch/um/include/shared/frame_kern.h +++ b/arch/um/include/shared/frame_kern.h @@ -6,9 +6,6 @@ #ifndef __FRAME_KERN_H_ #define __FRAME_KERN_H_ -#define _S(nr) (1<<((nr)-1)) -#define _BLOCKABLE (~(_S(SIGKILL) | _S(SIGSTOP))) - extern int setup_signal_stack_sc(unsigned long stack_top, int sig, struct k_sigaction *ka, struct pt_regs *regs, diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c index 549a51c8e54f..4ce6ab2d2996 100644 --- a/arch/um/kernel/signal.c +++ b/arch/um/kernel/signal.c @@ -15,10 +15,6 @@ EXPORT_SYMBOL(block_signals); EXPORT_SYMBOL(unblock_signals); -#define _S(nr) (1<<((nr)-1)) - -#define _BLOCKABLE (~(_S(SIGKILL) | _S(SIGSTOP))) - /* * OK, we're invoking a handler */ diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c index af962e57efb2..4d9c4841989d 100644 --- a/arch/unicore32/kernel/signal.c +++ b/arch/unicore32/kernel/signal.c @@ -21,8 +21,6 @@ #include #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - /* * For UniCore syscalls, we encode the syscall number into the instruction. 
*/ @@ -61,10 +59,8 @@ static int restore_sigframe(struct pt_regs *regs, struct sigframe __user *sf) int err; err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set)); - if (err == 0) { - sigdelsetmask(&set, ~_BLOCKABLE); + if (err == 0) set_current_blocked(&set); - } err |= __get_user(regs->UCreg_00, &sf->uc.uc_mcontext.regs.UCreg_00); err |= __get_user(regs->UCreg_01, &sf->uc.uc_mcontext.regs.UCreg_01); diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 98bd70faccc5..daeca56211e3 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -273,7 +273,6 @@ asmlinkage long sys32_sigreturn(struct pt_regs *regs) sizeof(frame->extramask)))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (ia32_restore_sigcontext(regs, &frame->sc, &ax)) @@ -299,7 +298,6 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index ada93b3b8c66..beff97f7df37 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -7,8 +7,6 @@ #include -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - #define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \ X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \ X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 700c49dcd84e..11e206f0f45a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -555,7 +555,6 @@ unsigned long sys_sigreturn(struct pt_regs *regs) sizeof(frame->extramask)))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->sc, &ax)) @@ -581,7 +580,6 @@ long sys_rt_sigreturn(struct 
pt_regs *regs) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) @@ -915,7 +913,6 @@ asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index bb0fb03b9f85..a508cea13503 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -486,7 +486,6 @@ long sys_sigreturn(struct pt_regs *regs) copy_from_user(&set.sig[1], extramask, sig_size)) goto segfault; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (copy_sc_from_user(&current->thread.regs, sc)) @@ -600,7 +599,6 @@ long sys_rt_sigreturn(struct pt_regs *regs) if (copy_from_user(&set, &uc->uc_sigmask, sizeof(set))) goto segfault; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (copy_sc_from_user(&current->thread.regs, &uc->uc_mcontext)) diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index ca98b86ef9a7..4da3c6f6d929 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -30,8 +30,6 @@ #define DEBUG_SIG 0 -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - extern struct task_struct *coproc_owners[]; struct rt_sigframe @@ -261,7 +259,6 @@ asmlinkage long xtensa_rt_sigreturn(long a0, long a1, long a2, long a3, if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; - sigdelsetmask(&set, ~_BLOCKABLE); set_current_blocked(&set); if (restore_sigcontext(regs, frame)) diff --git a/include/linux/sched.h b/include/linux/sched.h index ded3fb63fb06..f34437e835a7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2210,7 +2210,7 @@ extern int do_sigaltstack(const stack_t __user *, stack_t __user *,
unsigned lon static inline void restore_saved_sigmask(void) { if (test_and_clear_restore_sigmask()) - set_current_blocked(&current->saved_sigmask); + __set_current_blocked(&current->saved_sigmask); } static inline sigset_t *sigmask_to_save(void) diff --git a/include/linux/signal.h b/include/linux/signal.h index 17046cc484bc..065e76330398 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -250,7 +250,8 @@ extern long do_sigpending(void __user *, unsigned long); extern int do_sigtimedwait(const sigset_t *, siginfo_t *, const struct timespec *); extern int sigprocmask(int, sigset_t *, sigset_t *); -extern void set_current_blocked(const sigset_t *); +extern void set_current_blocked(sigset_t *); +extern void __set_current_blocked(const sigset_t *); extern int show_unhandled_signals; extern int sigsuspend(sigset_t *); diff --git a/kernel/signal.c b/kernel/signal.c index b9be7e0fe41a..df8d721a9e6f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2524,7 +2524,16 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) * It is wrong to change ->blocked directly, this helper should be used * to ensure the process can't miss a shared signal we are going to block. 
*/ -void set_current_blocked(const sigset_t *newset) +void set_current_blocked(sigset_t *newset) +{ + struct task_struct *tsk = current; + sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP)); + spin_lock_irq(&tsk->sighand->siglock); + __set_task_blocked(tsk, newset); + spin_unlock_irq(&tsk->sighand->siglock); +} + +void __set_current_blocked(const sigset_t *newset) { struct task_struct *tsk = current; @@ -2564,7 +2573,7 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) return -EINVAL; } - set_current_blocked(&newset); + __set_current_blocked(&newset); return 0; } @@ -3138,7 +3147,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, return -EINVAL; } - set_current_blocked(&new_blocked); + __set_current_blocked(&new_blocked); } if (oset) { @@ -3202,7 +3211,6 @@ SYSCALL_DEFINE1(ssetmask, int, newmask) int old = current->blocked.sig[0]; sigset_t newset; - siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP))); set_current_blocked(&newset); return old; @@ -3243,8 +3251,6 @@ SYSCALL_DEFINE0(pause) int sigsuspend(sigset_t *set) { - sigdelsetmask(set, sigmask(SIGKILL)|sigmask(SIGSTOP)); - current->saved_sigmask = current->blocked; set_current_blocked(set); -- cgit v1.2.3 From efee984c27b67e3ebef40410f35671997441b57c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 28 Apr 2012 02:04:15 -0400 Subject: new helper: signal_delivered() Does block_sigmask() + tracehook_signal_handler(); called when sigframe has been successfully built. All architectures converted to it; block_sigmask() itself is gone now (merged into this one). I'm still not too happy with the signature, but that's a separate story (IMO we need a structure that would contain signal number + siginfo + k_sigaction, so that get_signal_to_deliver() would fill one, signal_delivered(), handle_signal() and probably setup...frame() - take one). 
Signed-off-by: Al Viro --- arch/alpha/kernel/signal.c | 2 +- arch/arm/kernel/signal.c | 7 +------ arch/avr32/kernel/signal.c | 2 +- arch/blackfin/kernel/signal.c | 3 +-- arch/c6x/kernel/signal.c | 3 +-- arch/cris/arch-v10/kernel/signal.c | 2 +- arch/cris/arch-v32/kernel/signal.c | 2 +- arch/frv/kernel/signal.c | 3 +-- arch/h8300/kernel/signal.c | 2 +- arch/hexagon/kernel/signal.c | 3 +-- arch/ia64/kernel/signal.c | 7 +------ arch/m32r/kernel/signal.c | 2 +- arch/m68k/kernel/signal.c | 2 +- arch/microblaze/kernel/signal.c | 2 +- arch/mips/kernel/signal.c | 2 +- arch/mn10300/kernel/signal.c | 3 +-- arch/openrisc/kernel/signal.c | 3 +-- arch/parisc/kernel/signal.c | 4 +--- arch/powerpc/kernel/signal.c | 6 +----- arch/s390/kernel/compat_signal.c | 6 +----- arch/s390/kernel/signal.c | 6 +----- arch/score/kernel/signal.c | 2 +- arch/sh/kernel/signal_32.c | 3 +-- arch/sh/kernel/signal_64.c | 3 +-- arch/sparc/kernel/signal32.c | 3 +-- arch/sparc/kernel/signal_32.c | 3 +-- arch/sparc/kernel/signal_64.c | 3 +-- arch/tile/kernel/signal.c | 2 +- arch/um/kernel/signal.c | 2 +- arch/unicore32/kernel/signal.c | 5 +---- arch/x86/kernel/signal.c | 6 ++---- arch/xtensa/kernel/signal.c | 2 +- include/linux/signal.h | 2 +- kernel/signal.c | 22 +++++++++++++--------- 34 files changed, 47 insertions(+), 83 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index 48c4df2389ac..a8c97d42ec8e 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -478,7 +478,7 @@ handle_signal(int sig, struct k_sigaction *ka, siginfo_t *info, force_sigsegv(sig, current); return; } - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); } static inline void diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index c126eba8411d..fd2392a17ac1 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -557,12 +557,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, force_sigsegv(sig, 
tsk); return; } - - /* - * Block the signal if we were successful. - */ - block_sigmask(ka, sig); - tracehook_signal_handler(sig, info, ka, regs, 0); + signal_delivered(sig, info, ka, regs, 0); } /* diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c index e883fa5eb845..c140f9b41dce 100644 --- a/arch/avr32/kernel/signal.c +++ b/arch/avr32/kernel/signal.c @@ -241,7 +241,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, if (ret != 0) force_sigsegv(sig, current); else - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); } /* diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c index 463612643821..35459e681483 100644 --- a/arch/blackfin/kernel/signal.c +++ b/arch/blackfin/kernel/signal.c @@ -260,8 +260,7 @@ handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka, if (ret) return; - block_sigmask(ka, sig); - tracehook_signal_handler(sig, info, ka, regs, + signal_delivered(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); } diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c index eb1b3086ae00..3d8f3c22a94f 100644 --- a/arch/c6x/kernel/signal.c +++ b/arch/c6x/kernel/signal.c @@ -276,8 +276,7 @@ static void handle_signal(int sig, /* Set up the stack frame */ if (setup_rt_frame(sig, ka, info, sigmask_to_save(), regs) < 0) return; - block_sigmask(ka, sig); - tracehook_signal_handler(sig, info, ka, regs, 0); + signal_delivered(sig, info, ka, regs, 0); } /* diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c index cf6380cb9a57..0bb477c13a4e 100644 --- a/arch/cris/arch-v10/kernel/signal.c +++ b/arch/cris/arch-v10/kernel/signal.c @@ -453,7 +453,7 @@ static inline void handle_signal(int canrestart, unsigned long sig, ret = setup_frame(sig, ka, oldset, regs); if (ret == 0) - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); } /* diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c index 
07b81ee09f65..b60d1b65a426 100644 --- a/arch/cris/arch-v32/kernel/signal.c +++ b/arch/cris/arch-v32/kernel/signal.c @@ -485,7 +485,7 @@ handle_signal(int canrestart, unsigned long sig, ret = setup_frame(sig, ka, oldset, regs); if (ret == 0) - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); } /* diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c index 511285fa2461..4e134c7eceea 100644 --- a/arch/frv/kernel/signal.c +++ b/arch/frv/kernel/signal.c @@ -460,8 +460,7 @@ static void handle_signal(unsigned long sig, siginfo_t *info, if (ret) return; - block_sigmask(ka, sig); - tracehook_signal_handler(sig, info, ka, __frame, + signal_delivered(sig, info, ka, __frame, test_thread_flag(TIF_SINGLESTEP)); } /* end handle_signal() */ diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index aa6f09666915..fca10378701b 100644 --- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -439,7 +439,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, ret = setup_frame(sig, ka, oldset, regs); if (!ret) - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); } /* diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c index 439f11a3a8ef..304b0808d072 100644 --- a/arch/hexagon/kernel/signal.c +++ b/arch/hexagon/kernel/signal.c @@ -186,8 +186,7 @@ static void handle_signal(int sig, siginfo_t *info, struct k_sigaction *ka, if (setup_rt_frame(sig, ka, info, sigmask_to_save(), regs) < 0) return; - block_sigmask(ka, sig); - tracehook_signal_handler(sig, info, ka, regs, + signal_delivered(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); } diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index c4041c76c07d..a199be1fe619 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -419,12 +419,7 @@ handle_signal (unsigned long sig, struct k_sigaction *ka, siginfo_t *info, if (!setup_frame(sig, ka, info, sigmask_to_save(), scr)) return 0; - 
block_sigmask(ka, sig); - - /* - * Let tracing know that we've done the handler setup. - */ - tracehook_signal_handler(sig, info, ka, &scr->pt, + signal_delivered(sig, info, ka, &scr->pt, test_thread_flag(TIF_SINGLESTEP)); return 1; diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c index 07f9032576c0..f3fb2c029cfc 100644 --- a/arch/m32r/kernel/signal.c +++ b/arch/m32r/kernel/signal.c @@ -294,7 +294,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, if (setup_rt_frame(sig, ka, info, sigmask_to_save(), regs)) return; - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); } /* diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index c00caad215a6..710a528b928b 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -1137,7 +1137,7 @@ handle_signal(int sig, struct k_sigaction *ka, siginfo_t *info, if (err) return; - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); if (test_thread_flag(TIF_DELAYED_TRACE)) { regs->sr &= ~0x8000; diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c index c662e68671a2..76b9722557db 100644 --- a/arch/microblaze/kernel/signal.c +++ b/arch/microblaze/kernel/signal.c @@ -323,7 +323,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, if (ret) return; - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); } /* diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index 02e0cba24f82..f2c09cfc60ac 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -551,7 +551,7 @@ static void handle_signal(unsigned long sig, siginfo_t *info, if (ret) return; - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); } static void do_signal(struct pt_regs *regs) diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c index 4f6d20763061..6ab0bee2a54f 100644 --- a/arch/mn10300/kernel/signal.c +++ b/arch/mn10300/kernel/signal.c @@ -461,8 +461,7 
@@ static int handle_signal(int sig, if (ret) return; - block_sigmask(ka, sig); - tracehook_signal_handler(sig, info, ka, regs, + signal_delivered(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); } diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c index 53972b7260b7..30110297f4f9 100644 --- a/arch/openrisc/kernel/signal.c +++ b/arch/openrisc/kernel/signal.c @@ -259,8 +259,7 @@ handle_signal(unsigned long sig, if (ret) return; - block_sigmask(ka, sig); - tracehook_signal_handler(sig, info, ka, regs, + signal_delivered(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); } diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index 25161eaf720d..594459bde14e 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -449,9 +449,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, if (!setup_rt_frame(sig, ka, info, oldset, regs, in_syscall)) return 0; - block_sigmask(ka, sig); - - tracehook_signal_handler(sig, info, ka, regs, + signal_delivered(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP) || test_thread_flag(TIF_BLOCKSTEP)); diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index 129bdffc6daf..5c023c9cf16e 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -148,11 +148,7 @@ static int do_signal(struct pt_regs *regs) regs->trap = 0; if (ret) { - block_sigmask(&ka, signr); - /* - * Let tracing know that we've done the handler setup. 
- */ - tracehook_signal_handler(signr, &info, &ka, regs, + signal_delivered(signr, &info, &ka, regs, test_thread_flag(TIF_SINGLESTEP)); } diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 923baa96c0b0..3c0c19830c37 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -580,11 +580,7 @@ void handle_signal32(unsigned long sig, struct k_sigaction *ka, ret = setup_frame32(sig, ka, oldset, regs); if (ret) return; - block_sigmask(ka, sig); - /* - * Let tracing know that we've done the handler setup. - */ - tracehook_signal_handler(sig, info, ka, regs, + signal_delivered(sig, info, ka, regs, test_thread_flag(TIF_SINGLE_STEP)); } diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 8332a6943384..ac565b44aabb 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -375,11 +375,7 @@ static void handle_signal(unsigned long sig, struct k_sigaction *ka, ret = setup_frame(sig, ka, oldset, regs); if (ret) return; - block_sigmask(ka, sig); - /* - * Let tracing know that we've done the handler setup. 
- */ - tracehook_signal_handler(sig, info, ka, regs, + signal_delivered(sig, info, ka, regs, test_thread_flag(TIF_SINGLE_STEP)); } diff --git a/arch/score/kernel/signal.c b/arch/score/kernel/signal.c index f1b3fef0907b..e382c52ca0d9 100644 --- a/arch/score/kernel/signal.c +++ b/arch/score/kernel/signal.c @@ -267,7 +267,7 @@ static void handle_signal(unsigned long sig, siginfo_t *info, if (setup_rt_frame(ka, regs, sig, sigmask_to_save(), info) < 0) return; - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); } static void do_signal(struct pt_regs *regs) diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c index e4a531414e19..d6b7b6154f87 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -533,8 +533,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, if (ret) return; - block_sigmask(ka, sig); - tracehook_signal_handler(sig, info, ka, regs, + signal_delivered(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); } diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c index 75960ef6c1d1..6b5b3dfe886b 100644 --- a/arch/sh/kernel/signal_64.c +++ b/arch/sh/kernel/signal_64.c @@ -650,8 +650,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, if (ret) return; - block_sigmask(ka, sig); - tracehook_signal_handler(sig, info, ka, regs, + signal_delivered(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); } diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index ba3dbfcdb28e..a53e0a5fd3a3 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -785,8 +785,7 @@ static inline void handle_signal32(unsigned long signr, struct k_sigaction *ka, if (err) return; - block_sigmask(ka, signr); - tracehook_signal_handler(signr, info, ka, regs, 0); + signal_delivered(signr, info, ka, regs, 0); } static inline void syscall_restart32(unsigned long orig_i0, struct pt_regs *regs, diff --git 
a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index 1bfa854be602..68f9c8650af4 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -460,8 +460,7 @@ handle_signal(unsigned long signr, struct k_sigaction *ka, if (err) return; - block_sigmask(ka, signr); - tracehook_signal_handler(signr, info, ka, regs, 0); + signal_delivered(signr, info, ka, regs, 0); } static inline void syscall_restart(unsigned long orig_i0, struct pt_regs *regs, diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index 23b60caa6c43..867de2f8189c 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -473,8 +473,7 @@ static inline void handle_signal(unsigned long signr, struct k_sigaction *ka, if (err) return; - block_sigmask(ka, signr); - tracehook_signal_handler(signr, info, ka, regs, 0); + signal_delivered(signr, info, ka, regs, 0); } static inline void syscall_restart(unsigned long orig_i0, struct pt_regs *regs, diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c index e068aa0c6dfc..e29b0553211d 100644 --- a/arch/tile/kernel/signal.c +++ b/arch/tile/kernel/signal.c @@ -278,7 +278,7 @@ static void handle_signal(unsigned long sig, siginfo_t *info, ret = setup_rt_frame(sig, ka, info, oldset, regs); if (ret) return; - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); } /* diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c index 4ce6ab2d2996..7362d58efc29 100644 --- a/arch/um/kernel/signal.c +++ b/arch/um/kernel/signal.c @@ -61,7 +61,7 @@ static void handle_signal(struct pt_regs *regs, unsigned long signr, if (err) force_sigsegv(signr, current); else - block_sigmask(ka, signr); + signal_delivered(signr, info, ka, regs, 0); } static int kern_do_signal(struct pt_regs *regs) diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c index 4d9c4841989d..8adedb37720a 100644 --- a/arch/unicore32/kernel/signal.c +++ b/arch/unicore32/kernel/signal.c 
@@ -362,10 +362,7 @@ static void handle_signal(unsigned long sig, struct k_sigaction *ka, return; } - /* - * Block the signal if we were successful. - */ - block_sigmask(ka, sig); + signal_delivered(sig, info, ka, regs, 0); } /* diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 11e206f0f45a..e8a89374d356 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -715,10 +715,8 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, */ regs->flags &= ~X86_EFLAGS_TF; - block_sigmask(ka, sig); - - tracehook_signal_handler(sig, info, ka, regs, - test_thread_flag(TIF_SINGLESTEP)); + signal_delivered(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); } #ifdef CONFIG_X86_32 diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index 4da3c6f6d929..b9f8e5850d3a 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -493,7 +493,7 @@ static void do_signal(struct pt_regs *regs) if (ret) return; - block_sigmask(&ka, signr); + signal_delivered(signr, info, ka, regs, 0); if (current->ptrace & PT_SINGLESTEP) task_pt_regs(current)->icountlevel = 1; diff --git a/include/linux/signal.h b/include/linux/signal.h index 065e76330398..26b424adc842 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -256,7 +256,7 @@ extern int show_unhandled_signals; extern int sigsuspend(sigset_t *); extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie); -extern void block_sigmask(struct k_sigaction *ka, int signr); +extern void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs, int stepping); extern void exit_signals(struct task_struct *tsk); extern struct kmem_cache *sighand_cachep; diff --git a/kernel/signal.c b/kernel/signal.c index df8d721a9e6f..677102789cf2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2368,17 +2368,20 @@ relock: } /** - * block_sigmask - add @ka's signal 
mask to current->blocked - * @ka: action for @signr - * @signr: signal that has been successfully delivered + * signal_delivered - + * @sig: number of signal being delivered + * @info: siginfo_t of signal being delivered + * @ka: sigaction setting that chose the handler + * @regs: user register state + * @stepping: nonzero if debugger single-step or block-step in use * * This function should be called when a signal has succesfully been - * delivered. It adds the mask of signals for @ka to current->blocked - * so that they are blocked during the execution of the signal - * handler. In addition, @signr will be blocked unless %SA_NODEFER is - * set in @ka->sa.sa_flags. + * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask + * is always blocked, and the signal itself is blocked unless %SA_NODEFER + * is set in @ka->sa.sa_flags. Tracing is notified. */ -void block_sigmask(struct k_sigaction *ka, int signr) +void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, + struct pt_regs *regs, int stepping) { sigset_t blocked; @@ -2390,8 +2393,9 @@ void block_sigmask(struct k_sigaction *ka, int signr) sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&blocked, signr); + sigaddset(&blocked, sig); set_current_blocked(&blocked); + tracehook_signal_handler(sig, info, ka, regs, stepping); } /* -- cgit v1.2.3 From 44fbbb3dc687c9709a6f2236197316e5c79ab1eb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Apr 2012 18:24:46 -0400 Subject: x86: get rid of calling do_notify_resume() when returning to kernel mode If we end up calling do_notify_resume() with !user_mode(refs), it does nothing (do_signal() explicitly bails out and we can't get there with TIF_NOTIFY_RESUME in such situations). Then we jump to resume_userspace_sig, which rechecks the same thing and bails out to resume_kernel, thus breaking the loop. 
It's easier and cheaper to check *before* calling do_notify_resume() and bail out to resume_kernel immediately. And kill the check in do_signal()... Note that on amd64 we can't get there with !user_mode() at all - asm glue takes care of that. Acked-and-reviewed-by: Thomas Gleixner Signed-off-by: Al Viro --- arch/x86/kernel/entry_32.S | 13 ++++++++++--- arch/x86/kernel/signal.c | 10 ---------- 2 files changed, 10 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 01ccf9b71473..623f28837476 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -316,7 +316,6 @@ ret_from_exception: preempt_stop(CLBR_ANY) ret_from_intr: GET_THREAD_INFO(%ebp) -resume_userspace_sig: #ifdef CONFIG_VM86 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS movb PT_CS(%esp), %al @@ -615,9 +614,13 @@ work_notifysig: # deal with pending signals and # vm86-space TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) + movb PT_CS(%esp), %bl + andb $SEGMENT_RPL_MASK, %bl + cmpb $USER_RPL, %bl + jb resume_kernel xorl %edx, %edx call do_notify_resume - jmp resume_userspace_sig + jmp resume_userspace ALIGN work_notifysig_v86: @@ -630,9 +633,13 @@ work_notifysig_v86: #endif TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) + movb PT_CS(%esp), %bl + andb $SEGMENT_RPL_MASK, %bl + cmpb $USER_RPL, %bl + jb resume_kernel xorl %edx, %edx call do_notify_resume - jmp resume_userspace_sig + jmp resume_userspace END(work_pending) # perform syscall exit tracing diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index e8a89374d356..21af737053aa 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -737,16 +737,6 @@ static void do_signal(struct pt_regs *regs) siginfo_t info; int signr; - /* - * We want the common case to go fast, which is why we may in certain - * cases get here from kernel mode. Just return without doing anything - * if so. 
- * X86_32: vm86 regs switched out by assembly code before reaching - * here, so testing against kernel CS suffices. - */ - if (!user_mode(regs)) - return; - signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { /* Whee! Actually deliver the signal. */ -- cgit v1.2.3 From bad1a753d4d4deb09d4bc0bac1dd4fc3298502e9 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 21 May 2012 20:29:45 -0700 Subject: x86, x32, ptrace: Remove PTRACE_ARCH_PRCTL for x32 When I added x32 ptrace to 3.4 kernel, I also include PTRACE_ARCH_PRCTL support for x32 GDB For ARCH_GET_FS/GS, it takes a pointer to int64. But at user level, ARCH_GET_FS/GS takes a pointer to int32. So I have to add x32 ptrace to glibc to handle it with a temporary int64 passed to kernel and copy it back to GDB as int32. Roland suggested that PTRACE_ARCH_PRCTL is obsolete and x32 GDB should use fs_base and gs_base fields of user_regs_struct instead. Accordingly, remove PTRACE_ARCH_PRCTL completely from the x32 code to avoid possible memory overrun when pointer to int32 is passed to kernel. Link: http://lkml.kernel.org/r/CAMe9rOpDzHfS7NH7m1vmD9QRw8SSj4Sc%2BaNOgcWm_WJME2eRsQ@mail.gmail.com Signed-off-by: H. Peter Anvin Cc: v3.4 --- arch/x86/kernel/ptrace.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 13b1990c7c58..c4c6a5c2bf0f 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1211,12 +1211,6 @@ static long x32_arch_ptrace(struct task_struct *child, 0, sizeof(struct user_i387_struct), datap); - /* normal 64bit interface to access TLS data. - Works just like arch_prctl, except that the arguments - are reversed. 
*/ - case PTRACE_ARCH_PRCTL: - return do_arch_prctl(child, data, addr); - default: return compat_ptrace_request(child, request, addr, data); } -- cgit v1.2.3 From 76eb9a30db4bc8fd172f9155247264b5f2686d7b Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 20 Feb 2012 14:20:06 +0800 Subject: ACPI, x86: fix Dell M6600 ACPI reboot regression via DMI Dell Precision M6600 is known to require PCI reboot, so add it to the reboot blacklist in pci_reboot_dmi_table[]. https://bugzilla.kernel.org/show_bug.cgi?id=42749 cc: x86@kernel.org Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 79c45af81604..412db5716d91 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -451,6 +451,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), }, }, + { /* Handle problems with rebooting on the Precision M6600. */ + .callback = set_pci_reboot, + .ident = "Dell OptiPlex 990", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"), + }, + }, { } }; -- cgit v1.2.3 From c2238f10e0c34a85a2a555c8a197316d1ca3fb7e Mon Sep 17 00:00:00 2001 From: Chen Gong Date: Tue, 5 Jun 2012 10:35:02 +0800 Subject: x86/mce: Fix the MCE poll timer logic In commit 82f7af09 (x86/mce: Cleanup timer mess), Thomas just forgot the "/ 2" there while cleaning up. 
Signed-off-by: Chen Gong Acked-by: Thomas Gleixner Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 98003bfc5556..d6b18a4d0b95 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1266,7 +1266,7 @@ static void mce_timer_fn(unsigned long data) */ iv = __this_cpu_read(mce_next_interval); if (mce_notify_irq()) - iv = max(iv, (unsigned long) HZ/100); + iv = max(iv / 2, (unsigned long) HZ/100); else iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); __this_cpu_write(mce_next_interval, iv); -- cgit v1.2.3 From 958fb3c51295764599d6abce87e1a01ace897a3e Mon Sep 17 00:00:00 2001 From: Chen Gong Date: Tue, 5 Jun 2012 10:35:02 +0800 Subject: x86/mce: Fix the MCE poll timer logic In commit 82f7af09 ("x86/mce: Cleanup timer mess), Thomas just forgot the "/ 2" there while cleaning up. 
Signed-off-by: Chen Gong Acked-by: Thomas Gleixner Cc: bp@amd64.org Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/1338863702-9245-1-git-send-email-gong.chen@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 0a687fd185e6..a97f3c4a3946 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1274,7 +1274,7 @@ static void mce_timer_fn(unsigned long data) */ iv = __this_cpu_read(mce_next_interval); if (mce_notify_irq()) - iv = max(iv, (unsigned long) HZ/100); + iv = max(iv / 2, (unsigned long) HZ/100); else iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); __this_cpu_write(mce_next_interval, iv); -- cgit v1.2.3 From 436d03faf6961b30e13b2d0967aea9d772d6cf44 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 5 Jun 2012 00:09:11 +0900 Subject: x86/decoder: Fix bsr/bsf/jmpe decoding with operand-size prefix Fix the x86 instruction decoder to decode bsr/bsf/jmpe with operand-size prefix (66h). This fixes the test case failure reported by Linus, attached below. bsf/bsr/jmpe have a special encoding. Opcode map in Intel Software Developers Manual vol2 says they have TZCNT/LZCNT variants if it has F3h prefix. However, there is no information if it has other 66h or F2h prefixes. Current instruction decoder supposes that those are bad instructions, but it actually accepts at least operand-size prefixes. H. Peter Anvin further explains: " TZCNT/LZCNT are F3 + BSF/BSR exactly because the F2 and F3 prefixes have historically been no-ops with most instructions. This allows software to unconditionally use the prefixed versions and get TZCNT/LZCNT on the processors that have them if they don't care about the difference. 
" This fixes errors reported by test_get_len: Warning: arch/x86/tools/test_get_len found difference at :ffffffff81036d87 Warning: ffffffff81036de5: 66 0f bc c2 bsf %dx,%ax Warning: objdump says 4 bytes, but insn_get_length() says 3 Warning: arch/x86/tools/test_get_len found difference at :ffffffff81036ea6 Warning: ffffffff81036f04: 66 0f bd c2 bsr %dx,%ax Warning: objdump says 4 bytes, but insn_get_length() says 3 Warning: decoded and checked 13298882 instructions with 2 warnings Reported-by: Linus Torvalds Reported-by: Pekka Enberg Signed-off-by: Masami Hiramatsu Cc: "H. Peter Anvin" Cc: Link: http://lkml.kernel.org/r/20120604150911.22338.43296.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- arch/x86/lib/x86-opcode-map.txt | 8 ++++---- arch/x86/tools/gen-insn-attr-x86.awk | 14 +++++++++----- 2 files changed, 13 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 819137904428..5d7e51f3fd28 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -28,7 +28,7 @@ # - (66): the last prefix is 0x66 # - (F3): the last prefix is 0xF3 # - (F2): the last prefix is 0xF2 -# +# - (!F3) : the last prefix is not 0xF3 (including non-last prefix case) Table: one byte opcode Referrer: @@ -515,12 +515,12 @@ b4: LFS Gv,Mp b5: LGS Gv,Mp b6: MOVZX Gv,Eb b7: MOVZX Gv,Ew -b8: JMPE | POPCNT Gv,Ev (F3) +b8: JMPE (!F3) | POPCNT Gv,Ev (F3) b9: Grp10 (1A) ba: Grp8 Ev,Ib (1A) bb: BTC Ev,Gv -bc: BSF Gv,Ev | TZCNT Gv,Ev (F3) -bd: BSR Gv,Ev | LZCNT Gv,Ev (F3) +bc: BSF Gv,Ev (!F3) | TZCNT Gv,Ev (F3) +bd: BSR Gv,Ev (!F3) | LZCNT Gv,Ev (F3) be: MOVSX Gv,Eb bf: MOVSX Gv,Ew # 0x0f 0xc0-0xcf diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index 5f6a5b6c3a15..ddcf39b1a18d 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -66,9 +66,10 @@ BEGIN { rex_expr = "^REX(\\.[XRWB]+)*" fpu_expr = "^ESC" 
# TODO - lprefix1_expr = "\\(66\\)" + lprefix1_expr = "\\((66|!F3)\\)" lprefix2_expr = "\\(F3\\)" - lprefix3_expr = "\\(F2\\)" + lprefix3_expr = "\\((F2|!F3)\\)" + lprefix_expr = "\\((66|F2|F3)\\)" max_lprefix = 4 # All opcodes starting with lower-case 'v' or with (v1) superscript @@ -333,13 +334,16 @@ function convert_operands(count,opnd, i,j,imm,mod) if (match(ext, lprefix1_expr)) { lptable1[idx] = add_flags(lptable1[idx],flags) variant = "INAT_VARIANT" - } else if (match(ext, lprefix2_expr)) { + } + if (match(ext, lprefix2_expr)) { lptable2[idx] = add_flags(lptable2[idx],flags) variant = "INAT_VARIANT" - } else if (match(ext, lprefix3_expr)) { + } + if (match(ext, lprefix3_expr)) { lptable3[idx] = add_flags(lptable3[idx],flags) variant = "INAT_VARIANT" - } else { + } + if (!match(ext, lprefix_expr)){ table[idx] = add_flags(table[idx],flags) } } -- cgit v1.2.3 From 1a87fc1ec7b05b9bc60df9dc52297d4c225d7f1a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Jun 2012 11:33:21 +0200 Subject: x86: mce: Add the dropped timer interval init back commit 82f7af09 ("x86/mce: Cleanup timer mess) dropped the initialization of the per cpu timer interval. Duh :( Restore the previous behaviour. 
Reported-by: Chen Gong Cc: bp@amd64.org Cc: tony.luck@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a97f3c4a3946..da27c5d2168a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1557,7 +1557,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) static void __mcheck_cpu_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); - unsigned long iv = __this_cpu_read(mce_next_interval); + unsigned long iv = check_interval * HZ; setup_timer(t, mce_timer_fn, smp_processor_id()); -- cgit v1.2.3 From aff5a62d52ff03956ff6992b9fe4b561fd855804 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Tue, 5 Jun 2012 15:00:31 -0400 Subject: x86/gart: Fix kmemleak warning aperture_64.c now is using memblock, the previous kmemleak_ignore() for alloc_bootmem() should be removed then. Otherwise, with kmemleak enabled, kernel will throw warnings like: [ 0.000000] kmemleak: Trying to color unknown object at 0xffff8800c4000000 as Black [ 0.000000] Pid: 0, comm: swapper/0 Not tainted 3.5.0-rc1-next-20120605+ #130 [ 0.000000] Call Trace: [ 0.000000] [] paint_ptr+0x66/0xc0 [ 0.000000] [] kmemleak_ignore+0x2b/0x60 [ 0.000000] [] kmemleak_init+0x217/0x2c1 [ 0.000000] [] start_kernel+0x32d/0x3eb [ 0.000000] [] ? repair_env_string+0x5a/0x5a [ 0.000000] [] x86_64_start_reservations+0x131/0x135 [ 0.000000] [] ? 
early_idt_handlers+0x120/0x120 [ 0.000000] [] x86_64_start_kernel+0x102/0x111 [ 0.000000] kmemleak: Early log backtrace: [ 0.000000] [] kmemleak_ignore+0x4b/0x60 [ 0.000000] [] gart_iommu_hole_init+0x3e7/0x547 [ 0.000000] [] pci_iommu_alloc+0x44/0x6f [ 0.000000] [] mem_init+0x19/0xec [ 0.000000] [] start_kernel+0x1ea/0x3eb [ 0.000000] [] x86_64_start_reservations+0x131/0x135 [ 0.000000] [] x86_64_start_kernel+0x102/0x111 [ 0.000000] [] 0xffffffffffffffff Signed-off-by: Xiaotian Feng Cc: Xiaotian Feng Cc: Tejun Heo Link: http://lkml.kernel.org/r/1338922831-2847-1-git-send-email-xtfeng@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/aperture_64.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 6e76c191a835..d5fd66f0d4cd 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -95,11 +94,6 @@ static u32 __init allocate_aperture(void) return 0; } memblock_reserve(addr, aper_size); - /* - * Kmemleak should not scan this block as it may not be mapped via the - * kernel direct mapping. - */ - kmemleak_ignore(phys_to_virt(addr)); printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", aper_size >> 10, addr); insert_aperture_resource((u32)addr, aper_size); -- cgit v1.2.3 From 4af463d28f1a026e25c0b879fac2a0d2b7bff599 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Mon, 4 Jun 2012 11:42:32 +0900 Subject: x86/numa: Set numa_nodes_parsed at acpi_numa_memory_affinity_init() When hot-adding a CPU, the system outputs following messages since node_to_cpumask_map[2] was not allocated memory. Booting Node 2 Processor 32 APIC 0xc0 node_to_cpumask_map[2] NULL Pid: 0, comm: swapper/32 Tainted: G A 3.3.5-acd #21 Call Trace: [] debug_cpumask_set_cpu+0x155/0x160 [] ? 
add_timer_on+0xaa/0x120 [] numa_add_cpu+0x1e/0x22 [] identify_cpu+0x1df/0x1e4 [] identify_secondary_cpu+0x16/0x1d [] smp_store_cpu_info+0x3c/0x3e [] smp_callin+0x139/0x1be [] start_secondary+0x13/0xeb The reason is that the bit of node 2 was not set at numa_nodes_parsed. numa_nodes_parsed is set by only acpi_numa_processor_affinity_init / acpi_numa_x2apic_affinity_init. Thus even if hot-added memory which is same PXM as hot-added CPU is written in ACPI SRAT Table, if the hot-added CPU is not written in ACPI SRAT table, numa_nodes_parsed is not set. But according to ACPI Spec Rev 5.0, it says about ACPI SRAT table as follows: This optional table provides information that allows OSPM to associate processors and memory ranges, including ranges of memory provided by hot-added memory devices, with system localities / proximity domains and clock domains. It means that ACPI SRAT table only provides information for CPUs present at boot time and for memory including hot-added memory. So hot-added memory is written in ACPI SRAT table, but hot-added CPU is not written in it. Thus numa_nodes_parsed should be set by not only acpi_numa_processor_affinity_init / acpi_numa_x2apic_affinity_init but also acpi_numa_memory_affinity_init for the case. Additionally, if system has cpuless memory node, acpi_numa_processor_affinity_init / acpi_numa_x2apic_affinity_init cannot set numa_nodes_parsed since these functions cannot find cpu description for the node. In this case, numa_nodes_parsed needs to be set by acpi_numa_memory_affinity_init. 
Signed-off-by: Yasuaki Ishimatsu Acked-by: David Rientjes Acked-by: KOSAKI Motohiro Cc: liuj97@gmail.com Cc: kosaki.motohiro@gmail.com Link: http://lkml.kernel.org/r/4FCC2098.4030007@jp.fujitsu.com [ merged it ] Signed-off-by: Ingo Molnar --- arch/x86/mm/srat.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 732af3a96183..4599c3e8bcb6 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -176,6 +176,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) return; } + node_set(node, numa_nodes_parsed); + printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", node, pxm, (unsigned long long) start, (unsigned long long) end - 1); -- cgit v1.2.3 From 7071f6b2889bb41bea61891d8a3e6e70517ef5e6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 31 May 2012 23:20:25 +0200 Subject: x86/intel/moorestown: Change intel_scu_devices_create() to __devinit The allmodconfig hits: WARNING: vmlinux.o(.text+0x6553d): Section mismatch in reference from the function intel_scu_devices_create() to the function .devinit.text: spi_register_board_info() [...] This patch marks intel_scu_devices_create() as devinit because it only calls a devinit function, spi_register_board_info(). Signed-off-by: Sebastian Andrzej Siewior Cc: Alan Cox Cc: Kirill A. 
Shutemov Cc: Mika Westerberg Cc: Samuel Ortiz Cc: Feng Tang Link: http://lkml.kernel.org/r/20120531212025.GA8519@breakpoint.cc Signed-off-by: Ingo Molnar --- arch/x86/platform/mrst/mrst.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index e31bcd8f2eee..fd41a9262d65 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -782,7 +782,7 @@ BLOCKING_NOTIFIER_HEAD(intel_scu_notifier); EXPORT_SYMBOL_GPL(intel_scu_notifier); /* Called by IPC driver */ -void intel_scu_devices_create(void) +void __devinit intel_scu_devices_create(void) { int i; -- cgit v1.2.3 From 55c844a4dd16a4d1fdc0cf2a283ec631a02ec448 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 30 May 2012 23:15:41 +0800 Subject: x86/reboot: Fix a warning message triggered by stop_other_cpus() When rebooting our 24 CPU Westmere servers with 3.4-rc6, we always see this warning msg: Restarting system. machine restart ------------[ cut here ]------------ WARNING: at arch/x86/kernel/smp.c:125 native_smp_send_reschedule+0x74/0xa7() Hardware name: X8DTN Modules linked in: igb [last unloaded: scsi_wait_scan] Pid: 1, comm: systemd-shutdow Not tainted 3.4.0-rc6+ #22 Call Trace: [] warn_slowpath_common+0x7e/0x96 [] warn_slowpath_null+0x15/0x17 [] native_smp_send_reschedule+0x74/0xa7 [] trigger_load_balance+0x279/0x2a6 [] scheduler_tick+0xe0/0xe9 [] update_process_times+0x60/0x70 [] tick_sched_timer+0x68/0x92 [] __run_hrtimer+0xb3/0x13c [] ? tick_nohz_handler+0xd0/0xd0 [] hrtimer_interrupt+0xdb/0x198 [] smp_apic_timer_interrupt+0x81/0x94 [] apic_timer_interrupt+0x67/0x70 [] ? default_send_IPI_mask_allbutself_phys+0xb4/0xc4 [] physflat_send_IPI_allbutself+0x12/0x14 [] native_nmi_stop_other_cpus+0x8a/0xd6 [] native_machine_shutdown+0x50/0x67 [] machine_shutdown+0xa/0xc [] native_machine_restart+0x20/0x32 [] machine_restart+0xa/0xc [] kernel_restart+0x47/0x4c [] sys_reboot+0x13e/0x17c [] ? 
_raw_spin_unlock_bh+0x10/0x12 [] ? bdi_queue_work+0xcf/0xd8 [] ? __bdi_start_writeback+0xae/0xb7 [] ? iterate_supers+0xa3/0xb7 [] system_call_fastpath+0x16/0x1b ---[ end trace 320af5cb1cb60c5b ]--- The root cause seems to be the default_send_IPI_mask_allbutself_phys() takes quite some time (I measured it could be several ms) to complete sending NMIs to all the other 23 CPUs, and for HZ=250/1000 system, the time is long enough for a timer interrupt to happen, which will in turn trigger to kick load balance to a stopped CPU and cause this warning in native_smp_send_reschedule(). So disabling the local irq before stop_other_cpus() can fix this problem (tested 25 times reboot ok), and it is fine as there should be nobody caring the timer interrupt in such reboot stage. The latest 3.4 kernel slightly changes this behavior by sending REBOOT_VECTOR first and only send NMI_VECTOR if the REBOOT_VECTOR fails, and this patch is still needed to prevent the problem. Signed-off-by: Feng Tang Acked-by: Don Zickus Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120530231541.4c13433a@feng-i7 Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 79c45af81604..25b48edb847c 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -639,9 +639,11 @@ void native_machine_shutdown(void) set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id)); /* - * O.K Now that I'm on the appropriate processor, - * stop all of the others. + * O.K Now that I'm on the appropriate processor, stop all of the + * others. Also disable the local irq to not receive the per-cpu + * timer interrupt which may trigger scheduler's load balance. 
*/ + local_irq_disable(); stop_other_cpus(); #endif -- cgit v1.2.3 From f6175f5bfb4c9f2ed32758c95f765b529b1a7f15 Mon Sep 17 00:00:00 2001 From: Tomoki Sekiyama Date: Mon, 28 May 2012 18:09:18 +0900 Subject: x86/ioapic: Fix NULL pointer dereference on CPU hotplug after disabling irqs In current Linux, percpu variable `vector_irq' is not cleared on offlined cpus while disabling devices' irqs. If the cpu that has the disabled irqs in vector_irq is hotplugged, __setup_vector_irq() hits invalid irq vector and may crash. This bug can be reproduced as following; # echo 0 > /sys/devices/system/cpu/cpu7/online # modprobe -r some_driver_using_interrupts # vector_irq@cpu7 uncleared # echo 1 > /sys/devices/system/cpu/cpu7/online # kernel may crash This patch fixes this bug by clearing vector_irq in __clear_irq_vector() even if the cpu is offlined. Signed-off-by: Tomoki Sekiyama Acked-by: Thomas Gleixner Cc: yrl.pp-manager.tt@hitachi.com Cc: ltc-kernel@ml.yrl.intra.hitachi.co.jp Cc: Suresh Siddha Cc: Yinghai Lu Cc: Alexander Gordeev Link: http://lkml.kernel.org/r/4FC340BE.7080101@hitachi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ac96561d1a99..5f0ff597437c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1195,7 +1195,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) BUG_ON(!cfg->vector); vector = cfg->vector; - for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) + for_each_cpu(cpu, cfg->domain) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = 0; @@ -1203,7 +1203,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) if (likely(!cfg->move_in_progress)) return; - for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { + for_each_cpu(cpu, cfg->old_domain) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if 
(per_cpu(vector_irq, cpu)[vector] != irq) -- cgit v1.2.3 From ceb1cbac8eda66cf0f889def226b4e82f8ff857b Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Thu, 31 May 2012 13:07:38 +0530 Subject: sched/x86: Calculate booted cores after construction of sibling_mask Commit 316ad248307fb ("sched/x86: Rewrite set_cpu_sibling_map()") broke the booted_cores accounting. The problem is that the booted_cores accounting needs all the sibling links set up. So restore the second loop and add a comment as to why its needed. On qemu booted with -smp sockets=1,cores=2,threads=2; Before: $ grep cores /proc/cpuinfo cpu cores : 2 cpu cores : 1 cpu cores : 4 cpu cores : 3 With the patch: $ grep cores /proc/cpuinfo cpu cores : 2 cpu cores : 2 cpu cores : 2 cpu cores : 2 Reported-by: Prarit Bhargava Reported-by: Borislav Petkov Signed-off-by: Kamalesh Babulal Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120531073738.GH7511@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fd019d78b1f4..3fab55bea29b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -382,6 +382,15 @@ void __cpuinit set_cpu_sibling_map(int cpu) if ((i == cpu) || (has_mc && match_llc(c, o))) link_mask(llc_shared, cpu, i); + } + + /* + * This needs a separate iteration over the cpus because we rely on all + * cpu_sibling_mask links to be set-up. + */ + for_each_cpu(i, cpu_sibling_setup_mask) { + o = &cpu_data(i); + if ((i == cpu) || (has_mc && match_mc(c, o))) { link_mask(core, cpu, i); -- cgit v1.2.3 From b430f7c4706aeba4270c7ab7744fc504b9315e1c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Jun 2012 15:30:31 +0200 Subject: perf/x86: Fix Intel shared extra MSR allocation Zheng Yan reported that event group validation can wreck event state when Intel extra_reg allocation changes event state. 
Validation shouldn't change any persistent state. Cloning events in validate_{event,group}() isn't really pretty either, so add a few special cases to avoid modifying the event state. The code is restructured to minimize the special case impact. Reported-by: Zheng Yan Acked-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338903031.28282.175.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 1 + arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_intel.c | 92 +++++++++++++++++++++++----------- 3 files changed, 66 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e049d6da0183..cb608383e4f6 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1496,6 +1496,7 @@ static struct cpu_hw_events *allocate_fake_cpuc(void) if (!cpuc->shared_regs) goto error; } + cpuc->is_fake = 1; return cpuc; error: free_fake_cpuc(cpuc); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 6638aaf54493..83794d8e6af0 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -117,6 +117,7 @@ struct cpu_hw_events { struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ unsigned int group_flag; + int is_fake; /* * Intel DebugStore bits diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 166546ec6aef..965baa2fa790 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1119,27 +1119,33 @@ intel_bts_constraints(struct perf_event *event) return NULL; } -static bool intel_try_alt_er(struct perf_event *event, int orig_idx) +static int intel_alt_er(int idx) { if (!(x86_pmu.er_flags & ERF_HAS_RSP_1)) - return false; + return idx; - if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) { - event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - 
event->hw.config |= 0x01bb; - event->hw.extra_reg.idx = EXTRA_REG_RSP_1; - event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; - } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) { + if (idx == EXTRA_REG_RSP_0) + return EXTRA_REG_RSP_1; + + if (idx == EXTRA_REG_RSP_1) + return EXTRA_REG_RSP_0; + + return idx; +} + +static void intel_fixup_er(struct perf_event *event, int idx) +{ + event->hw.extra_reg.idx = idx; + + if (idx == EXTRA_REG_RSP_0) { event->hw.config &= ~INTEL_ARCH_EVENT_MASK; event->hw.config |= 0x01b7; - event->hw.extra_reg.idx = EXTRA_REG_RSP_0; event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; + } else if (idx == EXTRA_REG_RSP_1) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= 0x01bb; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; } - - if (event->hw.extra_reg.idx == orig_idx) - return false; - - return true; } /* @@ -1157,14 +1163,18 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, struct event_constraint *c = &emptyconstraint; struct er_account *era; unsigned long flags; - int orig_idx = reg->idx; + int idx = reg->idx; - /* already allocated shared msr */ - if (reg->alloc) + /* + * reg->alloc can be set due to existing state, so for fake cpuc we + * need to ignore this, otherwise we might fail to allocate proper fake + * state for this extra reg constraint. Also see the comment below. + */ + if (reg->alloc && !cpuc->is_fake) return NULL; /* call x86_get_event_constraint() */ again: - era = &cpuc->shared_regs->regs[reg->idx]; + era = &cpuc->shared_regs->regs[idx]; /* * we use spin_lock_irqsave() to avoid lockdep issues when * passing a fake cpuc @@ -1173,6 +1183,29 @@ again: if (!atomic_read(&era->ref) || era->config == reg->config) { + /* + * If its a fake cpuc -- as per validate_{group,event}() we + * shouldn't touch event state and we can avoid doing so + * since both will only call get_event_constraints() once + * on each event, this avoids the need for reg->alloc. 
+ * + * Not doing the ER fixup will only result in era->reg being + * wrong, but since we won't actually try and program hardware + * this isn't a problem either. + */ + if (!cpuc->is_fake) { + if (idx != reg->idx) + intel_fixup_er(event, idx); + + /* + * x86_schedule_events() can call get_event_constraints() + * multiple times on events in the case of incremental + * scheduling(). reg->alloc ensures we only do the ER + * allocation once. + */ + reg->alloc = 1; + } + /* lock in msr value */ era->config = reg->config; era->reg = reg->reg; @@ -1180,17 +1213,17 @@ again: /* one more user */ atomic_inc(&era->ref); - /* no need to reallocate during incremental event scheduling */ - reg->alloc = 1; - /* * need to call x86_get_event_constraint() * to check if associated event has constraints */ c = NULL; - } else if (intel_try_alt_er(event, orig_idx)) { - raw_spin_unlock_irqrestore(&era->lock, flags); - goto again; + } else { + idx = intel_alt_er(idx); + if (idx != reg->idx) { + raw_spin_unlock_irqrestore(&era->lock, flags); + goto again; + } } raw_spin_unlock_irqrestore(&era->lock, flags); @@ -1204,11 +1237,14 @@ __intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc, struct er_account *era; /* - * only put constraint if extra reg was actually - * allocated. Also takes care of event which do - * not use an extra shared reg + * Only put constraint if extra reg was actually allocated. Also takes + * care of event which do not use an extra shared reg. + * + * Also, if this is a fake cpuc we shouldn't touch any event state + * (reg->alloc) and we don't care about leaving inconsistent cpuc state + * either since it'll be thrown out. 
*/ - if (!reg->alloc) + if (!reg->alloc || cpuc->is_fake) return; era = &cpuc->shared_regs->regs[reg->idx]; -- cgit v1.2.3 From cccb9ba9e4ee0d750265f53de9258df69655c40b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Jun 2012 10:26:43 +0200 Subject: perf/x86: Implement cycles:p for SNB/IVB Now that there's finally a chip with working PEBS (IvyBridge), we can enable the hardware and implement cycles:p for SNB/IVB. Cc: Stephane Eranian Requested-and-tested-by: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338884803.28282.153.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_intel.c | 50 ++++++++++++++++++++++++++++------ 2 files changed, 43 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 83794d8e6af0..7241e2fc3c17 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -365,6 +365,7 @@ struct x86_pmu { int pebs_record_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; + void (*pebs_aliases)(struct perf_event *event); /* * Intel LBR diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 965baa2fa790..2312c1ff1b19 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1336,15 +1336,9 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc, intel_put_shared_regs_event_constraints(cpuc, event); } -static int intel_pmu_hw_config(struct perf_event *event) +static void intel_pebs_aliases_core2(struct perf_event *event) { - int ret = x86_pmu_hw_config(event); - - if (ret) - return ret; - - if (event->attr.precise_ip && - (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { /* * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P * (0x003c) so 
that we can use it with PEBS. @@ -1365,10 +1359,48 @@ static int intel_pmu_hw_config(struct perf_event *event) */ u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16); + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } +} + +static void intel_pebs_aliases_snb(struct perf_event *event) +{ + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use UOPS_RETIRED.ALL + * (0x01c2), which is a PEBS capable event, to get the same + * count. + * + * UOPS_RETIRED.ALL counts the number of cycles that retires + * CNTMASK micro-ops. By setting CNTMASK to a value (16) + * larger than the maximum number of micro-ops that can be + * retired per cycle (4) and then inverting the condition, we + * count all cycles that retire 16 or less micro-ops, which + * is every cycle. + * + * Thereby we gain a PEBS capable cycle counter. 
+ */ + u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16); alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); event->hw.config = alt_config; } +} + +static int intel_pmu_hw_config(struct perf_event *event) +{ + int ret = x86_pmu_hw_config(event); + + if (ret) + return ret; + + if (event->attr.precise_ip && x86_pmu.pebs_aliases) + x86_pmu.pebs_aliases(event); if (intel_pmu_needs_lbr_smpl(event)) { ret = intel_pmu_setup_lbr_filter(event); @@ -1643,6 +1675,7 @@ static __initconst const struct x86_pmu intel_pmu = { .max_period = (1ULL << 31) - 1, .get_event_constraints = intel_get_event_constraints, .put_event_constraints = intel_put_event_constraints, + .pebs_aliases = intel_pebs_aliases_core2, .format_attrs = intel_arch3_formats_attr, @@ -1885,6 +1918,7 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_snb_event_constraints; x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; x86_pmu.extra_regs = intel_snb_extra_regs; /* all extra regs are per-cpu when HT is on */ x86_pmu.er_flags |= ERF_HAS_RSP_1; -- cgit v1.2.3 From b6db437ba8322f5cee0bd355ad2ef9f73c413754 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Jun 2012 10:26:43 +0200 Subject: perf/x86: Enable/Add IvyBridge hardware support Implement rudimentary IVB perf support. The SDM states its identical to SNB with exception of the exact event tables, but a quick look suggests they're similar enough. Also mark SNB-EP as broken for now. 
Requested-and-tested-by: Linus Torvalds Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1338884803.28282.153.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 2312c1ff1b19..187c294bc658 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1909,8 +1909,9 @@ __init int intel_pmu_init(void) break; case 42: /* SandyBridge */ - x86_add_quirk(intel_sandybridge_quirk); case 45: /* SandyBridge, "Romely-EP" */ + x86_add_quirk(intel_sandybridge_quirk); + case 58: /* IvyBridge */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); -- cgit v1.2.3 From 8440ccb43fc0ecffcf1acee0273d766e6a8cd51d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Jun 2012 10:26:43 +0200 Subject: perf/x86: Update SNB PEBS constraints Afaict there's no need to (incompletely) iterate the MEM_UOPS_RETIRED.* umask state. 
Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1338884803.28282.153.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 5a3edc27f6e5..35e2192df9f4 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -400,14 +400,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */ - INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */ - INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */ - INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */ + INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */ -- cgit v1.2.3 From 302fa4b58ac754a6da13f4f5546f710fecc3b945 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Fri, 20 Apr 2012 15:41:33 -0700 Subject: perf/x86: Allow multiple stacks Without this patch, applications with two different stack regions (eg: native stack vs JIT stack) get truncated callchains even when RBP chaining is present. 
GDB shows proper stack traces and the frame pointer chaining is intact. This patch disables the (fp < RSP) check, hoping that other checks in the code save the day for us. In our limited testing, this didn't seem to break anything. In the long term, we could potentially have userspace advise the kernel on the range of valid stack addresses, so we don't spend a lot of time unwinding from bogus addresses. Signed-off-by: Arun Sharma CC: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Paul Mackerras Cc: Stephane Eranian Cc: Namhyung Kim Cc: Tom Zanussi Cc: linux-kernel@vger.kernel.org Cc: linux-perf-users@vger.kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1334961696-19580-2-git-send-email-asharma@fb.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index cb608383e4f6..e78bc256aea8 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1781,9 +1781,6 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) if (bytes != sizeof(frame)) break; - if (fp < compat_ptr(regs->sp)) - break; - perf_callchain_store(entry, frame.return_address); fp = compat_ptr(frame.next_frame); } @@ -1827,9 +1824,6 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) if (bytes != sizeof(frame)) break; - if ((unsigned long)fp < regs->sp) - break; - perf_callchain_store(entry, frame.return_address); fp = frame.next_frame; } -- cgit v1.2.3 From bc6ca7b342d5ae15c3ba3081fd40271b8039fb25 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Fri, 20 Apr 2012 15:41:35 -0700 Subject: perf/x86: Check if user fp is valid Signed-off-by: Arun Sharma Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1334961696-19580-4-git-send-email-asharma@fb.com 
Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess.h | 12 ++++++------ arch/x86/kernel/cpu/perf_event.c | 12 ++++++++++++ 2 files changed, 18 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 04cd6882308e..e1f3a17034fc 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -33,9 +33,8 @@ #define segment_eq(a, b) ((a).seg == (b).seg) #define user_addr_max() (current_thread_info()->addr_limit.seg) -#define __addr_ok(addr) \ - ((unsigned long __force)(addr) < \ - (current_thread_info()->addr_limit.seg)) +#define __addr_ok(addr) \ + ((unsigned long __force)(addr) < user_addr_max()) /* * Test whether a block of memory is a valid user space address. @@ -47,14 +46,14 @@ * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry... */ -#define __range_not_ok(addr, size) \ +#define __range_not_ok(addr, size, limit) \ ({ \ unsigned long flag, roksum; \ __chk_user_ptr(addr); \ asm("add %3,%1 ; sbb %0,%0 ; cmp %1,%4 ; sbb $0,%0" \ : "=&r" (flag), "=r" (roksum) \ : "1" (addr), "g" ((long)(size)), \ - "rm" (current_thread_info()->addr_limit.seg)); \ + "rm" (limit)); \ flag; \ }) @@ -77,7 +76,8 @@ * checks that the pointer is in the user space range - after calling * this function, memory access functions may still return -EFAULT. 
*/ -#define access_ok(type, addr, size) (likely(__range_not_ok(addr, size) == 0)) +#define access_ok(type, addr, size) \ + (likely(__range_not_ok(addr, size, user_addr_max()) == 0)) /* * The exception table consists of pairs of addresses relative to the diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e78bc256aea8..c4706cf9c011 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1757,6 +1757,12 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); } +static inline int +valid_user_frame(const void __user *fp, unsigned long size) +{ + return (__range_not_ok(fp, size, TASK_SIZE) == 0); +} + #ifdef CONFIG_COMPAT #include @@ -1781,6 +1787,9 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) if (bytes != sizeof(frame)) break; + if (!valid_user_frame(fp, sizeof(frame))) + break; + perf_callchain_store(entry, frame.return_address); fp = compat_ptr(frame.next_frame); } @@ -1824,6 +1833,9 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) if (bytes != sizeof(frame)) break; + if (!valid_user_frame(fp, sizeof(frame))) + break; + perf_callchain_store(entry, frame.return_address); fp = frame.next_frame; } -- cgit v1.2.3 From db0dc75d6403b6663c0eab4c6ccb672eb9b2ed72 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Fri, 20 Apr 2012 15:41:36 -0700 Subject: perf/x86: Check user address explicitly in copy_from_user_nmi() Signed-off-by: Arun Sharma Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1334961696-19580-5-git-send-email-asharma@fb.com Signed-off-by: Ingo Molnar --- arch/x86/lib/usercopy.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index f61ee67ec00f..677b1ed184c9 100644 --- a/arch/x86/lib/usercopy.c +++ 
b/arch/x86/lib/usercopy.c @@ -8,6 +8,7 @@ #include #include +#include /* * best effort, GUP based copy_from_user() that is NMI-safe @@ -21,6 +22,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) void *map; int ret; + if (__range_not_ok(from, n, TASK_SIZE) == 0) + return len; + do { ret = __get_user_pages_fast(addr, 1, 0, &page); if (!ret) -- cgit v1.2.3 From 743628e868c5992354fc80b4d1e9a6143da1c0e6 Mon Sep 17 00:00:00 2001 From: Jordan Justen Date: Thu, 7 Jun 2012 09:05:21 -0700 Subject: x86, efi stub: Add .reloc section back into image Some UEFI firmware will not load a .efi with a .reloc section with a size of 0. Therefore, we create a .efi image with 4 main areas and 3 sections. 1. PE/COFF file header 2. .setup section (covers all setup code following the first sector) 3. .reloc section (contains 1 dummy reloc entry, created in build.c) 4. .text section (covers the remaining kernel image) To make room for the new .setup section data, the header bugger_off_msg had to be shortened. Reported-by: Henrik Rydberg Signed-off-by: Jordan Justen Link: http://lkml.kernel.org/r/1339085121-12760-1-git-send-email-jordan.l.justen@intel.com Tested-by: Lee G Rosenbaum Tested-by: Henrik Rydberg Cc: Matt Fleming Signed-off-by: H. Peter Anvin --- arch/x86/boot/header.S | 42 ++++++++--- arch/x86/boot/tools/build.c | 172 ++++++++++++++++++++++++++++---------------- 2 files changed, 140 insertions(+), 74 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 8bbea6aa40d9..efe5acfc79c3 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -94,10 +94,10 @@ bs_die: .section ".bsdata", "a" bugger_off_msg: - .ascii "Direct booting from floppy is no longer supported.\r\n" - .ascii "Please use a boot loader program instead.\r\n" + .ascii "Direct floppy boot is not supported. " + .ascii "Use a boot loader program instead.\r\n" .ascii "\n" - .ascii "Remove disk and press any key to reboot . . 
.\r\n" + .ascii "Remove disk and press any key to reboot ...\r\n" .byte 0 #ifdef CONFIG_EFI_STUB @@ -111,7 +111,7 @@ coff_header: #else .word 0x8664 # x86-64 #endif - .word 2 # nr_sections + .word 3 # nr_sections .long 0 # TimeDateStamp .long 0 # PointerToSymbolTable .long 1 # NumberOfSymbols @@ -158,8 +158,8 @@ extra_header_fields: #else .quad 0 # ImageBase #endif - .long 0x1000 # SectionAlignment - .long 0x200 # FileAlignment + .long 0x20 # SectionAlignment + .long 0x20 # FileAlignment .word 0 # MajorOperatingSystemVersion .word 0 # MinorOperatingSystemVersion .word 0 # MajorImageVersion @@ -200,8 +200,10 @@ extra_header_fields: # Section table section_table: - .ascii ".text" - .byte 0 + # + # The offset & size fields are filled in by build.c. + # + .ascii ".setup" .byte 0 .byte 0 .long 0 @@ -217,9 +219,8 @@ section_table: # # The EFI application loader requires a relocation section - # because EFI applications must be relocatable. But since - # we don't need the loader to fixup any relocs for us, we - # just create an empty (zero-length) .reloc section header. + # because EFI applications must be relocatable. The .reloc + # offset & size fields are filled in by build.c. # .ascii ".reloc" .byte 0 @@ -233,6 +234,25 @@ section_table: .word 0 # NumberOfRelocations .word 0 # NumberOfLineNumbers .long 0x42100040 # Characteristics (section flags) + + # + # The offset & size fields are filled in by build.c. + # + .ascii ".text" + .byte 0 + .byte 0 + .byte 0 + .long 0 + .long 0x0 # startup_{32,64} + .long 0 # Size of initialized data + # on disk + .long 0x0 # startup_{32,64} + .long 0 # PointerToRelocations + .long 0 # PointerToLineNumbers + .word 0 # NumberOfRelocations + .word 0 # NumberOfLineNumbers + .long 0x60500020 # Characteristics (section flags) + #endif /* CONFIG_EFI_STUB */ # Kernel attributes; used by setup. 
This is part 1 of the diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index 3f61f6e2b46f..4b8e165ee572 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -50,6 +50,8 @@ typedef unsigned int u32; u8 buf[SETUP_SECT_MAX*512]; int is_big_kernel; +#define PECOFF_RELOC_RESERVE 0x20 + /*----------------------------------------------------------------------*/ static const u32 crctab32[] = { @@ -133,11 +135,103 @@ static void usage(void) die("Usage: build setup system [> image]"); } -int main(int argc, char ** argv) -{ #ifdef CONFIG_EFI_STUB - unsigned int file_sz, pe_header; + +static void update_pecoff_section_header(char *section_name, u32 offset, u32 size) +{ + unsigned int pe_header; + unsigned short num_sections; + u8 *section; + + pe_header = get_unaligned_le32(&buf[0x3c]); + num_sections = get_unaligned_le16(&buf[pe_header + 6]); + +#ifdef CONFIG_X86_32 + section = &buf[pe_header + 0xa8]; +#else + section = &buf[pe_header + 0xb8]; #endif + + while (num_sections > 0) { + if (strncmp((char*)section, section_name, 8) == 0) { + /* section header size field */ + put_unaligned_le32(size, section + 0x8); + + /* section header vma field */ + put_unaligned_le32(offset, section + 0xc); + + /* section header 'size of initialised data' field */ + put_unaligned_le32(size, section + 0x10); + + /* section header 'file offset' field */ + put_unaligned_le32(offset, section + 0x14); + + break; + } + section += 0x28; + num_sections--; + } +} + +static void update_pecoff_setup_and_reloc(unsigned int size) +{ + u32 setup_offset = 0x200; + u32 reloc_offset = size - PECOFF_RELOC_RESERVE; + u32 setup_size = reloc_offset - setup_offset; + + update_pecoff_section_header(".setup", setup_offset, setup_size); + update_pecoff_section_header(".reloc", reloc_offset, PECOFF_RELOC_RESERVE); + + /* + * Modify .reloc section contents with a single entry. The + * relocation is applied to offset 10 of the relocation section. 
+ */ + put_unaligned_le32(reloc_offset + 10, &buf[reloc_offset]); + put_unaligned_le32(10, &buf[reloc_offset + 4]); +} + +static void update_pecoff_text(unsigned int text_start, unsigned int file_sz) +{ + unsigned int pe_header; + unsigned int text_sz = file_sz - text_start; + + pe_header = get_unaligned_le32(&buf[0x3c]); + + /* Size of image */ + put_unaligned_le32(file_sz, &buf[pe_header + 0x50]); + + /* + * Size of code: Subtract the size of the first sector (512 bytes) + * which includes the header. + */ + put_unaligned_le32(file_sz - 512, &buf[pe_header + 0x1c]); + +#ifdef CONFIG_X86_32 + /* + * Address of entry point. + * + * The EFI stub entry point is +16 bytes from the start of + * the .text section. + */ + put_unaligned_le32(text_start + 16, &buf[pe_header + 0x28]); +#else + /* + * Address of entry point. startup_32 is at the beginning and + * the 64-bit entry point (startup_64) is always 512 bytes + * after. The EFI stub entry point is 16 bytes after that, as + * the first instruction allows legacy loaders to jump over + * the EFI stub initialisation + */ + put_unaligned_le32(text_start + 528, &buf[pe_header + 0x28]); +#endif /* CONFIG_X86_32 */ + + update_pecoff_section_header(".text", text_start, text_sz); +} + +#endif /* CONFIG_EFI_STUB */ + +int main(int argc, char ** argv) +{ unsigned int i, sz, setup_sectors; int c; u32 sys_size; @@ -163,6 +257,12 @@ int main(int argc, char ** argv) die("Boot block hasn't got boot flag (0xAA55)"); fclose(file); +#ifdef CONFIG_EFI_STUB + /* Reserve 0x20 bytes for .reloc section */ + memset(buf+c, 0, PECOFF_RELOC_RESERVE); + c += PECOFF_RELOC_RESERVE; +#endif + /* Pad unused space with zeros */ setup_sectors = (c + 511) / 512; if (setup_sectors < SETUP_SECT_MIN) @@ -170,6 +270,10 @@ int main(int argc, char ** argv) i = setup_sectors*512; memset(buf+c, 0, i-c); +#ifdef CONFIG_EFI_STUB + update_pecoff_setup_and_reloc(i); +#endif + /* Set the default root device */ put_unaligned_le16(DEFAULT_ROOT_DEV, &buf[508]); @@ 
-194,66 +298,8 @@ int main(int argc, char ** argv) put_unaligned_le32(sys_size, &buf[0x1f4]); #ifdef CONFIG_EFI_STUB - file_sz = sz + i + ((sys_size * 16) - sz); - - pe_header = get_unaligned_le32(&buf[0x3c]); - - /* Size of image */ - put_unaligned_le32(file_sz, &buf[pe_header + 0x50]); - - /* - * Subtract the size of the first section (512 bytes) which - * includes the header and .reloc section. The remaining size - * is that of the .text section. - */ - file_sz -= 512; - - /* Size of code */ - put_unaligned_le32(file_sz, &buf[pe_header + 0x1c]); - -#ifdef CONFIG_X86_32 - /* - * Address of entry point. - * - * The EFI stub entry point is +16 bytes from the start of - * the .text section. - */ - put_unaligned_le32(i + 16, &buf[pe_header + 0x28]); - - /* .text size */ - put_unaligned_le32(file_sz, &buf[pe_header + 0xb0]); - - /* .text vma */ - put_unaligned_le32(0x200, &buf[pe_header + 0xb4]); - - /* .text size of initialised data */ - put_unaligned_le32(file_sz, &buf[pe_header + 0xb8]); - - /* .text file offset */ - put_unaligned_le32(0x200, &buf[pe_header + 0xbc]); -#else - /* - * Address of entry point. startup_32 is at the beginning and - * the 64-bit entry point (startup_64) is always 512 bytes - * after. 
The EFI stub entry point is 16 bytes after that, as - * the first instruction allows legacy loaders to jump over - * the EFI stub initialisation - */ - put_unaligned_le32(i + 528, &buf[pe_header + 0x28]); - - /* .text size */ - put_unaligned_le32(file_sz, &buf[pe_header + 0xc0]); - - /* .text vma */ - put_unaligned_le32(0x200, &buf[pe_header + 0xc4]); - - /* .text size of initialised data */ - put_unaligned_le32(file_sz, &buf[pe_header + 0xc8]); - - /* .text file offset */ - put_unaligned_le32(0x200, &buf[pe_header + 0xcc]); -#endif /* CONFIG_X86_32 */ -#endif /* CONFIG_EFI_STUB */ + update_pecoff_text(setup_sectors * 512, sz + i + ((sys_size * 16) - sz)); +#endif crc = partial_crc32(buf, i, crc); if (fwrite(buf, 1, i, stdout) != i) -- cgit v1.2.3 From bd2753b2dda7bb43c7468826de75f49c6a7e8965 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 Jun 2012 10:55:40 -0700 Subject: x86/mm: Only add extra pages count for the first memory range during pre-allocation early page table space Robin found this regression: | I just tried to boot an 8TB system. It fails very early in boot with: | Kernel panic - not syncing: Cannot find space for the kernel page tables git bisect commit 722bc6b16771ed80871e1fd81c86d3627dda2ac8. A git revert of that commit does boot past that point on the 8TB configuration. That commit will add up extra pages for all memory range even above 4g. Try to limit that extra page count adding to first entry only. 
Bisected-by: Robin Holt Tested-by: Robin Holt Signed-off-by: Yinghai Lu Cc: WANG Cong Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/CAE9FiQUj3wyzQxtq9yzBNc9u220p8JZ1FYHG7t%3DMOzJ%3D9BZMYA@mail.gmail.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 97141c26a13a..bc4e9d84157f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -62,7 +62,8 @@ static void __init find_early_table_space(struct map_range *mr, unsigned long en extra += PMD_SIZE; #endif /* The first 2/4M doesn't use large pages. */ - extra += mr->end - mr->start; + if (mr->start < PMD_SIZE) + extra += mr->end - mr->start; ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; } else -- cgit v1.2.3 From d5d2d2eea84b0d8450b082edbc3dbde41fb8bfd8 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Thu, 7 Jun 2012 08:31:40 -0500 Subject: x86/uv: Fix UV2 BAU legacy mode The SGI Altix UV2 BAU (Broadcast Assist Unit) as used for tlb-shootdown (selective broadcast mode) always uses UV2 broadcast descriptor format. There is no need to clear the 'legacy' (UV1) mode, because the hardware always uses UV2 mode for selective broadcast. But the BIOS uses general broadcast and legacy mode, and the hardware pays attention to the legacy mode bit for general broadcast. So the kernel must not clear that mode bit. 
Signed-off-by: Cliff Wickman Cc: Link: http://lkml.kernel.org/r/E1SccoO-0002Lh-Cb@eag09.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_bau.h | 1 - arch/x86/platform/uv/tlb_uv.c | 1 - 2 files changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index becf47b81735..6149b476d9df 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -149,7 +149,6 @@ /* 4 bits of software ack period */ #define UV2_ACK_MASK 0x7UL #define UV2_ACK_UNITS_SHFT 3 -#define UV2_LEG_SHFT UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT #define UV2_EXT_SHFT UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT /* diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 3ae0e61abd23..59880afa851f 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1295,7 +1295,6 @@ static void __init enable_timeouts(void) */ mmr_image |= (1L << SOFTACK_MSHIFT); if (is_uv2_hub()) { - mmr_image &= ~(1L << UV2_LEG_SHFT); mmr_image |= (1L << UV2_EXT_SHFT); } write_mmr_misc_control(pnode, mmr_image); -- cgit v1.2.3 From eeaaa96a3a2134a174100afd129bb0891d05f4b2 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Wed, 6 Jun 2012 10:05:42 -0400 Subject: x86/nmi: Fix section mismatch warnings on 32-bit It was reported that compiling for 32-bit caused a bunch of section mismatch warnings: VDSOSYM arch/x86/vdso/vdso32-syms.lds LD arch/x86/vdso/built-in.o LD arch/x86/built-in.o WARNING: arch/x86/built-in.o(.data+0x5af0): Section mismatch in reference from the variable test_nmi_ipi_callback_na.10451 to the function .init.text:test_nmi_ipi_callback() [...] WARNING: arch/x86/built-in.o(.data+0x5b04): Section mismatch in reference from the variable nmi_unk_cb_na.10399 to the function .init.text:nmi_unk_cb() The variable nmi_unk_cb_na.10399 references the function __init nmi_unk_cb() [...] 
Both of these are attributed to the internal representation of the nmiaction struct created during register_nmi_handler. The reason for this is that those structs are not defined in the init section whereas the rest of the code in nmi_selftest.c is. To resolve this, I created a new #define, register_nmi_handler_initonly, that tags the struct as __initdata to resolve the mismatch. This #define should only be used in rare situations where the register/unregister is called during init of the kernel. Big thanks to Jan Beulich for decoding this for me as I didn't have a clue what was going on. Reported-by: Witold Baryluk Tested-by: Witold Baryluk Cc: Jan Beulich Signed-off-by: Don Zickus Link: http://lkml.kernel.org/r/1338991542-23000-1-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/nmi.h | 14 ++++++++++++++ arch/x86/kernel/nmi_selftest.c | 4 ++-- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 0e3793b821ef..dc580c42851c 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -54,6 +54,20 @@ struct nmiaction { __register_nmi_handler((t), &fn##_na); \ }) +/* + * For special handlers that register/unregister in the + * init section only. This should be considered rare. 
+ */ +#define register_nmi_handler_initonly(t, fn, fg, n) \ +({ \ + static struct nmiaction fn##_na __initdata = { \ + .handler = (fn), \ + .name = (n), \ + .flags = (fg), \ + }; \ + __register_nmi_handler((t), &fn##_na); \ +}) + int __register_nmi_handler(unsigned int, struct nmiaction *); void unregister_nmi_handler(unsigned int, const char *); diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index e31bf8d5c4d2..149b8d9c6ad4 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -42,7 +42,7 @@ static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs) static void __init init_nmi_testsuite(void) { /* trap all the unknown NMIs we may generate */ - register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); + register_nmi_handler_initonly(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); } static void __init cleanup_nmi_testsuite(void) @@ -64,7 +64,7 @@ static void __init test_nmi_ipi(struct cpumask *mask) { unsigned long timeout; - if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback, + if (register_nmi_handler_initonly(NMI_LOCAL, test_nmi_ipi_callback, NMI_FLAG_FIRST, "nmi_selftest")) { nmi_fail = FAILURE; return; -- cgit v1.2.3 From ae10ccdc3093486f8c2369d227583f9d79f628e5 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 4 Jun 2012 15:00:04 +0800 Subject: ACPI: Make acpi_skip_timer_override cover all source_irq==0 cases Currently when acpi_skip_timer_override is set, it only cover the (source_irq == 0 && global_irq == 2) cases. While there is also platform which need use this option and its global_irq is not 2. This patch will extend acpi_skip_timer_override to cover all timer overriding cases as long as the source irq is 0. 
This is the first part of a fix to kernel bug bugzilla 40002: "IRQ 0 assigned to VGA" https://bugzilla.kernel.org/show_bug.cgi?id=40002 Reported-and-tested-by: Szymon Kowalczyk Signed-off-by: Feng Tang Signed-off-by: Len Brown --- arch/x86/kernel/acpi/boot.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8afb69319815..e7c698e9c7ec 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -422,12 +422,14 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header, return 0; } - if (intsrc->source_irq == 0 && intsrc->global_irq == 2) { + if (intsrc->source_irq == 0) { if (acpi_skip_timer_override) { - printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); + printk(PREFIX "BIOS IRQ0 override ignored.\n"); return 0; } - if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { + + if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity + && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK; printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n"); } @@ -1334,7 +1336,7 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d) } /* - * Force ignoring BIOS IRQ0 pin2 override + * Force ignoring BIOS IRQ0 override */ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) { @@ -1344,7 +1346,7 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) */ if (!acpi_skip_timer_override) { WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n"); - pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", + pr_notice("%s detected: Ignoring BIOS IRQ0 override\n", d->ident); acpi_skip_timer_override = 1; } @@ -1438,7 +1440,7 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = { * is enabled. 
This input is incorrectly designated the * ISA IRQ 0 via an interrupt source override even though * it is wired to the output of the master 8259A and INTIN0 - * is not connected at all. Force ignoring BIOS IRQ0 pin2 + * is not connected at all. Force ignoring BIOS IRQ0 * override in that cases. */ { -- cgit v1.2.3 From 7f68b4c2e158019c2ec494b5cfbd9c83b4e5b253 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 4 Jun 2012 15:00:05 +0800 Subject: ACPI: Remove one board specific WARN when ignoring timer overriding Current WARN msg is only for the ati_ixp4x0 board, while this function is used by mulitple platforms. So this one board specific warning is not appropriate any more. Signed-off-by: Feng Tang Signed-off-by: Len Brown --- arch/x86/kernel/acpi/boot.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index e7c698e9c7ec..3a6afba6eca7 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1340,12 +1340,7 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d) */ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) { - /* - * The ati_ixp4x0_rev() early PCI quirk should have set - * the acpi_skip_timer_override flag already: - */ if (!acpi_skip_timer_override) { - WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n"); pr_notice("%s detected: Ignoring BIOS IRQ0 override\n", d->ident); acpi_skip_timer_override = 1; -- cgit v1.2.3 From f6b54f083cc66cf9b11d2120d8df3c2ad4e0836d Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 4 Jun 2012 15:00:06 +0800 Subject: ACPI: Add a quirk for "AMILO PRO V2030" to ignore the timer overriding This is the 2nd part of fix for kernel bugzilla 40002: "IRQ 0 assigned to VGA" https://bugzilla.kernel.org/show_bug.cgi?id=40002 The root cause is the buggy FW, whose ACPI tables assign the GSI 16 to 2 irqs 0 and 16(VGA), and the VGA is the right owner of GSI 16. 
Adding a quirk to ignore the IRQ0 override to GSI 16 on the FUJITSU SIEMENS AMILO PRO V2030 platform solves this issue.